In [None]:
#useful snorkel sources:
#https://www.snorkel.org/use-cases/01-spam-tutorial#2-writing-labeling-functions-lfs 3a) keyword lookup
#https://www.snorkel.org/use-cases/01-spam-tutorial#2-writing-labeling-functions-lfs 3e) preprocessor


# Imports

In [None]:
import numpy as np
import pandas as pd
import os
import tqdm
import pickle
from tqdm import tqdm

from functools import lru_cache

from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

import warnings

## Datasets

In [None]:
from load_datasets import load_asset_ds
from load_datasets import load_automets_ds
from load_datasets import load_benchls_ds
from load_datasets import load_britannica_ds
from load_datasets import load_dwikipedia_ds
from load_datasets import load_ewsewgmpm_ds
from load_datasets import load_ewsewturk_ds
from load_datasets import load_htss_ds
from load_datasets import load_hutssf_ds
from load_datasets import load_massalign_ds
from load_datasets import load_metaeval_ds
from load_datasets import load_mturksf_ds
from load_datasets import load_nnseval_ds
from load_datasets import load_onestopenglish_ds
from load_datasets import load_pwkp_ds
from load_datasets import load_questeval_ds
from load_datasets import load_semeval07_ds
from load_datasets import load_simpa_ds
from load_datasets import load_simpeval_ds
from load_datasets import load_sscorpus_ds
from load_datasets import load_turkcorpus_ds
from load_datasets import load_wikiauto_ds
from load_datasets import load_wikimanual_ds
from load_datasets import load_wikisplit_ds
from load_datasets import load_wikipediav1_ds
from load_datasets import load_wikipediav2_ds
from load_datasets import path_to_datasets

# Make sure the dataset directory exists before anything is written into it.
# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the directory is already there.
os.makedirs(path_to_datasets, exist_ok=True)

# Load every corpus through its loader from load_datasets.
asset = load_asset_ds()
automets = load_automets_ds()
benchls = load_benchls_ds()
britannica = load_britannica_ds()
dwikipedia = load_dwikipedia_ds()
ewsewgmpm = load_ewsewgmpm_ds()
ewsewturk = load_ewsewturk_ds()
htss = load_htss_ds()
hutssf = load_hutssf_ds()
massalign = load_massalign_ds()
metaeval = load_metaeval_ds()
mturksf = load_mturksf_ds()
nnseval = load_nnseval_ds()
onestopenglish = load_onestopenglish_ds()
pwkp = load_pwkp_ds()
questeval = load_questeval_ds()
semeval07 = load_semeval07_ds()
simpa = load_simpa_ds()
simpeval = load_simpeval_ds()
sscorpus = load_sscorpus_ds()
turkcorpus = load_turkcorpus_ds()
wikiauto = load_wikiauto_ds()
wikimanual = load_wikimanual_ds()
wikisplit = load_wikisplit_ds()
wikipediav1 = load_wikipediav1_ds()
wikipediav2 = load_wikipediav2_ds()

# Stack all corpora row-wise. reset_index() without drop=True keeps the
# previous row labels of each frame as an extra column.
combined_dataset = pd.concat([asset, automets, benchls, britannica, dwikipedia, ewsewgmpm, ewsewturk, htss, hutssf, massalign, metaeval, 
                              mturksf, nnseval, onestopenglish, pwkp, questeval, semeval07, simpa, simpeval, sscorpus, turkcorpus, 
                              wikiauto, wikimanual, wikisplit, wikipediav1, wikipediav2], axis=0).reset_index()

# Write into the directory that was ensured above. Building the path with
# os.path.join (instead of prefixing '/') keeps the target identical to the
# checked directory even when path_to_datasets is a relative path.
with open(os.path.join(path_to_datasets, 'combined_dataset.pkl'), 'wb') as f:
    pickle.dump(combined_dataset, f)

In [None]:
# Source sentence of the wikimanual row at position 1052.
wikimanual['src'].iloc[1052]

In [None]:
# Simplified sentence of the wikimanual row at position 1052.
wikimanual['simp'].iloc[1052]

In [None]:
# Metadata spreadsheet describing the datasets.
metadata_ds = pd.read_excel('/workspace/datasets/English_Datasets.xlsx')

# Attach the selected metadata columns to the combined frame through the
# shared ds_id key (default inner join: rows without a matching ds_id are dropped).
merged_ds = metadata_ds[['ds_id', 'Year', 'Target_Audience', 'Domain']].merge(
    combined_dataset,
    on=['ds_id'],
)

#### Val-split

In [None]:
# Fixed seed so that Restart & Run All marks the same validation rows each time.
VAL_SPLIT_SEED = 42
# Maximum number of validation pairs drawn from each corpus.
VAL_SPLIT_SIZE = 100

merged_ds['val_split'] = False

for ds in [britannica, htss, hutssf, onestopenglish, simpa]:

    # Shuffle the corpus reproducibly and keep the first VAL_SPLIT_SIZE rows
    # (fewer when the corpus itself is smaller).
    val_ds = ds.sample(frac=1, random_state=VAL_SPLIT_SEED)[:VAL_SPLIT_SIZE]
    for ds_id, src_id, simp_id in zip(val_ds['ds_id'], val_ds['src_id'], val_ds['simp_id']):
        # A pair is identified by the (ds_id, src_id, simp_id) triple.
        mask = (merged_ds['ds_id'] == ds_id) & (merged_ds['src_id'] == src_id) & (merged_ds['simp_id'] == simp_id)
        merged_ds.loc[mask, 'val_split'] = True

# Persist the frame together with its val_split flag.
with open('/workspace/datasets/final_combined_true_val_split.pkl', 'wb') as f:
    pickle.dump(merged_ds, f)

In [None]:
from load_datasets import add_global_index

# Run the merged frame through add_global_index (imported from load_datasets).
# NOTE(review): df_w_index is not referenced again in this notebook; presumably
# it corresponds to the final_combined_with_index.pkl artifact loaded further
# down — confirm where that file is written.
df_w_index = add_global_index(merged_ds)

#### Meaning Preservation Datasets

In [None]:
# Import the module under an alias instead of importing its functions by name:
# load_meaning_preservation exposes the same function names as load_datasets
# (load_asset_ds, load_metaeval_ds, ...), and importing them by name would
# rebind the loaders that the dataset cell relies on when it is re-run.
import load_meaning_preservation as mp_loaders

# Meaning-preservation variants of the four datasets.
asset_mp = mp_loaders.load_asset_ds()
metaeval_mp = mp_loaders.load_metaeval_ds()
questeval_mp = mp_loaders.load_questeval_ds()
simpeval_mp = mp_loaders.load_simpeval_ds()

#### Human Labels Datasets

In [1]:
# Import the module under an alias: load_simplicity reuses the function names
# of load_datasets (load_asset_ds, ...), so importing them by name would rebind
# the loaders used by the dataset cell when it is re-run.
import load_simplicity as simp_loaders

# All four frames are loaded because the cell that builds combined_simp
# concatenates asset_simp, metaeval_simp, questeval_simp and simpeval_simp;
# leaving any of them undefined makes that cell fail on a fresh kernel.
asset_simp = simp_loaders.load_asset_ds()
metaeval_simp = simp_loaders.load_metaeval_ds()
questeval_simp = simp_loaders.load_questeval_ds()
simpeval_simp = simp_loaders.load_simpeval_ds()

In [16]:
# Display the SimpEval human-label frame (pandas truncates the rendering).
simpeval_simp

Unnamed: 0,ds_id,src,simp,simplicityScore,meaningScore,origin,inner_ds
0,SimpEval_22,"On the fifth day of flight, November 20, 2022,...","On November 20, 2022, the Orion spacecraft sta...",92.666667,91.666667,GPT-3-few-shot,"[simplikert_2022, simpeval_2022]"
1,SimpEval_22,The complainant claimed that he had not renoun...,The complaint said he did not renounce his Nep...,84.166667,33.333333,Human 2 Writing,"[simplikert_2022, simpeval_2022]"
2,SimpEval_22,The club said on social media that customers s...,The club announced on social media that custom...,72.166667,83.333333,GPT-3-zero-shot,"[simplikert_2022, simpeval_2022]"
3,SimpEval_22,The architecture of Winchester College is a di...,The architecture of Winchester College is a di...,57.833333,66.666667,Muss,"[simplikert_2022, simpeval_2022]"
4,SimpEval_22,Bone has published numerous books including Fa...,Bone has published many books including Fantas...,75.000000,83.333333,Muss,[simplikert_2022]
...,...,...,...,...,...,...,...
3363,SimpEval_22,MacGruber starts asking for simple objects to ...,Macgruber starts asking for simple objects to ...,82.400000,-1.000000,new_simple_wiki,[simpeval_past]
3364,SimpEval_22,MacGruber starts asking for simple objects to ...,MacGruber asks for objects to help defuse the ...,91.000000,-1.000000,asset.test.simp,[simpeval_past]
3365,SimpEval_22,MacGruber starts asking for simple objects to ...,MacGruber starts asking for simple objects. Ma...,46.200000,-1.000000,SEMoses_all,[simpeval_past]
3366,SimpEval_22,MacGruber starts asking for simple objects to ...,MacGruber starts asking for simple objects to ...,86.800000,-1.000000,new_asset.test.simp.second,[simpeval_past]


In [9]:
# Show the inner_ds column: each entry lists the sub-datasets a row belongs to.
simpeval_simp['inner_ds']

0       [simplikert_2022, simpeval_2022]
1       [simplikert_2022, simpeval_2022]
2       [simplikert_2022, simpeval_2022]
3       [simplikert_2022, simpeval_2022]
4                      [simplikert_2022]
                      ...               
3363                     [simpeval_past]
3364                     [simpeval_past]
3365                     [simpeval_past]
3366                     [simpeval_past]
3367                     [simpeval_past]
Name: inner_ds, Length: 3368, dtype: object

In [19]:
# Number of rows whose inner_ds entry contains 'simplicity_DA'.
count = sum(1 for entry in simpeval_simp['inner_ds'].to_list() if 'simplicity_DA' in entry)

print(count)


['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_2022', 'simpeval_2022']
['simplikert_20

In [None]:
# Stack the four human-label frames row-wise; reset_index() keeps the previous
# row labels as an extra column.
combined_simp = pd.concat(
    [
        asset_simp,
        metaeval_simp,
        questeval_simp,
        simpeval_simp,
    ],
    axis=0,
).reset_index()

# Persist the combined human-label frame.
with open('/workspace/datasets/human_simplification.pkl', 'wb') as out_file:
    pickle.dump(combined_simp, out_file)

## Meaning Preservation

In [None]:
# Reload the combined dataset from disk. The path is absolute like every other
# artifact path in this notebook; the former relative "workspace/..." form only
# resolved when the kernel's working directory happened to be "/".
# NOTE: pickle.load can execute arbitrary code — only load trusted artifacts.
with open("/workspace/datasets/final_combined_with_index.pkl", 'rb') as f:
    combined_dataset = pickle.load(f)

# Spot-check one source sentence by position.
combined_dataset.iloc[1636192]['src']

In [None]:
# Index labels of the wikimanual rows whose simplification is identical to the source.
dupl_ind = [
    idx
    for idx, src, simp in zip(wikimanual.index, wikimanual['src'], wikimanual['simp'])
    if src == simp
]

# Store the indices for later use.
with open('/workspace/datasets/hlp_wikimanual_duplicated_indices.pkl', 'wb') as out_file:
    pickle.dump(dupl_ind, out_file)

In [None]:
# Walk every pair and report the dataset id of rows whose source or
# simplification is empty after stripping whitespace.
for _, row in tqdm(combined_dataset.iterrows(), total=len(combined_dataset)):
    src_blank = not row['src'].strip()
    simp_blank = not row['simp'].strip()
    if src_blank or simp_blank:
        print(f"empty {row['ds_id']}")

In [None]:
from labeling_functions import get_all_lfs

In [None]:
# Keep only the rows flagged for validation. The explicit .copy() makes
# sub_sample an independent frame, so adding columns to it later does not write
# into a filtered slice of combined_dataset (SettingWithCopyWarning).
sub_sample = combined_dataset[combined_dataset['val_split'] == True].copy()
len(sub_sample)

In [None]:
# Integer label codes; -1 is the Snorkel convention for "no vote".
ABSTAIN, SIMPLE, NOT_SIMPLE, LOST_MEANING = -1, 0, 1, 2

# Code -> readable name, used when printing votes and predictions.
label_map = {
    ABSTAIN: "ABSTAIN",
    SIMPLE: "SIMPLE",
    NOT_SIMPLE: "NOT_SIMPLE",
    LOST_MEANING: "LOST_MEANING",
}

In [None]:
# Expose the pair under the column names simplified_snt / source_snt (the
# names read by check_instance below). assign() returns a new frame, so the
# columns are added without item-assignment on a filtered slice of
# combined_dataset, which is what triggers SettingWithCopyWarning.
sub_sample = sub_sample.assign(
    simplified_snt=sub_sample['simp'],
    source_snt=sub_sample['src'],
)

#sub_sample.simplified_snt.iloc[577] = sub_sample.source_snt.iloc[577]

In [None]:
# Collect the labeling functions provided by the labeling_functions module.
all_lfs = get_all_lfs()

In [None]:
# Number of labeling functions that will be applied.
len(all_lfs)

In [None]:
import time

In [None]:
# Time every labeling function on a single row (position 100 of sub_sample)
# to spot slow ones before running the full application.
for position, lf in enumerate(all_lfs, start=1):
    print(f"Start: {lf.name} {position}/{len(all_lfs)}")
    started = time.perf_counter()
    lf(sub_sample.iloc[100])
    elapsed = time.perf_counter() - started
    print(f"Duration: {np.round(elapsed, 2)} seconds")

In [None]:
#find problematic samples:
# Apply the labeling functions in chunks of 100 rows and save each chunk's
# label matrix separately, so a failure can be traced to a specific chunk.

applier = PandasLFApplier(all_lfs)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # NOTE(review): the range starts at 100, so rows 0-99 are not processed by
    # this loop — confirm that this is intentional.
    for i in range(100, len(sub_sample), 100):
        print(i)
        labels = applier.apply(sub_sample[i:i+100])
        # Context manager closes (and flushes) the file even if dump() raises.
        with open(f"/workspace/datasets/temp_labels2_src{i}.pkl", "wb") as f:
            pickle.dump(labels, f)

In [None]:
# Save the current label matrix; the context manager guarantees the file is
# flushed and closed instead of relying on garbage collection.
with open("/workspace/datasets/temp_labels.pkl", "wb") as f:
    pickle.dump(labels, f)

In [None]:


# Apply every labeling function to the whole validation sample in one pass,
# silencing warnings raised while the labeling functions run.
applier = PandasLFApplier(all_lfs)
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    labels = applier.apply(sub_sample)

In [None]:
# Persist the label matrix of the validation sample; the context manager
# guarantees the file is flushed and closed.
with open("/workspace/datasets/sub_sample_labels.pkl", "wb") as f:
    pickle.dump(labels, f)

In [None]:
#show some stats for the results
from snorkel.labeling import LFAnalysis

# Build the analysis object first, then derive the per-LF summary table from it.
lf_analysis = LFAnalysis(L=labels, lfs=all_lfs)
lfa = lf_analysis.lf_summary()

In [None]:
# Switch DataFrame rendering to itables' interactive tables for all outputs.
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

In [None]:
# Display the labeling-function summary table.
lfa

In [None]:
# Keep all rows and at most the first 250 columns of the label matrix.
# NOTE(review): columns of the label matrix correspond to labeling functions;
# if the first 250 samples were intended this should slice rows — confirm.
test_l = labels[:,:250]

In [None]:
# Shape of the sliced label matrix.
test_l.shape

In [None]:
# Fit Snorkel's LabelModel on the label matrix to combine the labeling-function votes.
from snorkel.labeling.model import LabelModel

# NOTE(review): cardinality=2 while this notebook defines three non-abstain
# codes (SIMPLE=0, NOT_SIMPLE=1, LOST_MEANING=2); if any labeling function
# emits LOST_MEANING the model would need cardinality=3 — confirm.
label_model = LabelModel(cardinality=2, verbose=True)
# Fixed seed (42) keeps the fit reproducible.
label_model.fit(L_train=labels, n_epochs=500, log_freq=5, seed=42, lr=0.001)

In [None]:
# The label model aggregates the labeling-function signals into noisy labels.
# A single predict() call with return_probs=True yields both the hard
# predictions and the class probabilities they are derived from.
label_model_preds, label_model_pred_probs = label_model.predict(L=labels, return_probs=True)

In [None]:
#explain label model
# Print the weight the label model learned for each labeling function.
weights = label_model.get_weights()

for position, lf in enumerate(all_lfs):
    print(f"{lf.name} : {weights[position]}")

In [None]:
def check_instance(id):
    """Print the sentence pair, every labeling-function vote and the model verdict for one row.

    Reads the notebook-level names ``sub_sample``, ``all_lfs``, ``labels``,
    ``label_map``, ``label_model_preds`` and ``label_model_pred_probs``.

    Args:
        id: Positional row index into ``sub_sample``; the same position selects
            the row of ``labels`` and the entries of the prediction arrays.
    """
    print(f"src_snt : {sub_sample.iloc[id]['source_snt']}")
    print(f"simp_snt : {sub_sample.iloc[id]['simplified_snt']}")
    print()
    print("Signals:")

    # The label matrix has one row per data point and one column per labeling
    # function, so the vote of LF i on this instance is labels[id][i]
    # (the previous labels[i][id] had the two axes swapped).
    for i in range(len(all_lfs)):
        print(f"{all_lfs[i].name} : {label_map[labels[id][i]]}")
    print()
    print(f"complexity_score: {label_map[label_model_preds[id]]} ({label_model_pred_probs[id]})")
    print(f"gold label : {label_map[sub_sample.iloc[id]['gold_label']]}")

In [None]:
# Score the label model against the gold labels. The gold column is passed as
# a plain NumPy array, matching how it is handed to LFAnalysis.lf_summary in
# this notebook, so Snorkel's metric code never sees a pandas index.
gold_labels = sub_sample['gold_label'].to_numpy()
res = label_model.score(L=labels, Y=gold_labels, metrics=['accuracy', 'f1'], tie_break_policy="random")

print(f"{'Label Model Accuracy:':<25} {res['accuracy'] * 100:.1f}%")
print(f"{'Label Model F1-Score:':<25} {res['f1'] * 100:.1f}%")

In [None]:
# Show the raw metric dictionary returned by label_model.score.
res

In [None]:
# Per-LF summary including the statistics that require gold labels.
LFAnalysis(L=labels, lfs=all_lfs).lf_summary(Y=sub_sample['gold_label'].values)

In [None]:
# Inspect the first instance, followed by a separator line.
separator = '-' * 30
for instance_id in range(1):
    check_instance(instance_id)
    print(separator)

In [None]:
#combine temp labels
import glob
import pickle
import re
import numpy as np


def chunk_start_index(path):
    """Return the integer that ends a temp-label file name (before the extension).

    Args:
        path: File path such as ``.../temp_labels2_simp1000.pkl``.

    Returns:
        The trailing number as an int (1000 in the example), or -1 when the
        name does not end in digits.
    """
    match = re.search(r"(\d+)(?:\.[^./\\]*)?$", path)
    return int(match.group(1)) if match else -1


# Sort numerically by chunk offset. A plain sorted() orders the paths as
# strings, which puts "...1000" before "...200" and would concatenate the
# chunks in the wrong row order.
temp_paths = sorted(
    glob.glob("/workspace/datasets/temp_labels2_simp*"),
    key=lambda p: (chunk_start_index(p), p),
)
templabels = []

# NOTE: pickle.load can execute arbitrary code — only load trusted artifacts.
for path in temp_paths:
    with open(path, "rb") as f:
        templabels.append(pickle.load(f))


In [None]:
# Stack the per-chunk label matrices row-wise into one matrix.
eval_simp_labels = np.concatenate(templabels, axis=0)

In [None]:
# Persist the combined label matrix; the context manager guarantees the file
# is flushed and closed.
with open("/workspace/datasets/eval_simp_labels.pkl", "wb") as f:
    pickle.dump(eval_simp_labels, f)