In [None]:
#uselful snorkel sources:
#https://www.snorkel.org/use-cases/01-spam-tutorial#2-writing-labeling-functions-lfs 3a) keyword lookup
#https://www.snorkel.org/use-cases/01-spam-tutorial#2-writing-labeling-functions-lfs 3e) preprocessor


## Setup

# Imports

In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
import pickle
from tqdm import tqdm

from functools import lru_cache

from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

import warnings

## Datasets

In [None]:
from load_datasets import load_asset_ds
from load_datasets import load_automets_ds
from load_datasets import load_benchls_ds
from load_datasets import load_britannica_ds
from load_datasets import load_dwikipedia_ds
from load_datasets import load_ewsewgmpm_ds
from load_datasets import load_ewsewturk_ds
from load_datasets import load_htss_ds
from load_datasets import load_hutssf_ds
from load_datasets import load_massalign_ds
from load_datasets import load_metaeval_ds
from load_datasets import load_mturksf_ds
from load_datasets import load_nnseval_ds
from load_datasets import load_onestopenglish_ds
from load_datasets import load_pwkp_ds
from load_datasets import load_questeval_ds
from load_datasets import load_semeval07_ds
from load_datasets import load_simpa_ds
from load_datasets import load_simpeval_ds
from load_datasets import load_sscorpus_ds
from load_datasets import load_turkcorpus_ds
from load_datasets import load_wikiauto_ds
from load_datasets import load_wikimanual_ds
from load_datasets import load_wikisplit_ds
from load_datasets import load_wikipediav1_ds
from load_datasets import load_wikipediav2_ds
from load_datasets import path_to_datasets

if not os.path.isdir(path_to_datasets):
    os.mkdir(path_to_datasets)

asset = load_asset_ds()
automets = load_automets_ds()
benchls = load_benchls_ds()
britannica = load_britannica_ds()
dwikipedia = load_dwikipedia_ds()
ewsewgmpm = load_ewsewgmpm_ds()
ewsewturk = load_ewsewturk_ds()
htss = load_htss_ds()
hutssf = load_hutssf_ds()
massalign = load_massalign_ds()
metaeval = load_metaeval_ds()
mturksf = load_mturksf_ds()
nnseval = load_nnseval_ds()
onestopenglish = load_onestopenglish_ds()
pwkp = load_pwkp_ds()
questeval = load_questeval_ds()
semeval07 = load_semeval07_ds()
simpa = load_simpa_ds()
simpeval = load_simpeval_ds()
sscorpus = load_sscorpus_ds()
turkcorpus = load_turkcorpus_ds()
wikiauto = load_wikiauto_ds()
wikimanual = load_wikimanual_ds()
wikisplit = load_wikisplit_ds()
wikipediav1 = load_wikipediav1_ds()
wikipediav2 = load_wikipediav2_ds()

combined_dataset = pd.concat([asset, automets, benchls, britannica, dwikipedia, ewsewgmpm, ewsewturk, htss, hutssf, massalign, metaeval, 
                              mturksf, nnseval, onestopenglish, pwkp, questeval, semeval07, simpa, simpeval, sscorpus, turkcorpus, 
                              wikiauto, wikimanual, wikisplit, wikipediav1, wikipediav2], axis=0).reset_index()

with open('/' + path_to_datasets + '/combined_dataset.pkl', 'wb') as f:
    pickle.dump(combined_dataset, f)

In [None]:
metadata_ds = pd.read_excel('/workspace/datasets/English_Datasets.xlsx')

merged_ds = pd.merge(metadata_ds[['ds_id', 'Year', 'Target_Audience', 'Domain']], combined_dataset, on=['ds_id'])

#### Val-split

In [None]:
merged_ds['val_split'] = False

for ds in [britannica, htss, hutssf, onestopenglish, simpa]:
    
    val_ds = ds.sample(frac=1)[:100]
    for index, row in val_ds.iterrows():
        ds_id = row['ds_id']
        src_id = row['src_id']
        simp_id = row['simp_id']

        mask = (merged_ds['ds_id'] == ds_id) & (merged_ds['src_id'] == src_id) &  (merged_ds['simp_id'] == simp_id)
        merged_ds.loc[mask, 'val_split'] = True

with open('/workspace/datasets/final_combined_true_val_split.pkl', 'wb') as f:
    pickle.dump(merged_ds, f)

In [None]:
from load_datasets import add_global_index

df_w_index = add_global_index(merged_ds)

#### Meaning Preservation Datasets

In [None]:
from load_meaning_preservation import load_asset_ds
from load_meaning_preservation import load_metaeval_ds
from load_meaning_preservation import load_questeval_ds
from load_meaning_preservation import load_simpeval_ds

asset_mp = load_asset_ds()
metaeval_mp = load_metaeval_ds()
questeval_mp = load_questeval_ds()
simpeval_mp = load_simpeval_ds()

#### Human Labels Datasets

In [None]:
from load_simplicity import load_asset_ds
from load_simplicity import load_metaeval_ds
from load_simplicity import load_questeval_ds
from load_simplicity import load_simpeval_ds

asset_simp = load_asset_ds()
metaeval_simp = load_metaeval_ds()
questeval_simp = load_questeval_ds()
simpeval_simp = load_simpeval_ds()

In [None]:
combined_simp = pd.concat([asset_simp, metaeval_simp, questeval_simp, simpeval_simp], axis=0).reset_index()

with open('/workspace/datasets/human_simplification.pkl', 'wb') as f:
    pickle.dump(combined_simp, f)

## Meaning Preservation

In [2]:
with open("workspace/datasets/final_combined_with_index.pkl", 'rb') as f:
    combined_dataset = pickle.load(f)

In [3]:
combined_dataset

Unnamed: 0,ds_id,Year,Target_Audience,Domain,index,src,src_id,simp,simp_id,label,origin,granularity,duplicated,topic,src_title,simp_title,similarity,topics,val_split
0,ASSET,2020,general_target_audience,encyclopedic,ASSET__0__0,One side of the armed conflicts is composed ma...,0,On one side of the conflicts are the Sudanese ...,0,test,annotator_0,sentence,False,,,,,,False
1,ASSET,2020,general_target_audience,encyclopedic,ASSET__1__1,"Jeddah is the principal gateway to Mecca, Isla...",1,Muslims are required to visit Mecca once in th...,1,test,annotator_0,sentence,False,,,,,,False
2,ASSET,2020,general_target_audience,encyclopedic,ASSET__2__2,The Great Dark Spot is thought to represent a ...,2,The dark spot on Ne;tune may be a hole in the ...,2,test,annotator_0,sentence,False,,,,,,False
3,ASSET,2020,general_target_audience,encyclopedic,ASSET__3__3,"His next work, Saturday, follows an especially...",3,Next Saturday is a presentation of a successfu...,3,test,annotator_0,sentence,False,,,,,,False
4,ASSET,2020,general_target_audience,encyclopedic,ASSET__4__4,"The tarantula, the trickster character, spun a...",4,The tarantula spun a black cord and attached i...,4,test,annotator_0,sentence,False,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3006966,WikiSplit,2018,general_target_audience,encyclopedic,WikiSplit__1004939__1004939,"Zahm Hall, a male dormitory at Notre Dame, is ...",1004939,"Zahm Hall, a male dormitory at Notre Dame, is ...",1004939,validation,,sentence,False,,,,,,False
3006967,WikiSplit,2018,general_target_audience,encyclopedic,WikiSplit__1004940__1004940,"Zahn was first diagnosed in the late 1990s, an...",1004940,Zahn was first diagnosed in the late 1990s. Th...,1004940,validation,,sentence,False,,,,,,False
3006968,WikiSplit,2018,general_target_audience,encyclopedic,WikiSplit__1004941__1004941,Zeinab Elobeid Yousif (1952 -- 19 March 2016) ...,1004941,Zeinab Elobeid Yousif (1952 -- 19 March 2016) ...,1004941,validation,,sentence,False,,,,,,False
3006969,WikiSplit,2018,general_target_audience,encyclopedic,WikiSplit__1004942__1004942,"Zen Peacemakers have a 34 - acre campus, the M...",1004942,"Zen Peacemakers have a 34 - acre campus, the M...",1004942,validation,,sentence,False,,,,,,False


In [None]:
with tqdm(total=len(combined_dataset)) as pbar:
    for i, row in combined_dataset.iterrows():
        if row['src'].strip() == "" or row['simp'].strip() == "":
            print(f"empty {row['ds_id']}")
            
        pbar.update(1)

In [7]:
from labeling_functions import get_all_lfs

2023-11-06 13:31:21.129056: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 13:31:21.669979: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 13:31:21.671972: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


resources get initialised


Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [14]:
sub_sample = combined_dataset[combined_dataset['val_split'] == True]
len(sub_sample)

500

In [15]:
ABSTAIN = -1
SIMPLE = 0
NOT_SIMPLE = 1
LOST_MEANING = 2

label_map = {-1: "ABSTAIN", 0: "SIMPLE", 1: "NOT_SIMPLE", 2: "LOST_MEANING"}

In [16]:
sub_sample['simplified_snt'] = sub_sample['simp']
sub_sample['source_snt'] = sub_sample['src']

#sub_sample.simplified_snt.iloc[577] = sub_sample.source_snt.iloc[577]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_sample['simplified_snt'] = sub_sample['simp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_sample['source_snt'] = sub_sample['src']


In [17]:
all_lfs = get_all_lfs()

In [18]:
len(all_lfs)

895

In [19]:
import time

In [21]:
lf_cnt = 1

for lf in all_lfs:
    print(f"Start: {lf.name} {lf_cnt}/{len(all_lfs)}")
    start = time.perf_counter()
    lf(sub_sample.iloc[100])
    end = time.perf_counter()
    print(f"Duration: {np.round(end-start, 2)} seconds")

    lf_cnt+=1

Start: lf_words_cnt_wcount=3_SIMPLE 1/895
Duration: 0.67 seconds
Start: lf_words_cnt_wcount=4_SIMPLE 2/895
Duration: 0.74 seconds
Start: lf_words_cnt_wcount=5_SIMPLE 3/895
Duration: 1.11 seconds
Start: lf_words_cnt_wcount=6_SIMPLE 4/895
Duration: 0.7 seconds
Start: lf_words_cnt_wcount=7_SIMPLE 5/895
Duration: 0.71 seconds
Start: lf_words_cnt_wcount=8_SIMPLE 6/895
Duration: 0.74 seconds
Start: lf_words_cnt_wcount=9_SIMPLE 7/895
Duration: 0.79 seconds
Start: lf_words_cnt_wcount=10_SIMPLE 8/895
Duration: 0.76 seconds
Start: lf_words_cnt_wcount=11_SIMPLE 9/895
Duration: 0.85 seconds
Start: lf_words_cnt_wcount=12_SIMPLE 10/895
Duration: 0.73 seconds
Start: lf_words_cnt_wcount=13_SIMPLE 11/895
Duration: 0.99 seconds
Start: lf_words_cnt_wcount=14_SIMPLE 12/895
Duration: 0.78 seconds
Start: lf_words_cnt_wcount=15_NOT_SIMPLE 13/895
Duration: 0.73 seconds
Start: lf_words_cnt_wcount=16_NOT_SIMPLE 14/895
Duration: 0.96 seconds
Start: lf_words_cnt_wcount=17_NOT_SIMPLE 15/895
Duration: 0.89 seconds


KeyboardInterrupt: 

In [None]:
#find problematic samples:

applier = PandasLFApplier(all_lfs)
with warnings.catch_warnings():
  warnings.simplefilter("ignore")

  for i in range(100, len(sub_sample), 100):
    print(i)
    labels = applier.apply(sub_sample[i:i+100])
    pickle.dump(labels, open(f"/workspace/datasets/temp_labels2_src{i}.pkl", "wb"))

In [None]:
pickle.dump(labels, open("/workspace/datasets/temp_labels.pkl", "wb"))

In [None]:


applier = PandasLFApplier(all_lfs)
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  labels = applier.apply(sub_sample)

In [None]:
pickle.dump(labels, open("/workspace/datasets/sub_sample_labels.pkl", "wb"))

In [None]:
#show some stats for the results
from snorkel.labeling import LFAnalysis
lfa = LFAnalysis(L=labels, lfs=all_lfs).lf_summary()

In [None]:
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

In [None]:
lfa

In [None]:
test_l = labels[:,:250]

In [None]:
test_l.shape

In [None]:
#apply snorkel magic and automatically combine labels
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=labels, n_epochs=500, log_freq=5, seed=42, lr=0.001)

In [None]:
#snorkel generates noisy labels based on the aggregation of signals provided by the labeling functions
label_model_preds = label_model.predict(L=labels)
label_model_pred_probs = label_model.predict_proba(L=labels)

In [None]:
#explain label model
weights = label_model.get_weights()

for i in range(len(all_lfs)):
  print(f"{all_lfs[i].name} : {weights[i]}")

In [None]:
def check_instance(id):
  print(f"src_snt : {sub_sample.iloc[id]['source_snt']}")
  print(f"simp_snt : {sub_sample.iloc[id]['simplified_snt']}")
  print()
  print("Signals:")

  for i in range(len(all_lfs)):
    print(f"{all_lfs[i].name} : {label_map[labels[i][id]]}")
  print()
  print(f"complexity_score: {label_map[label_model_preds[id]]} ({label_model_pred_probs[id]})")
  print(f"gold label : {label_map[sub_sample.iloc[id]['gold_label']]}")

In [None]:
res = label_model.score(L=labels, Y=sub_sample['gold_label'], metrics=['accuracy', 'f1'], tie_break_policy="random")

print(f"{'Label Model Accuracy:':<25} {res['accuracy'] * 100:.1f}%")
print(f"{'Label Model F1-Score:':<25} {res['f1'] * 100:.1f}%")

In [None]:
res

In [None]:
LFAnalysis(labels, all_lfs).lf_summary(sub_sample['gold_label'].values)

In [None]:
for i in range(1):
  check_instance(i)
  print('-'*30)

In [None]:
#combine temp labels
import glob
import pickle
import numpy as np

temp_paths = sorted(glob.glob("/workspace/datasets/temp_labels2_simp*"))
templabels = []

for path in temp_paths:
    with open(path, "rb") as f:
        templabels.append(pickle.load(f))


In [None]:
eval_simp_labels = np.concatenate(templabels)

In [None]:
pickle.dump(eval_simp_labels, open("/workspace/datasets/eval_simp_labels.pkl", "wb"))