In [None]:
#useful snorkel sources:
#https://www.snorkel.org/use-cases/01-spam-tutorial#2-writing-labeling-functions-lfs 3a) keyword lookup
#https://www.snorkel.org/use-cases/01-spam-tutorial#2-writing-labeling-functions-lfs 3e) preprocessor


## Setup

# Imports

In [11]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
[0m

In [None]:
!pip install ipywidgets

In [1]:
import numpy as np
import pandas as pd
import tqdm
from tqdm import tqdm

from functools import lru_cache

from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

import warnings

  from .autonotebook import tqdm as notebook_tqdm


## Datasets

In [4]:
from load_datasets import load_htss_ds
from load_datasets import load_ebbe_ds
from load_datasets import load_simpa_ds
from load_datasets import load_pwkp_ds
from load_datasets import load_rnd_st_ds

# Load every corpus through its loader from load_datasets.
htss_dataset = load_htss_ds()
pwkp_dataset = load_pwkp_ds()
ebbe_dataset = load_ebbe_ds()
# load_simpa_ds returns two objects, unpacked here as the "ls" and "ss" variants.
# NOTE(review): presumably lexical vs. syntactic simplification — confirm in load_datasets.
simpa_dataset_ls, simpa_dataset_ss = load_simpa_ds()
# sub_sample is the frame the labeling functions are applied to further down.
sub_sample = load_rnd_st_ds()


## Meaning Preservation

In [6]:
from usb_utils import remove_stop_words
from usb_utils import check_sent_sim
from usb_utils import core_preserved_meaning_max_depth_5


In [7]:
# Score one HTSS pair (row 3) twice: on the raw text and with stop words removed,
# to see how much the stop words contribute to the similarity value.
htss_pair = htss_dataset.iloc[3]

similarity_raw = check_sent_sim(str(htss_pair.orig_snt), str(htss_pair.simp))
print(similarity_raw)

similarity_without_stop_words = check_sent_sim(
  remove_stop_words(htss_pair.orig_snt),
  remove_stop_words(htss_pair.simp),
)
print(similarity_without_stop_words)

0.7352920675600417
0.49260161873098735


In [None]:
# Compare plain sentence similarity with the depth-limited core-meaning check
# on the same HTSS pair (row 3).
htss_pair = htss_dataset.iloc[3]
original_text = str(htss_pair.orig_snt)
simplified_text = str(htss_pair.simp)

print(check_sent_sim(original_text, simplified_text))
print(core_preserved_meaning_max_depth_5(original_text, simplified_text))

In [None]:
# Same HTSS pair (row 3): sentence similarity on the raw text, then the
# depth-limited core-meaning check on the text with stop words removed.
htss_pair = htss_dataset.iloc[3]

print(check_sent_sim(str(htss_pair.orig_snt), str(htss_pair.simp)))

original_without_stop_words = remove_stop_words(htss_pair.orig_snt)
simplified_without_stop_words = remove_stop_words(htss_pair.simp)
print(core_preserved_meaning_max_depth_5(original_without_stop_words, simplified_without_stop_words))

### comparison of effectiveness and efficiency: max depth, stopwords

In [None]:
# Register progress_apply on pandas objects; the scoring cells below rely on it.
tqdm.pandas()
# Fixed random_state so the 100-pair test sample is the same on every run.
pwkp_dataset_meaning_test = pwkp_dataset.sample(100, random_state=42)

In [None]:
# no max depth
# core_preserved_meaning was never imported in this notebook, so this cell
# failed with a NameError.
# NOTE(review): assumes usb_utils exports core_preserved_meaning next to
# core_preserved_meaning_max_depth_5 — confirm.
from usb_utils import core_preserved_meaning

# Sentence-level similarity for every pair of the test sample.
pwkp_dataset_meaning_test["meaning_pres"] = pwkp_dataset_meaning_test.progress_apply(lambda row: check_sent_sim(row["orig_snt"], row["simp"]), axis=1)
# Core-meaning score; only the first element of the returned value is kept.
pwkp_dataset_meaning_test["core_meaning_pres"] = pwkp_dataset_meaning_test.progress_apply(lambda row: core_preserved_meaning(row["orig_snt"], row["simp"])[0], axis=1)
print(pwkp_dataset_meaning_test.core_meaning_pres.describe())
print(pwkp_dataset_meaning_test.meaning_pres.describe())

In [None]:
# max depth 5
# Work on an independent copy: a plain assignment only aliases the frame, so
# writing the columns below would also overwrite the scores already stored in
# pwkp_dataset_meaning_test.
pwkp_dataset_meaning_test2 = pwkp_dataset_meaning_test.copy()
# Sentence-level similarity for every pair of the test sample.
pwkp_dataset_meaning_test2["meaning_pres"] = pwkp_dataset_meaning_test2.progress_apply(lambda row: check_sent_sim(row["orig_snt"], row["simp"]), axis=1)
# Depth-limited core-meaning score; only the first element of the returned value is kept.
pwkp_dataset_meaning_test2["core_meaning_pres"] = pwkp_dataset_meaning_test2.progress_apply(lambda row: core_preserved_meaning_max_depth_5(row["orig_snt"], row["simp"])[0], axis=1)
print(pwkp_dataset_meaning_test2.core_meaning_pres.describe())
print(pwkp_dataset_meaning_test2.meaning_pres.describe())

In [None]:
# no max depth, stopwords removed
# core_preserved_meaning was never imported in this notebook, so this cell
# failed with a NameError.
# NOTE(review): assumes usb_utils exports core_preserved_meaning next to
# core_preserved_meaning_max_depth_5 — confirm.
from usb_utils import core_preserved_meaning

# Both scores are recomputed on text with stop words removed; this overwrites
# the "meaning_pres" / "core_meaning_pres" columns of pwkp_dataset_meaning_test.
pwkp_dataset_meaning_test["meaning_pres"] = pwkp_dataset_meaning_test.progress_apply(lambda row: check_sent_sim(remove_stop_words(row["orig_snt"]), remove_stop_words(row["simp"])), axis=1)
pwkp_dataset_meaning_test["core_meaning_pres"] = pwkp_dataset_meaning_test.progress_apply(lambda row: core_preserved_meaning(remove_stop_words(row["orig_snt"]), remove_stop_words(row["simp"]))[0], axis=1)
print(pwkp_dataset_meaning_test.core_meaning_pres.describe())
print(pwkp_dataset_meaning_test.meaning_pres.describe())

In [None]:
from scipy.stats import ttest_ind

# Independent two-sample t-test between the core-meaning scores and the
# sentence-similarity scores of the PWKP test sample.
# NOTE(review): both columns score the same rows, so a paired test
# (scipy.stats.ttest_rel) may be the better fit — confirm.
core_scores = pwkp_dataset_meaning_test['core_meaning_pres']
sentence_scores = pwkp_dataset_meaning_test['meaning_pres']
ttest_ind(core_scores, sentence_scores)

In [None]:
# Build three EBBE comparison sets and score every pair with both measures:
#   df_random          - first 104 pairs, simplifications shuffled across rows
#   df_in_topic_random - four topics, simplifications shuffled within each topic
#   df_in_topic        - the same four topics with their original pairing
topic_labels = ['baghdad-hum', 'bangkok-hum', 'budapest-hum', 'buenos-hum']
shuffle_rng = np.random.default_rng(42)  # fixed seed keeps the shuffles reproducible


def _with_shuffled_simp(frame):
  """Return a copy of `frame` whose "simp" column is randomly permuted.

  Copying first guarantees the shuffled values are never written into a
  slice of `ebbe_dataset`, which the unshuffled baseline is read from later,
  and avoids pandas' SettingWithCopyWarning.
  """
  shuffled = frame.copy()
  simp_values = shuffled.simp.to_list()
  shuffle_rng.shuffle(simp_values)
  shuffled["simp"] = simp_values
  return shuffled


def _rows_for_topic(topic):
  """Return the rows of `ebbe_dataset` whose "label" equals `topic`."""
  return ebbe_dataset.loc[ebbe_dataset["label"] == topic]


# shuffled randomly
df_random = _with_shuffled_simp(ebbe_dataset[:104])

# shuffled in topic
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; it
# also builds the frame once instead of growing it inside a loop.
df_in_topic_random = pd.concat(
  [_with_shuffled_simp(_rows_for_topic(topic)) for topic in topic_labels],
  ignore_index=True,
)

# not shuffled
df_in_topic = pd.concat(
  [_rows_for_topic(topic) for topic in topic_labels],
  ignore_index=True,
)

# Score all three sets the same way: sentence similarity plus the first
# element of the depth-limited core-meaning result.
for scored in (df_random, df_in_topic_random, df_in_topic):
  scored["meaning_pres"] = scored.progress_apply(lambda row: check_sent_sim(row["orig_snt"], row["simp"]), axis=1)
  scored["core_meaning_pres"] = scored.progress_apply(lambda row: core_preserved_meaning_max_depth_5(row["orig_snt"], row["simp"])[0], axis=1)

In [None]:
# Preview a random 104-row sample of pwkp_dataset; no random_state is set, so a
# different sample is drawn (and displayed) on every run.
pwkp_dataset.sample(104)

In [None]:
# pwkp dataset
# Fixed random_state so the 104-pair sample (and every score derived from it)
# is the same on every run.
df_pwkp = pwkp_dataset.sample(104, random_state=42)

# Sentence similarity plus the first element of the depth-limited core-meaning result.
df_pwkp["meaning_pres"] = df_pwkp.progress_apply(lambda row: check_sent_sim(row["orig_snt"], row["simp"]), axis=1)
df_pwkp["core_meaning_pres"] = df_pwkp.progress_apply(lambda row: core_preserved_meaning_max_depth_5(row["orig_snt"], row["simp"])[0], axis=1)

In [None]:
# simpa dataset
# Fixed random_state so the 104-pair sample (and every score derived from it)
# is the same on every run.
df_simpa = simpa_dataset_ls.sample(104, random_state=42)

# Sentence similarity plus the first element of the depth-limited core-meaning result.
df_simpa["meaning_pres"] = df_simpa.progress_apply(lambda row: check_sent_sim(row["orig_snt"], row["simp"]), axis=1)
df_simpa["core_meaning_pres"] = df_simpa.progress_apply(lambda row: core_preserved_meaning_max_depth_5(row["orig_snt"], row["simp"])[0], axis=1)

In [None]:
# Collect the core-meaning scores of every comparison set side by side, one
# column per set. The lists are passed positionally, so all sets must contain
# the same number of rows.
score_frames = {
  "ebbe_rnd": df_random,
  "ebbe_in_topic_rnd": df_in_topic_random,
  "ebbe_reg": df_in_topic,
  "pwkp_reg": df_pwkp,
  "simpa_reg": df_simpa,
}
df = pd.DataFrame({
  column: frame.core_meaning_pres.to_list()
  for column, frame in score_frames.items()
})

In [None]:
# Box plot with one box per column of df, i.e. per comparison set.
df.boxplot()

In [None]:
# Evaluate the kernel density estimate of every column at 25 evenly spaced
# points: 0.00, 0.04, ..., 0.96.
sep = [step * 0.04 for step in range(0, 25)]
df.plot.kde(ind=sep)

In [None]:
# Same KDE plot as the preceding cell: density of every column evaluated at
# 25 evenly spaced points (0.00, 0.04, ..., 0.96).
sep = [step * 0.04 for step in range(0, 25)]
df.plot.kde(ind=sep)

In [None]:
# Pairwise independent t-tests between the comparison sets.
# The column names must match the keys df was built with ("ebbe_rnd",
# "ebbe_in_topic_rnd", "ebbe_reg", "pwkp_reg", "simpa_reg"); the previous
# "*_regular" / "*_random" names do not exist in df and raised a KeyError.
comparisons = [
  ('pwkp_reg', 'ebbe_reg'),
  ('ebbe_rnd', 'ebbe_reg'),
  ('ebbe_rnd', 'ebbe_in_topic_rnd'),
  ('ebbe_in_topic_rnd', 'ebbe_reg'),
]
for left, right in comparisons:
  print(ttest_ind(df[left], df[right]))

## LF Assembly

In [12]:
from labeling_functions import get_all_lfs

resources get initialised


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading: 100%|██████████| 414/414 [00:00<00:00, 3.92MB/s]
Downloading: 100%|██████████| 208k/208k [00:00<00:00, 1.15MB/s]
Downloading: 100%|██████████| 634M/634M [00:39<00:00, 17.0MB/s] 
Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModuleNotFoundError: No java install detected. Please install java to use language-tool-python.

In [None]:
# Integer codes used as labeling-function votes and label-model outputs.
ABSTAIN, SIMPLE, NOT_SIMPLE, LOST_MEANING = -1, 0, 1, 2

# Code -> readable name, used when printing votes and predictions.
label_map = {
  ABSTAIN: "ABSTAIN",
  SIMPLE: "SIMPLE",
  NOT_SIMPLE: "NOT_SIMPLE",
  LOST_MEANING: "LOST_MEANING",
}

In [None]:
# Collect every labeling function and run all of them over sub_sample.
all_lfs = get_all_lfs()
applier = PandasLFApplier(lfs=all_lfs)

# Warnings raised while the labeling functions run are silenced for this call only.
with warnings.catch_warnings():
  warnings.simplefilter(action="ignore")
  labels = applier.apply(df=sub_sample)

In [None]:
# Summary statistics for every labeling function, computed from the label matrix.
from snorkel.labeling import LFAnalysis

lf_analysis = LFAnalysis(L=labels, lfs=all_lfs)
lf_analysis.lf_summary()

In [None]:
# Keep all rows and at most the first 250 columns of the label matrix.
# NOTE(review): columns of `labels` are labeling functions, rows are instances;
# if the first 250 instances were intended this should be labels[:250] — confirm.
test_l = labels[:,:250]

In [None]:
# Show the dimensions of the sliced label matrix.
test_l.shape

In [None]:
# Fit Snorkel's LabelModel, which combines the labeling-function votes into one label per instance.
from snorkel.labeling.model import LabelModel

# NOTE(review): cardinality=2 allows only the classes 0 and 1, while this
# notebook also defines LOST_MEANING = 2 — confirm that no labeling function
# emits 2, otherwise the cardinality has to be raised.
label_model = LabelModel(cardinality=2, verbose=True)

fit_options = dict(n_epochs=500, log_freq=5, seed=42, lr=0.001)
label_model.fit(L_train=labels, **fit_options)

In [None]:
# The label model aggregates the labeling-function signals into noisy labels.
# With return_probs=True, predict() also returns the probabilities it derived
# the hard labels from (the predict_proba output), so one call yields both.
label_model_preds, label_model_pred_probs = label_model.predict(L=labels, return_probs=True)

In [None]:
# Explain the label model: print the weight it learned for each labeling function.
weights = label_model.get_weights()

for lf_idx, lf in enumerate(all_lfs):
  print(f"{lf.name} : {weights[lf_idx]}")

In [None]:
def check_instance(id):
  """Print the sentence pair, every LF vote and the model verdict for one row.

  Args:
    id: Positional (iloc-style) index of the row in `sub_sample`. The same
      index selects the matching row of `labels` and of the label-model
      predictions and probabilities.
  """
  instance = sub_sample.iloc[id]
  print(f"src_snt : {instance['source_snt']}")
  print(f"simp_snt : {instance['simplified_snt']}")
  print()
  print("Signals:")

  # `labels` comes from PandasLFApplier.apply: rows are instances and columns
  # are labeling functions, so the vote of LF j on this row is labels[id, j].
  # (The previous labels[i][id] read the matrix transposed.)
  for lf_idx, lf in enumerate(all_lfs):
    print(f"{lf.name} : {label_map[labels[id, lf_idx]]}")
  print()
  print(f"complexity_score: {label_map[label_model_preds[id]]} ({label_model_pred_probs[id]})")
  print(f"gold label : {label_map[instance['gold_label']]}")

In [None]:
# Score the label model against the gold labels; ties between classes are broken randomly.
res = label_model.score(
  L=labels,
  Y=sub_sample['gold_label'],
  metrics=['accuracy', 'f1'],
  tie_break_policy="random",
)

# Report both metrics as percentages with one decimal place.
for caption, metric in (('Label Model Accuracy:', 'accuracy'), ('Label Model F1-Score:', 'f1')):
  print(f"{caption:<25} {res[metric] * 100:.1f}%")

In [None]:
# Display the raw metric values returned by label_model.score.
res

In [None]:
# Labeling-function summary again, this time evaluated against the gold labels.
gold_labels = sub_sample['gold_label'].values
LFAnalysis(L=labels, lfs=all_lfs).lf_summary(Y=gold_labels)

In [None]:
# Print the report for the first instance, followed by a separator line;
# raise the range bound to inspect more rows.
separator = '-'*30
for instance_id in range(1):
  check_instance(instance_id)
  print(separator)