## Dataset setup

In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
import pickle
from tqdm import tqdm

from functools import lru_cache

from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

import warnings

from load_datasets import load_asset_ds
from load_datasets import load_automets_ds
from load_datasets import load_benchls_ds
from load_datasets import load_britannica_ds
from load_datasets import load_dwikipedia_ds
from load_datasets import load_ewsewgmpm_ds
from load_datasets import load_ewsewturk_ds
from load_datasets import load_htss_ds
from load_datasets import load_hutssf_ds
from load_datasets import load_massalign_ds
from load_datasets import load_metaeval_ds
from load_datasets import load_mturksf_ds
from load_datasets import load_nnseval_ds
from load_datasets import load_onestopenglish_ds
from load_datasets import load_pwkp_ds
from load_datasets import load_questeval_ds
from load_datasets import load_semeval07_ds
from load_datasets import load_simpa_ds
from load_datasets import load_simpeval_ds
from load_datasets import load_sscorpus_ds
from load_datasets import load_turkcorpus_ds
from load_datasets import load_wikiauto_ds
from load_datasets import load_wikimanual_ds
from load_datasets import load_wikisplit_ds
from load_datasets import load_wikipediav1_ds
from load_datasets import load_wikipediav2_ds
from load_datasets import path_to_datasets

from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.); `progress_apply` is used
# further down in this notebook.
tqdm.pandas()

# Make sure the dataset directory exists before anything is written to it.
# `makedirs(..., exist_ok=True)` also creates missing parent directories and
# is a no-op when the directory is already there.
os.makedirs(path_to_datasets, exist_ok=True)

# Load every corpus through its loader from `load_datasets`.
asset = load_asset_ds()
automets = load_automets_ds()
benchls = load_benchls_ds()
britannica = load_britannica_ds()
dwikipedia = load_dwikipedia_ds()
ewsewgmpm = load_ewsewgmpm_ds()
ewsewturk = load_ewsewturk_ds()
htss = load_htss_ds()
hutssf = load_hutssf_ds()
massalign = load_massalign_ds()
metaeval = load_metaeval_ds()
mturksf = load_mturksf_ds()
nnseval = load_nnseval_ds()
onestopenglish = load_onestopenglish_ds()
pwkp = load_pwkp_ds()
questeval = load_questeval_ds()
# NOTE(review): semeval07 and wikiauto are deliberately left out of the
# combined dataset here; the reason is not recorded in this notebook.
# semeval07 = load_semeval07_ds()
simpa = load_simpa_ds()
simpeval = load_simpeval_ds()
sscorpus = load_sscorpus_ds()
turkcorpus = load_turkcorpus_ds()
# wikiauto = load_wikiauto_ds()
wikimanual = load_wikimanual_ds()

# Stack all corpora row-wise. `reset_index()` gives the result a fresh
# 0..n-1 index and keeps each row's previous index as a column
# (pandas default `drop=False`).
combined_dataset = pd.concat(
    [
        asset, automets, benchls, britannica, dwikipedia, ewsewgmpm,
        ewsewturk, htss, hutssf, massalign, metaeval, mturksf, nnseval,
        onestopenglish, pwkp, questeval, simpa, simpeval, sscorpus,
        turkcorpus, wikimanual,
    ],
    axis=0,
).reset_index()

# Persist the combined frame inside `path_to_datasets`, i.e. the very
# directory that was checked/created above. (Previously the path was built
# as '/' + path_to_datasets, which points at the filesystem root instead.)
combined_dataset_path = os.path.join(path_to_datasets, 'combined_dataset.pkl')
with open(combined_dataset_path, 'wb') as f:
    pickle.dump(combined_dataset, f)

In [2]:
# Small working sample for trying out labeling functions.
# A fixed `random_state` makes the sample (and therefore every threshold
# experiment below) reproducible across kernel restarts.
# The column is renamed because the preprocessors read `x.simplified_snt`.
test_df = (
    combined_dataset
    .sample(100, random_state=42)
    .rename(columns={"simp": "simplified_snt"})
)

## Preprocessors and LF setup

This cell takes a while because it downloads all prerequisites for the labeling functions (LFs).
Reference timing (Fabian, via WLAN): about 17 min.

In [3]:
import numpy as np
import pandas as pd

import warnings

from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.preprocess import preprocessor

from wordfreq import word_frequency

import spacy
from spacy_syllables import SpacySyllables
import spacy_universal_sentence_encoder

from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

import textstat
from PassivePySrc import PassivePy
from Levenshtein import distance

import language_tool_python
# Module-level PassivePyAnalyzer instance, configured with the small English
# spaCy model name. Created once here so later cells can reuse it.
# NOTE(review): not referenced by the cells visible in this section —
# presumably used by labeling functions defined elsewhere.
passivepy = PassivePy.PassivePyAnalyzer(spacy_model = "en_core_web_sm")

from qanom.nominalization_detector import NominalizationDetector
# Module-level QANom NominalizationDetector instance, created once for reuse.
# NOTE(review): also not referenced in the visible cells — confirm usage.
nom_detector = NominalizationDetector()

# Label values returned by the labeling functions.
# ABSTAIN means the labeling function casts no vote for this data point.
ABSTAIN = -1
SIMPLE = 0
NOT_SIMPLE = 1
LOST_MEANING = 2

# Human-readable name for every label value, used when building LF names.
# Keyed by the constants themselves so the mapping cannot drift from them
# (ABSTAIN used to be keyed as 5, which made `label_map[ABSTAIN]` a KeyError).
label_map = {
    ABSTAIN: "ABSTAIN",
    SIMPLE: "SIMPLE",
    NOT_SIMPLE: "NOT_SIMPLE",
    LOST_MEANING: "LOST_MEANING",
}

# Resources shared by the labeling functions.
# Every resource is bound to None up front so that reading one before it has
# been loaded never raises a NameError.
aoa_dic = None
concreteness_dic = None
imageability_dic = None
predictor = None
tool_us = None
tool_gb = None
# These two were declared `global` inside init() but never bound at module
# level; `ox5k_a` is read by perc_vocab_initial_forLang_learn.
ox5k_a = None
academic_word_list = None

def init():
  """Announce resource initialisation and declare the shared resource globals.

  As written, this function only prints a message; the `global` statements
  give it write access to the module-level resources but nothing is assigned.
  NOTE(review): the code that actually loads the resources appears to be
  missing from this cell — confirm where they are populated.
  """
  print("resources get initialised")

  global aoa_dic
  global concreteness_dic 
  global imageability_dic 
  global predictor 
  global tool_us 
  global tool_gb
  global ox5k_a
  global academic_word_list

init()


  warn(f"Failed to load image Python extension: {e}")
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


resources get initialised


In [4]:

#preprocessors
def entities_in_list_of_tokens(l_tokens):
  """Collect entity strings from a sequence of IOB-tagged tokens.

  Each token whose `ent_iob_` is "B" starts an entity; the texts of the
  directly following tokens tagged "I" are appended, separated by single
  spaces. Tokens tagged "I" without a preceding "B" are ignored.

  Args:
    l_tokens: indexable sequence of objects exposing `.text` and `.ent_iob_`.

  Returns:
    List of entity strings in order of appearance.
  """
  found = []
  for start, token in enumerate(l_tokens):
    if token.ent_iob_ != "B":
      continue
    pieces = [token.text]
    nxt = start + 1
    # Extend the entity over the run of "I" tokens that follows the "B".
    while nxt < len(l_tokens) and l_tokens[nxt].ent_iob_ == "I":
      pieces.append(l_tokens[nxt].text)
      nxt += 1
    found.append(" ".join(pieces))
  return found

def paragraph_sep(doc):
  """Split an iterable of tokens into groups at every "_SP"-tagged token.

  A token whose `tag_` equals "_SP" closes the current group and becomes the
  first element of the next one. The final (possibly empty) group is always
  appended, so the result contains at least one list.

  Args:
    doc: iterable of objects exposing `.tag_`.

  Returns:
    List of lists of tokens.
  """
  groups = []
  current = []
  for token in doc:
    if token.tag_ == "_SP":
      # Separator: finish the running group, start a new one with the separator.
      groups.append(current)
      current = [token]
      continue
    current.append(token)
  groups.append(current)
  return groups


# Lazily created spaCy pipeline shared by spacy_nlp and spacy_nlp_paragraph.
# Loading the model is expensive, so it is done once per kernel instead of
# once per data point.
_SYLLABLE_NLP = None


def _get_syllable_nlp():
  """Return the shared `en_core_web_sm` pipeline with the syllables component."""
  global _SYLLABLE_NLP
  if _SYLLABLE_NLP is None:
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe("syllables", after="tagger")
    _SYLLABLE_NLP = nlp
  return _SYLLABLE_NLP


def _annotate_simplified(x):
  """Run spaCy on `x.simplified_snt`, store the derived fields on `x`, return the doc."""
  nlp = _get_syllable_nlp()
  x.pipeline_components = nlp.pipe_names
  x.simp_text = x.simplified_snt

  # simplified
  doc = nlp(x.simplified_snt)
  x.simp_syllables = [token._.syllables for token in doc]
  x.simp_syllables_cnt = [token._.syllables_count for token in doc]
  x.simp_tokens = [token.text for token in doc]
  # Token objects expose token.text, token.lemma_, token.pos_, token.tag_,
  # token.dep_, token.shape_, token.is_alpha, token.is_stop
  x.simp_tokens_data = [token for token in doc]
  # list of pos tags: https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/13-POS-Keywords.html
  x.simp_words = [token.text for token in doc if token.pos_ != 'PUNCT']
  x.simp_sentences = [s.text for s in doc.sents]
  x.simp_doc = doc
  x.simp_entities = [e.text for e in doc.ents]
  return doc


@preprocessor(memoize=True)
def spacy_nlp(x):
  """Annotate a data point with spaCy features of its simplified sentence.

  Reads `x.simplified_snt` and sets on `x`: `pipeline_components`,
  `simp_text`, `simp_syllables`, `simp_syllables_cnt`, `simp_tokens`,
  `simp_tokens_data`, `simp_words` (tokens whose POS is not PUNCT),
  `simp_sentences`, `simp_doc` and `simp_entities`.

  Returns:
    The same data point `x` with the attributes above added.
  """
  _annotate_simplified(x)
  return x


@preprocessor(memoize=True)
def spacy_nlp_paragraph(x):
  """Same annotation as `spacy_nlp`, plus paragraph-grouped tokens.

  In addition to every attribute set by `spacy_nlp`, stores
  `x.simp_paragraph_tokens_data`: the doc's tokens grouped by
  `paragraph_sep` (split at "_SP"-tagged tokens).

  Returns:
    The same data point `x` with the attributes added.
  """
  doc = _annotate_simplified(x)
  x.simp_paragraph_tokens_data = paragraph_sep(doc)
  return x

# Lazily created Universal Sentence Encoder pipeline. Loading 'en_use_lg' is
# expensive, so it is done once per kernel instead of once per data point.
_USE_SENT_ENCODER = None


def _get_use_sent_encoder():
  """Return the shared 'en_use_lg' sentence-encoder pipeline."""
  global _USE_SENT_ENCODER
  if _USE_SENT_ENCODER is None:
    _USE_SENT_ENCODER = spacy_universal_sentence_encoder.load_model('en_use_lg')
  return _USE_SENT_ENCODER


@preprocessor(memoize=True)
def spacy_universal_embeddings(x):
  """Attach the sentence-encoder doc of the simplified sentence to a data point.

  Reads `x.simplified_snt`, runs it through the 'en_use_lg' model and stores
  the result as `x.simp_universal_doc`.

  Returns:
    The same data point `x` with `simp_universal_doc` added.
  """
  sent_encoder = _get_use_sent_encoder()
  x.simp_universal_doc = sent_encoder(x.simplified_snt)

  return x


## Test area

In [5]:
def test_lf(labeling_function, data=test_df):
    test_df["test"] = test_df.progress_apply(lambda row: lf(row) ,axis=1)
    return test_df

In [6]:
# ENTER YOUR LABELING FUNCTIONS HERE:::

# Fabian : high percentage of vocabulary learned in initial stages of foreign language learning~\cite{tanaka} $\rightarrow$ language proficiency test
def perc_vocab_initial_forLang_learn(x, thresh, label):
  """Vote based on the share of tokens found in the `ox5k_a` word list.

  The ratio is (tokens of `x.simp_doc` whose lower-cased text is in
  `ox5k_a`) / (number of entries in `x.simp_tokens`).
  NOTE(review): `ox5k_a` is presumably a list of beginner-level vocabulary —
  confirm against where it is loaded.

  Following the heuristic above, a HIGH share of such vocabulary indicates a
  simple text:
    * label == SIMPLE: vote `label` when ratio >= thresh, else ABSTAIN.
    * any other label: vote `label` when ratio <  thresh, else ABSTAIN.
  (The comparisons used to be the other way round, so texts made of basic
  vocabulary were voted NOT_SIMPLE.)

  Args:
    x: data point preprocessed by `spacy_nlp` (needs `simp_doc`, `simp_tokens`).
    thresh: ratio threshold in [0, 1].
    label: label to vote when the condition holds.

  Returns:
    `label` or ABSTAIN. ABSTAIN is also returned when there are no tokens.
  """
  n_tokens = len(x.simp_tokens)
  if n_tokens == 0:
    # Nothing to measure; avoid dividing by zero on empty text.
    return ABSTAIN

  n_known = sum(1 for w in x.simp_doc if w.text.lower() in ox5k_a)
  ratio = n_known / n_tokens

  if label == SIMPLE:
    return label if ratio >= thresh else ABSTAIN
  return label if ratio < thresh else ABSTAIN

def make_perc_vocab_initial_forLang_learn_lf(thresh, label=SIMPLE):
    """Build a LabelingFunction around `perc_vocab_initial_forLang_learn`.

    Args:
        thresh: ratio threshold forwarded to the labeling function.
        label: label the function votes for; defaults to SIMPLE.

    Returns:
        A snorkel LabelingFunction that runs `spacy_nlp` as preprocessor and
        whose name encodes the raw label value and the threshold.
    """
    lf_name = f"perc_vocab_initial_forLang_learn_{label}_{thresh}"
    lf_resources = {"thresh": thresh, "label": label}
    return LabelingFunction(
        name=lf_name,
        f=perc_vocab_initial_forLang_learn,
        resources=lf_resources,
        pre=[spacy_nlp],
    )


def words_per_sentence(x, w_cnt, label):
    """Vote based on the average number of words per sentence.

    The average is len(x.simp_words) / len(x.simp_sentences).
      * label == SIMPLE: vote `label` when the average is <= w_cnt.
      * any other label: vote `label` when the average is >  w_cnt.
    In every other case the function abstains.

    Args:
        x: data point preprocessed by `spacy_nlp` (needs `simp_words`,
            `simp_sentences`).
        w_cnt: word-count threshold.
        label: label to vote when the condition holds.

    Returns:
        `label` or ABSTAIN. ABSTAIN is also returned when the text contains
        no sentences.
    """
    n_sentences = len(x.simp_sentences)
    if n_sentences == 0:
        # Empty text: there is no average to compute, and dividing would raise.
        return ABSTAIN

    avg_cnt = len(x.simp_words) / n_sentences

    if label == SIMPLE:
        return label if avg_cnt <= w_cnt else ABSTAIN
    return label if avg_cnt > w_cnt else ABSTAIN
# bjoern: few words per sentence~\cite{simpa}
def make_word_cnt_lf(w_cnt, label=SIMPLE):
    """Build a LabelingFunction around `words_per_sentence`.

    Args:
        w_cnt: average-words-per-sentence threshold forwarded to the function.
        label: label the function votes for; defaults to SIMPLE.

    Returns:
        A snorkel LabelingFunction that runs `spacy_nlp` as preprocessor and
        whose name encodes the threshold and the label's name from `label_map`.
    """
    lf_name = f"lf_words_cnt_wcount={w_cnt}_{label_map[label]}"
    lf_resources = {"w_cnt": w_cnt, "label": label}
    return LabelingFunction(
        name=lf_name,
        f=words_per_sentence,
        resources=lf_resources,
        pre=[spacy_nlp],
    )


### Test thresholds

In [7]:
# Word-count LF with the default SIMPLE label: votes SIMPLE when the average
# number of words per sentence is <= 10, otherwise abstains.
lf = make_word_cnt_lf(10)

In [8]:
# Run the LF over the sample; the votes end up in the "test" column.
df_res = test_lf(lf)

100%|██████████| 100/100 [00:26<00:00,  3.74it/s]


In [11]:
# Show each simplified sentence next to the vote the LF cast for it.
df_res[["simplified_snt", "test"]]

Unnamed: 0,simplified_snt,test
208510,He tried to win the Democratic nomination for ...,-1
736087,Mandaue was established as a mission village (...,-1
951037,Nickelodeon became known for its iconic green ...,-1
312871,Claudia Puig of USA Today said `` for a movie ...,-1
644273,The Ohio State College of Medicine is on the s...,-1
...,...,...
321009,Some of the few remaining unevacuated Alderney...,-1
709361,"He next showed up in 1961 in Cleveland, where ...",-1
356995,Australia at the Olympics is a history which i...,0
95154,there is more than one place called wilby in e...,-1


In [17]:
# Vote distribution for this threshold (-1 = ABSTAIN, 0 = SIMPLE).
df_res["test"].value_counts()

test
-1    68
 0    32
Name: count, dtype: int64