In [148]:
import pickle
import pandas as pd
from labeling_functions import get_all_lfs
from snorkel.labeling import PandasLFApplier
from tqdm import tqdm
import numpy as np
from multiprocessing import Process, Manager

In [149]:
def apply_lfs(df, lfs, chunk_start, size, label_dic):
    applier = PandasLFApplier(lfs)
    labels = applier.apply(df[chunk_start:chunk_start+size], progress_bar=False)
    label_dic[chunk_start] = labels

In [150]:
def preprocess_dataset(dataset, sel_ds_id, app_type="simp"):
    
    if sel_ds_id == "eval":
        dataset = dataset[dataset['val_split'] == True]
    else:
        dataset = dataset[dataset['ds_id'] == sel_ds_id]        

    dataset['simplified_snt'] = dataset[app_type]
    dataset['source_snt'] = dataset['src']

    return dataset

In [151]:
def regen_labels(label_dic):
    labels = label_dic[sorted(label_dic.keys())[0]].squeeze()
    for key in sorted(label_dic.keys())[1:]:
        labels = np.concatenate((labels, label_dic[key].squeeze()))
    return labels

In [152]:
def result_for_lf(labels):
    abstain_cnt = np.sum(labels == -1)
    simple_cnt = np.sum(labels == 0)
    complex_cnt = np.sum(labels == 1)
    return abstain_cnt, simple_cnt, complex_cnt

In [153]:
with open("/workspace/datasets/final_combined_with_index_with_arts", 'rb') as f:
    dataset = pickle.load(f)

In [154]:
ds_to_test_on = "arts300"
all_lfs = get_all_lfs()
test_dataset = preprocess_dataset(dataset, ds_to_test_on)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['simplified_snt'] = dataset[app_type]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['source_snt'] = dataset['src']


In [155]:
num_processes = 5
batch_size = 20
all_batch_indices = list(range(0, len(test_dataset), batch_size))

manager = Manager()
label_dic = manager.dict()
procs = []


with tqdm(total = len(all_batch_indices)) as pbar:
    while len(all_batch_indices) > 0:
        for i in range(num_processes):
            current_index = all_batch_indices.pop()
            proc = Process(target=apply_lfs, args=(test_dataset, all_lfs, current_index , batch_size, label_dic))
            procs.append(proc)
            proc.start()
            pbar.update(1)
            
        for proc in procs:
            proc.join()

    label_dic = dict(label_dic)

100%|██████████| 15/15 [02:06<00:00,  8.46s/it]


In [156]:
full_labels = regen_labels(label_dic)

In [157]:
lf_stat_dic = {}

for i in range(len(all_lfs)):
    lf_stat_dic[all_lfs[i].name] = result_for_lf(full_labels[:,i])
    #print(all_lfs[i].name, result_for_lf(full_labels[:,i]))

In [158]:
lf_stats = pd.DataFrame(lf_stat_dic).T
lf_stats.columns = ['Abstain', "Simple", "Complex"]

In [159]:
lf_stats.to_excel("/workspace/lf_stats.xlsx")

In [160]:
lf_stats

Unnamed: 0,Abstain,Simple,Complex
lf_words_cnt_wcount=3_SIMPLE,299,1,0
lf_words_cnt_wcount=4_SIMPLE,297,3,0
lf_words_cnt_wcount=5_SIMPLE,296,4,0
lf_words_cnt_wcount=6_SIMPLE,292,8,0
lf_words_cnt_wcount=7_SIMPLE,285,15,0
...,...,...,...
few_noun_phrases_thres=16_label=1,294,0,6
few_noun_phrases_thres=17_label=1,296,0,4
few_noun_phrases_thres=18_label=1,298,0,2
few_noun_phrases_thres=19_label=1,298,0,2


In [162]:
#how many bad lfs are there?
lf_stats[lf_stats['Abstain'] == 300]

Unnamed: 0,Abstain,Simple,Complex
lf_infrequent_words_cnt=1_dog_SIMPLE,300,0,0
lf_infrequent_words_cnt=1_fish_SIMPLE,300,0,0
entity_token_ratio_text_label=1_thresh=0.5,300,0,0
low_prop_long_words_syllables_long=2_prop=0.01_label=0,300,0,0
low_prop_long_words_syllables_long=2_prop=0.025_label=0,300,0,0
...,...,...,...
num_sents_num_thres=9_label=1,300,0,0
num_sents_num_thres=10_label=1,300,0,0
num_sents_num_thres=11_label=1,300,0,0
few_modifiers_thres=12_label=1,300,0,0


In [163]:
lf_stats[lf_stats['Simple'] == 300]

Unnamed: 0,Abstain,Simple,Complex
entity_token_ratio_text_label=0_thresh=0.75,0,300,0
low_prop_long_words_syllables_long=3_prop=0.4_label=0,0,300,0
low_prop_long_words_syllables_long=4_prop=0.4_label=0,0,300,0
low_prop_long_words_letters_long=9_prop=0.4_label=0,0,300,0
lf_median_age_of_acquisition=8_SIMPLE,0,300,0
...,...,...,...
freq_negations_ratio_label=0_thresh=0.15,0,300,0
freq_negations_ratio_label=0_thresh=0.2,0,300,0
freq_negations_ratio_label=0_thresh=0.25,0,300,0
few_conjunctions_ratio_thres=0.2_label=0,0,300,0


In [164]:
lf_stats[lf_stats['Complex'] == 300]

Unnamed: 0,Abstain,Simple,Complex
lf_avg_age_of_acquisition=3_NOT_SIMPLE,0,0,300
lf_avg_age_of_acquisition=3.5_NOT_SIMPLE,0,0,300
lf_avg_concreteness=4_NOT_SIMPLE,0,0,300
lf_avg_concreteness=4.5_NOT_SIMPLE,0,0,300
unique_entity_total_entity_ratio_paragraph_label=1_thres=0.2,0,0,300
unique_entity_total_entity_ratio_paragraph_label=1_thres=0.3,0,0,300
unique_entity_total_entity_ratio_paragraph_label=1_thres=0.4,0,0,300
few_noun_phrases_ratio_thres=0.1_label=1,0,0,300
