In [121]:
import pickle
import pandas as pd
from labeling_functions import get_all_lfs
from snorkel.labeling import PandasLFApplier
from tqdm import tqdm
import numpy as np
from multiprocessing import Process, Manager

In [122]:
def apply_lfs(df, lfs, chunk_start, size, label_dic):
    applier = PandasLFApplier(lfs)
    labels = applier.apply(df[chunk_start:chunk_start+size], progress_bar=False)
    label_dic[chunk_start] = labels

In [123]:
def preprocess_dataset(dataset, sel_ds_id, app_type="simp"):
    
    if sel_ds_id == "eval":
        dataset = dataset[dataset['val_split'] == True]
    else:
        dataset = dataset[dataset['ds_id'] == sel_ds_id]        

    dataset['simplified_snt'] = dataset[app_type]
    dataset['source_snt'] = dataset['src']

    return dataset

In [124]:
def regen_labels(label_dic):
    labels = label_dic[sorted(label_dic.keys())[0]].squeeze()
    for key in sorted(label_dic.keys())[1:]:
        labels = np.concatenate((labels, label_dic[key].squeeze()))
    return labels

In [125]:
def result_for_lf(labels):
    abstain_cnt = np.sum(labels == -1)
    simple_cnt = np.sum(labels == 0)
    complex_cnt = np.sum(labels == 1)
    return abstain_cnt, simple_cnt, complex_cnt

In [126]:
with open("/workspace/datasets/final_combined_with_index_with_arts", 'rb') as f:
    dataset = pickle.load(f)

In [127]:
ds_to_test_on = "arts300"
all_lfs = get_all_lfs()
test_dataset = preprocess_dataset(dataset, ds_to_test_on)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['simplified_snt'] = dataset[app_type]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['source_snt'] = dataset['src']


In [128]:
num_processes = 5
batch_size = 20
all_batch_indices = list(range(0, len(test_dataset), batch_size))

manager = Manager()
label_dic = manager.dict()
procs = []


with tqdm(total = len(all_batch_indices)) as pbar:
    while len(all_batch_indices) > 0:
        for i in range(num_processes):
            current_index = all_batch_indices.pop()
            proc = Process(target=apply_lfs, args=(test_dataset, all_lfs, current_index , batch_size, label_dic))
            procs.append(proc)
            proc.start()
            pbar.update(1)
            
        for proc in procs:
            proc.join()

    label_dic = dict(label_dic)

100%|██████████| 15/15 [02:40<00:00, 10.67s/it]


In [129]:
full_labels = regen_labels(label_dic)

In [130]:
lf_stat_dic = {}

for i in range(len(all_lfs)):
    lf_stat_dic[all_lfs[i].name] = result_for_lf(full_labels[:,i])
    #print(all_lfs[i].name, result_for_lf(full_labels[:,i]))

In [135]:
lf_stats = pd.DataFrame(lf_stat_dic).T
lf_stats.columns = ['Abstain', "Simple", "Complex"]

In [136]:
lf_stats.to_excel("/workspace/lf_stats.xlsx")

In [137]:
lf_stats

Unnamed: 0,Abstain,Simple,Complex
lf_words_cnt_wcount=3_SIMPLE,299,1,0
lf_words_cnt_wcount=4_SIMPLE,297,3,0
lf_words_cnt_wcount=5_SIMPLE,296,4,0
lf_words_cnt_wcount=6_SIMPLE,292,8,0
lf_words_cnt_wcount=7_SIMPLE,285,15,0
...,...,...,...
few_noun_phrases_thres=16_label=1,294,0,6
few_noun_phrases_thres=17_label=1,296,0,4
few_noun_phrases_thres=18_label=1,298,0,2
few_noun_phrases_thres=19_label=1,298,0,2
