In [None]:
import numpy as np
import pandas as pd
import os
import tqdm
import pickle
from tqdm.notebook import tqdm

from functools import lru_cache

from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

#from labeling_functions import get_all_lfs
from pruning_lfs import prune_lfs

import glob

In [None]:
# Collect the labeling functions to apply. `prune_lfs` comes from the local
# `pruning_lfs` module; later cells call len() on the result, read `.name` on
# each element and hand the collection to PandasLFApplier.
all_lfs = prune_lfs()

In [None]:
# Sanity check: how many labeling functions are left after pruning.
len(all_lfs)

In [None]:
# Read the combined dataset from its pickle file.
# NOTE(review): this path is relative to the kernel's working directory, while
# most other cells use the absolute "/workspace/..." form — confirm they point
# at the same location.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
with open("workspace/datasets/final_combined_with_index.pkl", mode='rb') as dataset_handle:
    dataset = pickle.load(dataset_handle)

In [None]:
# Integer codes for the possible labels (presumably the values the labeling
# functions emit — verify against pruning_lfs).
ABSTAIN, SIMPLE, NOT_SIMPLE, LOST_MEANING = -1, 0, 1, 2

# Reverse lookup from integer code to a readable label name.
label_map = {
    ABSTAIN: "ABSTAIN",
    SIMPLE: "SIMPLE",
    NOT_SIMPLE: "NOT_SIMPLE",
    LOST_MEANING: "LOST_MEANING",
}

In [None]:
# Which text column gets labeled in this run. The value selects the dataset
# column copied into 'simplified_snt' and is embedded in the batch file names
# ("labels_<current_type>_<offset>"). Swap the two lines below to label the
# source sentences instead.
current_type = "simp"
#current_type = "src"

In [None]:
# Expose the column chosen by `current_type` as 'simplified_snt' and the 'src'
# column as 'source_snt' (presumably the column names the labeling functions
# read — verify against pruning_lfs).
# NOTE(review): this mutates `dataset` in place; after changing `current_type`
# this cell must be re-run before labeling.
dataset['simplified_snt'] = dataset[current_type]
dataset['source_snt'] = dataset['src']

In [32]:
# Sub-dataset to process: rows are selected by comparing the 'ds_id' column to
# this value, and it is also used to build the label directory name.
selected_dataset = "MTurkSF"

In [None]:
# Keep only the rows that belong to the selected sub-dataset.
# NOTE(review): this rebinds `dataset`, so the full frame is gone afterwards;
# switching `selected_dataset` requires reloading the pickle first, otherwise
# the filter yields an empty frame.
dataset = dataset[dataset['ds_id'] == selected_dataset]

In [None]:
# Display the filtered frame for a visual check.
dataset

In [None]:
# Directory that holds the per-batch label files of the selected dataset.
# The directory name is the dataset id immediately followed by "labels"
# (no separator), e.g. ".../MTurkSFlabels/".
label_path = f"/workspace/datasets/{selected_dataset}labels/"

In [None]:
# Make sure the per-batch label directory exists. `exist_ok=True` replaces the
# separate isdir() check (no window between check and creation) and missing
# parent directories are created as well.
os.makedirs(label_path, exist_ok=True)

In [None]:
def get_finished_batches(path=None, label_type=None):
    """Return the start offsets of the label batches already saved on disk.

    Batch files are named ``labels_<type>_<offset>``; the offset is the part of
    the file name after the last underscore.

    Args:
        path: Glob pattern matching the files to inspect. When omitted, the
            pattern ``/workspace/datasets/{selected_dataset}labels/*`` is built
            at call time, so it follows later changes of ``selected_dataset``
            instead of being frozen when this function is defined.
        label_type: Optional type tag (e.g. ``"simp"`` or ``"src"``). When
            given, only files named ``labels_<label_type>_<offset>`` are
            counted. ``None`` counts the batches of every type.

    Returns:
        set[int]: The offsets of all matching batch files.
    """
    if path is None:
        path = f"/workspace/datasets/{selected_dataset}labels/*"

    finished = set()
    for label_file in glob.glob(path):
        # Look at the file name only, so an underscore or "labels_" in a
        # parent directory name cannot influence the result.
        file_name = os.path.basename(label_file)
        if "labels_" not in file_name:
            continue

        stem, _, offset = file_name.rpartition("_")
        if label_type is not None and stem != f"labels_{label_type}":
            continue

        try:
            finished.add(int(offset))
        except ValueError:
            # Not a batch file (e.g. a leftover "labels_simp_0.tmp"); ignore it
            # rather than aborting the whole scan.
            continue
    return finished

In [None]:
def save_used_lfs(all_lfs, path=None):
    """Pickle the names of the given labeling functions.

    Args:
        all_lfs: Iterable of objects exposing a ``name`` attribute.
        path: Target file. When omitted, the file
            ``/workspace/datasets/{selected_dataset}labels/used_lfs.pkl`` is
            used, resolved at call time so it follows later changes of
            ``selected_dataset``.

    The file contains a plain list of names in the order of ``all_lfs``.
    """
    if path is None:
        path = f"/workspace/datasets/{selected_dataset}labels/used_lfs.pkl"

    names = [lf.name for lf in all_lfs]
    # Context manager so the handle is flushed and closed deterministically.
    with open(path, "wb") as out_file:
        pickle.dump(names, out_file)

In [None]:
batch_size = 20
start = 0

# Record which labeling functions produced the label files of this run.
save_used_lfs(all_lfs)

# The applier does not depend on the batch, so it is built once up front.
applier = PandasLFApplier(all_lfs)

for i in tqdm(range(start, len(dataset), batch_size), position=1):
    batch_file = f"/workspace/datasets/{selected_dataset}labels/labels_{current_type}_{i}"

    # Resume support: skip a batch only when the file for THIS label type
    # exists. Checking the offset alone would treat a finished "simp" batch as
    # a finished "src" batch (and vice versa) and silently skip the whole run.
    if os.path.exists(batch_file):
        continue

    try:
        # Row-position slice; the index of the filtered frame is not contiguous.
        labels = applier.apply(dataset.iloc[i:i+batch_size], progress_bar=True)

        with open(batch_file, "wb") as out_file:
            pickle.dump(labels, out_file)
        print(f"finished on {i}/{len(dataset)}")

    except Exception as e:
        # Best effort: report the failing batch and carry on with the next one.
        print(f"something went wrong with batch {i}")
        print(e)
        # A failed dump can leave a truncated file behind; remove it so the
        # batch is retried instead of being mistaken for a finished one.
        if os.path.exists(batch_file):
            os.remove(batch_file)

In [16]:
# Build the merged per-dataset label matrices.
# Directory that receives one merged pickle per dataset and label type.
ds_label_path = "/workspace/datasets/ds_labels"
# exist_ok=True makes the cell safe to re-run and also creates missing parents.
os.makedirs(ds_label_path, exist_ok=True)

In [29]:
def _batch_sort_key(path):
    """Sort key that orders batch files by the integer after the last underscore."""
    suffix = path.rsplit("_", 1)[-1]
    try:
        return (0, int(suffix), path)
    except ValueError:
        # Files without a numeric suffix are kept, after all numbered batches.
        return (1, 0, path)


def _load_and_stack(paths, kind, ds):
    """Unpickle every batch in `paths` and concatenate them along axis 0."""
    if not paths:
        raise ValueError(f"no '{kind}' label batches found for dataset {ds!r}")
    batches = []
    for batch_path in paths:
        with open(batch_path, "rb") as batch_file:
            batches.append(pickle.load(batch_file))
    return np.concatenate(batches)


def build_labels(ds="MTurkSF", labels_root="workspace/datasets", out_dir=None):
    """Merge the per-batch label files of a dataset into one array per type.

    Reads every ``{labels_root}/{ds}labels/labels_simp*`` and ``labels_src*``
    file, concatenates each group in ascending batch-offset order and pickles
    the results to ``{out_dir}/{ds}_simp_labels.pkl`` and
    ``{out_dir}/{ds}_src_labels.pkl``.

    The batches are ordered by their numeric offset. A plain lexicographic sort
    puts ``labels_simp_100`` before ``labels_simp_20`` and would therefore
    concatenate the rows in the wrong order.

    Args:
        ds: Dataset id used in the directory and output file names.
        labels_root: Directory containing the ``{ds}labels`` folder. The
            default is relative to the current working directory.
        out_dir: Output directory. Defaults to the module-level
            ``ds_label_path``.

    Raises:
        ValueError: If no batch file exists for one of the two label types.

    NOTE(review): the batch files are unpickled; only use trusted files.
    """
    if out_dir is None:
        out_dir = ds_label_path

    simp_paths = sorted(glob.glob(f"{labels_root}/{ds}labels/labels_simp*"), key=_batch_sort_key)
    src_paths = sorted(glob.glob(f"{labels_root}/{ds}labels/labels_src*"), key=_batch_sort_key)

    print(simp_paths)
    # Load both groups before writing anything, so a missing group leaves no
    # half-finished output behind.
    simp_labels = _load_and_stack(simp_paths, "simp", ds)
    src_labels = _load_and_stack(src_paths, "src", ds)

    with open(f"{out_dir}/{ds}_simp_labels.pkl", "wb") as out_file:
        pickle.dump(simp_labels, out_file)
    with open(f"{out_dir}/{ds}_src_labels.pkl", "wb") as out_file:
        pickle.dump(src_labels, out_file)

In [33]:
# Show which dataset id is currently selected before merging its labels.
selected_dataset

'MTurkSF'

In [34]:
# Merge the per-batch label files of the selected dataset; the function prints
# the list of "simp" batch files it found.
build_labels(selected_dataset)

['workspace/datasets/MTurkSFlabels/labels_simp_0', 'workspace/datasets/MTurkSFlabels/labels_simp_100', 'workspace/datasets/MTurkSFlabels/labels_simp_200']


In [26]:
# Spot check: load one merged label file back from disk.
# NOTE(review): this reads the "britannica" file, not the dataset selected
# above — confirm that is intended.
with open("/workspace/datasets/ds_labels/britannica_simp_labels.pkl", "rb") as label_file:
    test = pickle.load(label_file)

In [28]:
# Dimensions of the loaded label array (presumably rows = sentences and
# columns = labeling functions — TODO confirm).
test.shape

(600, 311)