In [None]:
import pickle
import numpy as np
from labeling_functions import get_all_lfs
from itables import init_notebook_mode

# Switch on itables with all_interactive=True (presumably this renders every
# displayed DataFrame as an interactive table — confirm against the itables docs).
init_notebook_mode(all_interactive=True)
# Labeling functions from the project-local `labeling_functions` module; passed
# as `lfs=` to LFAnalysis further down.
all_lfs = get_all_lfs()


In [None]:
# Load the two pickled label sets that are analysed with LFAnalysis below
# (presumably one label matrix per text variant, "simp" and "src" — confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by a trusted pipeline.
# The `with` blocks close each file handle as soon as its content is loaded.
with open("/workspace/datasets/eval_simp_labels.pkl", "rb") as simp_file:
    labels_simp = pickle.load(simp_file)
with open("/workspace/datasets/eval_src_labels.pkl", "rb") as src_file:
    labels_src = pickle.load(src_file)

In [None]:
from snorkel.labeling import LFAnalysis
# One LF summary table per label set, computed against the same list of
# labeling functions. Later cells read the 'Polarity' and 'Coverage' columns.
lfa_simp = LFAnalysis(L=labels_simp, lfs=all_lfs).lf_summary()
lfa_src = LFAnalysis(L=labels_src, lfs=all_lfs).lf_summary()

In [None]:
# Display the LF summary computed from `labels_simp`.
lfa_simp

In [None]:
# Display the LF summary computed from `labels_src`.
lfa_src

In [None]:
import pandas as pd

# Write both LF summaries to Excel so they can be inspected outside the notebook.
for summary, xlsx_path in (
    (lfa_simp, "/workspace/datasets/labels_simp.xlsx"),
    (lfa_src, "/workspace/datasets/labels_src.xlsx"),
):
    pd.DataFrame(summary).to_excel(xlsx_path)

In [None]:
# Wrap both LF summaries as DataFrames; the merge step below iterates df_simp
# and looks up the matching row in df_src by index label.
df_simp, df_src = (pd.DataFrame(summary) for summary in (lfa_simp, lfa_src))

In [None]:
# List the summary's column names (the merge step below relies on 'Polarity'
# and 'Coverage').
df_simp.columns

In [None]:
# Build one record per labeling function that reports a polarity on at least
# one of the two label sets, combining its coverage on both.
# Record layout: [name, polarity, cov_simp, cov_src, precision, 1 - precision,
#                 total coverage, absolute coverage gap, gap / total coverage]
merged_data = []

for lf_name, simp_row in df_simp.iterrows():
    # Same LF in the other summary, matched by index label.
    src_row = df_src.loc[lf_name]
    cov_simp = simp_row['Coverage']
    cov_src = src_row['Coverage']

    # Use the first polarity listed on the simp side, falling back to the src
    # side; -1 is the "no polarity anywhere" sentinel.
    if len(simp_row['Polarity']) > 0:
        polarity = simp_row['Polarity'][0]
    elif len(src_row['Polarity']) > 0:
        polarity = src_row['Polarity'][0]
    else:
        polarity = -1

    # LFs without a polarity are left out of the merged table.
    if polarity == -1:
        continue

    total_cov = cov_simp + cov_src
    cov_gap = abs(cov_simp - cov_src)

    # Share of the combined coverage that falls on the side the polarity points
    # at: polarity 0 counts the simp side, any other polarity the src side.
    precision = (cov_simp if polarity == 0 else cov_src) / total_cov

    merged_data.append([
        lf_name,
        polarity,
        cov_simp,
        cov_src,
        precision,
        1 - precision,
        total_cov,
        cov_gap,
        cov_gap / total_cov,
    ])



In [None]:
# Turn the merged records into a table; the column names follow the order of
# the values in each `merged_data` record.
# The names are passed to the constructor (instead of being assigned to
# `.columns` afterwards) so that an empty `merged_data` still yields an empty
# frame with the expected columns rather than a length-mismatch ValueError.
df_md = pd.DataFrame(
    merged_data,
    columns=['name', 'polarity', 'cov_simp', 'cov_src', 'precision', 'inv_precision', 'total_coverage', 'distance', 'norm_dist'],
)

#### Keep/throw decisions for LFs

In [None]:
def _lf_decision(lf):
    """Return the keep/throw verdict for one row of the merged LF table.

    The result is 'JA', 'INVERSE' or 'NEIN' (presumably: keep, keep with the
    label flipped, throw — confirm the meaning of 'INVERSE' with the author).
    """
    precision = lf['precision']
    total_coverage = lf['total_coverage']

    # Non-positive precision is always rejected.
    if not precision > 0:
        return 'NEIN'

    if total_coverage >= 0.05:
        # Well-covered LFs: accept on high precision alone, or on moderate
        # precision combined with a noticeable normalised coverage gap.
        if precision >= 0.7 or (precision >= 0.5 and lf['norm_dist'] >= 0.05):
            return 'JA'
        # Clearly below chance with a real coverage gap.
        if precision <= 0.3 and lf['distance'] >= 0.01:
            return 'INVERSE'
        return 'NEIN'

    # Low-coverage LFs need better-than-chance precision, a minimum gap and a
    # minimum total coverage.
    if precision > 0.5 and lf['distance'] >= 0.005 and total_coverage >= 0.02:
        return 'JA'
    return 'NEIN'


decision = [_lf_decision(lf_row) for _, lf_row in df_md.iterrows()]

df_md['decision'] = decision

#### Which LFs might need additional thresholds

In [None]:
!pip install Levenshtein

In [None]:
from Levenshtein import distance

In [None]:
def are_names_similar(a, b):
    """Tell whether two LF rows look like variants of the same labeling function.

    Both arguments are indexed by 'polarity' and 'name'. The rows count as
    similar when their polarities are equal and the Levenshtein distance
    between their names is at most 2.

    Returns:
        True if the rows are similar, otherwise False.
    """
    # Different polarity rules the pair out before any string comparison.
    if a['polarity'] != b['polarity']:
        return False

    edit_distance = distance(a['name'], b['name'])
    return edit_distance <= 2


In [None]:
# For every LF, record whether it sits at the edge of a run of similarly named
# LFs and outperforms its similar neighbour ('NEW'), or not ('IGNORE').
# Only the directly preceding and following rows of df_md are compared.
thresholds = []

# Placeholder neighbour for the first and last row. Its polarity of -1 differs
# from every row of df_md (rows with polarity -1 are skipped when the merged
# data is built), so are_names_similar() is always False for it.
empty_row = pd.DataFrame([['nothing', -1, 0, 0, 0, 0, 0, 0, 0, -1]])
empty_row.columns = ['name', 'polarity', 'cov_simp', 'cov_src', 'precision', 'inv_precision', 'total_coverage', 'distance', 'norm_dist', 'decision']
no_neighbour = empty_row.iloc[0]
last_index = len(df_md) - 1

for index, curr in df_md.iterrows():
    # `index` is also used as the row position; this holds because df_md has
    # the default 0..n-1 index.
    prev_row = no_neighbour if index == 0 else df_md.iloc[index - 1]
    # Deliberately not called `next`: that would overwrite the builtin for
    # every cell executed afterwards in the same kernel.
    next_row = no_neighbour if index == last_index else df_md.iloc[index + 1]

    verdict = 'IGNORE'
    if curr['precision'] > 0.5 and curr['norm_dist'] > 0.1:
        similar_to_prev = are_names_similar(prev_row, curr)
        similar_to_next = are_names_similar(next_row, curr)

        # Pick the single similar neighbour, if the row has exactly one.
        if similar_to_prev and not similar_to_next:
            neighbour = prev_row
        elif similar_to_next and not similar_to_prev:
            neighbour = next_row
        else:
            neighbour = None

        # 'NEW' when the row beats that neighbour on precision or on the
        # normalised coverage gap.
        if neighbour is not None and (
            curr['precision'] > neighbour['precision']
            or curr['norm_dist'] > neighbour['norm_dist']
        ):
            verdict = 'NEW'

    thresholds.append(verdict)

df_md['new_thresholds'] = thresholds

In [None]:
# Persist the merged LF table, including the 'decision' and 'new_thresholds'
# columns added above, as an Excel sheet.
df_md.to_excel("/workspace/datasets/merged_label_data.xlsx")  


#### 