In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.json
import torch

from sklearn.preprocessing import MinMaxScaler
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import labeling_function
from snorkel.labeling.model import LabelModel

def dicttrie(arr):
    """Build a membership lookup from an iterable of strings.

    Despite the name, the result is a flat ``dict`` (not a trie): every
    whitespace-stripped element of ``arr`` becomes a key mapped to ``True``.
    Duplicate keys collapse into a single entry.

    Args:
        arr: Iterable of strings.

    Returns:
        dict mapping each stripped string to ``True``.
    """
    # The progress bar is purely cosmetic, so it is imported here and skipped
    # when tqdm is not installed instead of failing the whole function.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        items = arr
    else:
        items = tqdm(arr)

    trie = {}
    for p in items:
        trie[p.strip()] = True
    return trie

def build_and_save_trie(data, file_path):
    """Build the lookup dict for ``data`` and pickle it to ``file_path``.

    Args:
        data: Iterable of strings, passed straight to ``dicttrie``.
        file_path: Destination file; opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    lookup = dicttrie(data)

    with open(file_path, "wb") as out_file:
        pickle.dump(lookup, out_file)

    print("Save Done")

def plot_probabilities_histogram(Y):
    """Show a 10-bin histogram of the values in ``Y``.

    The x-axis is labelled "Probability of HIGH" and the y-axis
    "Number of data points".

    Args:
        Y: Values to histogram.
    """
    # Draw on an explicit, freshly created axes so the plot never lands on a
    # figure left over from an earlier cell.
    _, ax = plt.subplots()
    ax.hist(Y, bins=10)
    ax.set_xlabel("Probability of HIGH")
    ax.set_ylabel("Number of data points")
    plt.show()

In [None]:
# Score sources to combine; each name is used both as the file stem and as
# the score column read from that file.
methods = ["clip", "dfn", "tmars", "hype"]

df = pd.DataFrame()
for method in methods:
    # PATH for each score file
    path = f"/<FILE_PATH>/{method}.jsonl.gz"

    read_options = pyarrow.json.ReadOptions(block_size=10 << 20)
    table = pyarrow.json.read_json(path, read_options)

    # Only the join key and this method's score column are kept.
    columns = ["key", method]
    df_ = table.select(columns).to_pandas()

    if len(df) == 0:
        # Nothing accumulated yet: this frame becomes the starting point.
        df = df_
        continue

    prev_len = len(df)
    df = df.merge(df_, on="key")
    # The join on "key" must leave the number of rows unchanged.
    assert len(df) == prev_len

# dataframe df contains quality scores for each data sample
# Rescale every score column to [0, 1], overwriting the columns in place.
scaler = MinMaxScaler()
df[methods] = scaler.fit_transform(df[methods])

In [None]:
# optimal thresholds
# b_* is the top fraction of samples (ranked by score) around which each
# abstain band is centred, for clip, dfn, tmars and hype respectively.
b_c, b_d, b_t, b_h = 0.2, 0.1, 0.1, 0.1
# Relative half-width of the abstain band: the band spans the top
# b * (1 - bandwidth) to the top b * (1 + bandwidth) fraction of samples.
bandwidth = 0.25


def _band_thresholds(scores, top_fraction, bandwidth):
    """Return the (upper, lower) score thresholds of one abstain band.

    Args:
        scores: Score values for one method.
        top_fraction: Fraction of top-scoring samples the band is centred on.
        bandwidth: Relative half-width of the band around ``top_fraction``.

    Returns:
        Tuple ``(upper, lower)``: ``upper`` is the percentile that leaves the
        top ``top_fraction * (1 - bandwidth)`` of samples above it, ``lower``
        the one that leaves the top ``top_fraction * (1 + bandwidth)`` above.
    """
    upper = np.percentile(scores, 100 - top_fraction * (1 - bandwidth) * 100)
    lower = np.percentile(scores, 100 - top_fraction * (1 + bandwidth) * 100)
    return upper, lower


clip_th1, clip_th0 = _band_thresholds(df["clip"], b_c, bandwidth)

dfn_th1, dfn_th0 = _band_thresholds(df["dfn"], b_d, bandwidth)

tmars_th1, tmars_th0 = _band_thresholds(df["tmars"], b_t, bandwidth)

# Uses `bandwidth` like the other three methods; the previous `ratio` was
# never defined and raised NameError.
hype_th1, hype_th0 = _band_thresholds(df["hype"], b_h, bandwidth)
# Band-free cut-off at exactly the top b_h fraction of hype scores.
hype_th = np.percentile(df["hype"], 100 - b_h * (1) * 100)

In [None]:
# Label values returned by the labeling functions in this cell.
ABSTAIN = -1  # the labeling function casts no vote for this sample
LOW = 0
HIGH = 1

@labeling_function()
def clip_filter(x):
    """Vote on a sample from its "clip" score.

    Returns HIGH when the score is at least ``clip_th1``, ABSTAIN when it is
    at least ``clip_th0``, and LOW otherwise.
    """
    score = x["clip"]
    if score >= clip_th1:
        return HIGH
    if score >= clip_th0:
        return ABSTAIN
    return LOW
        
@labeling_function()
def dfn_filter(x):
    """Vote on a sample from its "dfn" score.

    Returns HIGH when the score is at least ``dfn_th1``, ABSTAIN when it is
    at least ``dfn_th0``, and LOW otherwise.
    """
    score = x["dfn"]
    if score >= dfn_th1:
        return HIGH
    if score >= dfn_th0:
        return ABSTAIN
    return LOW

@labeling_function()
def tmars_filter(x):
    """Vote on a sample from its "tmars" score.

    Returns HIGH when the score is at least ``tmars_th1``, ABSTAIN when it is
    at least ``tmars_th0``, and LOW otherwise.
    """
    score = x["tmars"]
    if score >= tmars_th1:
        return HIGH
    if score >= tmars_th0:
        return ABSTAIN
    return LOW
        
@labeling_function()
def hype_filter(x):
    """Vote on a sample from its "hype" score.

    Returns HIGH when the score is at least ``hype_th1``, ABSTAIN when it is
    at least ``hype_th0``, and LOW otherwise.
    """
    score = x["hype"]
    if score >= hype_th1:
        return HIGH
    if score >= hype_th0:
        return ABSTAIN
    return LOW
        
# One labeling function per score source, in the same order as the columns
# of the label matrix produced below.
lfs = [
    clip_filter,
    dfn_filter,
    tmars_filter,
    hype_filter,
]

# Run every labeling function over every row of df.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)

# LFAnalysis(L=L_train, lfs=lfs).lf_summary()

In [None]:
# Train on the GPU when torch can see one, otherwise on the CPU, so this cell
# also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
label_model = LabelModel(cardinality=HIGH+1, verbose=True, device=device)
label_model.fit(L_train, n_epochs=500, seed=123, lr=0.01, log_freq=50)

# Column HIGH of the probability matrix is the per-sample probability of HIGH.
probs_train = label_model.predict_proba(L=L_train)
plot_probabilities_histogram(probs_train[:, HIGH])

preds = label_model.predict(L=L_train)
# A holds the distinct predicted labels, B how many samples received each one.
A, B = np.unique(preds, return_counts=True)

# Keep only the keys of the samples predicted HIGH.
mask_values = preds == HIGH
keys = df.loc[mask_values, "key"]

# The selected keys are written as a pickled lookup dict.
file_path = "<OUTPUT_PATH>"
build_and_save_trie(keys, file_path)