In [None]:
import pandas as pd
import flyingsquid
from pathlib import Path
import zipfile
from snorkel.labeling import labeling_function
import math
import numpy as np
import sklearn

In [None]:
# create training and dev set
def get_train_dev_set(panda_match_file):
    """Load one Panda match-label CSV into a DataFrame.

    The file is looked up at
    ``panda_matches/panda_label_<panda_match_file>.csv`` relative to the
    current working directory.

    NOTE: despite the name, no train/dev split is performed yet; the raw
    contents of the CSV are returned unchanged.

    Args:
        panda_match_file: Identifier interpolated into the CSV file name.

    Returns:
        pandas.DataFrame with the contents of the match file.
    """
    csv_name = f"panda_label_{panda_match_file}.csv"
    csv_path = Path("panda_matches").joinpath(csv_name)
    # use snorkel.label.apply.pandas.ApplyLFPandas but modified to work on tuples?
    return pd.read_csv(csv_path)

Order of events:
- [ ] Labeling function development with Panda
- [ ] Word embeddings for string columns
- [ ] Candidate set creation with KNN
- [ ] Correct infrastructure for LF applier and LF model
- [ ] LF Applier
- [ ] LF Model

String column word embeddings - PyTorch?:
- need to learn word embeddings so the string column can be projected into vector space

Candidate Set - sklearn:
- all the left dataframe rows matched with all potential right dataframe rows
- left has shape (l, i), right has shape (r, j)
- candidate set has shape (n, k), where n <= l × r and k <= i + j
- In Panda, the candidate set (blocked tuples) is selected using a modified top-k cosine similarity search.
  - Intuitively, project all tuples from both the left and right tables into dense vectors, then perform a top-k search with respect to each left tuple and a top-k search with respect to each right tuple.
  - 1. for each left tuple, consider top k similar right tuples
  - 2. for each right tuple consider top k similar left tuples
  - 3. consider the union of these pairs
  - Example numbers: left - 18182, right - 194309, candidate set - 82225
- apply manual blocking: report_year must be an exact match

LF Applier - Snorkel:
- takes candidate set dataframe
- takes m labeling functions
- for each row apply each labeling function
- returns a matrix (n, m)

Labeling Model - FlyingSquid:
- need to refine labeling functions

In [None]:
# implement modified top k cosine similarity search
def create_candidate_set(path_to_zip_file, test_size=False):
    """Build a candidate set as the cross product of the left and right tables.

    Reads ``left.csv`` and ``right.csv`` from the zip archive and pairs every
    left row with every right row. Column names present in both tables get
    the suffixes ``_ferc`` (left) and ``_eia`` (right).

    NOTE: this is still the brute-force cross join; the top-k cosine
    similarity search mentioned above is not implemented here.

    Args:
        path_to_zip_file: Path to a zip archive containing ``left.csv`` and
            ``right.csv``.
        test_size: Row limit applied to each table before the cross join.
            A falsy value (``False``, ``0``, ``None``) keeps all rows.
            ``True`` keeps the first 20 rows of each table. A positive
            integer keeps that many rows of each table.

    Returns:
        pandas.DataFrame with ``len(left) * len(right)`` rows.

    Raises:
        ValueError: If ``test_size`` is a negative integer.
    """
    default_test_rows = 20

    with zipfile.ZipFile(path_to_zip_file, "r") as archive:
        # pandas does not close handles it is given, so close each archive
        # member explicitly instead of leaking it until garbage collection.
        with archive.open("left.csv") as left_file:
            ferc = pd.read_csv(left_file)
        with archive.open("right.csv") as right_file:
            eia = pd.read_csv(right_file)

    if test_size:
        # bool is a subclass of int, so rule it out before treating the
        # argument as an explicit row count.
        if isinstance(test_size, int) and not isinstance(test_size, bool):
            if test_size < 0:
                raise ValueError(
                    f"test_size must be non-negative, got {test_size}"
                )
            n_rows = test_size
        else:
            n_rows = default_test_rows
        ferc = ferc[:n_rows]
        eia = eia[:n_rows]

    return ferc.merge(eia, how="cross", suffixes=("_ferc", "_eia"))

In [None]:
# Build a small candidate set for quick experimentation: test_size=True makes
# create_candidate_set keep only the first 20 rows of each table before the
# cross join.
df = create_candidate_set("panda_inputs/2020_gens.zip", test_size=True)

In [None]:
# Load the raw left/right tables (bound to `ferc` / `eia`) straight from the
# zip archive and keep only the first 100 rows of each for experimentation.
with zipfile.ZipFile("panda_inputs/2020_gens.zip", 'r') as z:
    # pandas does not close handles it is given, so close each archive member
    # explicitly rather than leaving it open until garbage collection.
    with z.open("left.csv") as left_file:
        ferc = pd.read_csv(left_file)
    with z.open("right.csv") as right_file:
        eia = pd.read_csv(right_file)
ferc = ferc[:100]
eia = eia[:100]

In [None]:
# Rebuild `df` without a row limit, replacing any earlier value of `df`.
# The result is a full cross join (len(left) * len(right) rows), so this can
# be very large for real inputs.
df = create_candidate_set("panda_inputs/2020_gens.zip")

### Test KNN Candidate Set Creation

- How to embed string columns? - use word embeddings?
- Normalize vectors
- Only using shared columns?

In [None]:
# Load a pickled object from the working directory into `ppl`
# (presumably the cleaned EIA plant-parts table, judging by the file name —
# TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
ppl = pd.read_pickle("full_eia_plant_parts_clean.pkl")

In [None]:
# Load a pickled object from the working directory into `ferc_clean`
# (presumably the cleaned FERC table, judging by the file name — TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
ferc_clean = pd.read_pickle("full_ferc_clean.pkl")

### Test LF Applier

In [None]:
# create labeling functions
def _years_within_tolerance(x, y, max_diff=2):
    """Return True when both years are present and differ by less than max_diff.

    Missing values (NaN, None, pd.NA) never match. ``pd.isna`` is used
    instead of ``math.isnan`` because the latter raises ``TypeError`` on
    ``None`` and ``pd.NA``.
    """
    if pd.isna(x) or pd.isna(y):
        return False
    return abs(x - y) < max_diff


# NOTE(review): both labeling functions read columns suffixed `_x` / `_y`
# (the pandas default merge suffixes), whereas create_candidate_set in this
# file merges with suffixes `_ferc` / `_eia` — confirm the column names of the
# DataFrame these are applied to.
@labeling_function()
def installation_year_match(row):
    """Label 1 if the two installation years differ by less than 2, else 0.

    Args:
        row: Candidate pair exposing ``installation_year_x`` and
            ``installation_year_y`` as attributes.

    Returns:
        1 when both years are present and ``abs(x - y) < 2``; otherwise 0
        (including when either year is missing).
    """
    matched = _years_within_tolerance(
        row.installation_year_x, row.installation_year_y
    )
    return 1 if matched else 0


@labeling_function()
def construction_year_match(row):
    """Label 1 if the two construction years differ by less than 2, else 0.

    Args:
        row: Candidate pair exposing ``construction_year_x`` and
            ``construction_year_y`` as attributes.

    Returns:
        1 when both years are present and ``abs(x - y) < 2``; otherwise 0
        (including when either year is missing).
    """
    matched = _years_within_tolerance(
        row.construction_year_x, row.construction_year_y
    )
    return 1 if matched else 0

In [None]:
from snorkel.labeling import PandasLFApplier

In [None]:
# Take the first 1000 rows of the candidate set as a small sample for trying
# out the labeling-function applier.
test = df[:1000]

In [None]:
# Apply both labeling functions to every row of the sample.
# Per the notes above, the result is expected to be an (n, m) matrix: one row
# per candidate pair, one column per labeling function.
applier = PandasLFApplier([installation_year_match, construction_year_match])
labels = applier.apply(test)

In [None]:
# Row indices of every entry of `labels` equal to 1. Assuming `labels` is 2-D
# (rows = candidates, columns = LFs), a row index is repeated once for each
# labeling function that returned 1 for it.
np.where(labels == 1)[0]

In [None]:
'''
This example code shows a bare-minimum example of how to get FlyingSquid up and
running.
'''

from flyingsquid.label_model import LabelModel

# n_train: number of training records
# n_dev: number of dev records
# m: number of LFs
# L_train shape: (n_train, m)
# L_dev shape: (n_dev, m)
# Y_dev: (n_dev,)
# n_train + n_dev = total candidate set size

# NOTE: `synthetic_data_basics` is neither defined nor imported in this
# notebook, so calling it raised NameError. The call now sits inside the
# disabled example below, together with the code that depends on its outputs,
# until real L_train / L_dev / Y_dev matrices are available.
'''
L_train, L_dev, Y_dev = synthetic_data_basics()
m = L_train.shape[1]
label_model = LabelModel(m)

label_model.fit(L_train)

preds = label_model.predict(L_dev).reshape(Y_dev.shape)
accuracy = np.sum(preds == Y_dev) / Y_dev.shape[0]

print('Label model accuracy: {}%'.format(int(100 * accuracy)))
'''