In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier

In [2]:
import dataset
data = dataset.loadAddressPair()

In [40]:
MATCH = 2
PARTIALMATCH = 1
NOMATCH = 0
NONE = -1

@labeling_function()
def citySimilarity(x):
    if(x['cos6'] < 0.8):
        return NOMATCH
    else:
        return NONE
    

@labeling_function()
def match(x):
    similarities = ['cos0', 'cos1', 'cos2', 'cos3', 'cos4', 'cos5', 'cos6', 'cos9', 'cos10', 'cos11'
                         ,'cos12', 'cos13','cos14','cos15','cos16','cos17','cos18','cos19','cos20']
    nbPositiveSimilarity = 0
    nbNegativeSimilarity = 0
    for similarity in similarities:
        if(x[similarity] < 0.8 and x[similarity] > 0 ):
            return NONE
    return MATCH


@labeling_function()
def partialOrNoMatch(x):
    #Que la ville = NoMatch 
    #Au moins une similarité : partiel
    citySimilarity = 'cos6'
    otherSimilarities = ['cos0', 'cos1', 'cos2', 'cos3', 'cos4', 'cos5', 'cos9', 'cos10', 'cos11'
                         ,'cos12', 'cos13','cos14','cos15','cos16','cos17','cos18','cos19','cos20']
    nbPositiveSimilarity = 0
    nbNegativeSimilarity = 0
    for similarity in otherSimilarities:
        if(x[similarity] >= 0.8 or x[similarity]==0):
            nbPositiveSimilarity += 1
        else:
            nbNegativeSimilarity += 1
    if(x[citySimilarity] >= 0.8 and nbPositiveSimilarity > 0 and nbNegativeSimilarity > 0):
        return PARTIALMATCH
    elif(x[citySimilarity] >= 0.8 and nbPositiveSimilarity == 0): 
        return NOMATCH
    else:
        return NONE
    
    

@labeling_function()
def ruleClassif(x):
    if(x['cos6'] < 0.6):
        return NOMATCH
    citySimilarity = 'cos6'
    otherSimilarities = ['cos0', 'cos1', 'cos2', 'cos3', 'cos4', 'cos5', 'cos9', 'cos10', 'cos11'
                         ,'cos12', 'cos13','cos14','cos15','cos16','cos17','cos18','cos19','cos20']
    nbPositiveSimilarity = 0
    nbNegativeSimilarity = 0
    for similarity in otherSimilarities:
        if(x[similarity] >= 0.6):
            nbPositiveSimilarity += 1
        else:
            nbNegativeSimilarity += 1
    if(x['cos6'] >= 0.6 and nbPositiveSimilarity > 0 and nbNegativeSimilarity > 0):
        return PARTIALMATCH
    elif(x['cos6'] >= 0.6 and nbPositiveSimilarity == 0): 
        return NOMATCH
    else:
        return MATCH
    
@labeling_function()
def bis(x):  
    return MATCH

In [41]:
X = data.iloc[:,:19]
y = data['Label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
lfs = [citySimilarity, match, partialOrNoMatch]
#lfs = [ruleClassif]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=X_train)
L_test = applier.apply(df=X_test)

  from pandas import Panel
100%|████████████████████████████████████████████████████████████████████████████| 2100/2100 [00:01<00:00, 1722.56it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 2149.84it/s]


In [43]:
L_train

array([[ 0, -1, -1],
       [-1,  2, -1],
       [ 0, -1, -1],
       ...,
       [ 0, -1, -1],
       [ 0, -1, -1],
       [ 0, -1, -1]])

In [44]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
citySimilarity,0,[0],0.270476,0.000476,0.000476
match,1,[2],0.378095,0.001429,0.001429
partialOrNoMatch,2,[1],0.352857,0.000952,0.000952


In [45]:
print(L_train)
print(L_train[0, 2])

[[ 0 -1 -1]
 [-1  2 -1]
 [ 0 -1 -1]
 ...
 [ 0 -1 -1]
 [ 0 -1 -1]
 [ 0 -1 -1]]
-1


In [46]:
from snorkel.labeling.model import LabelModel

#label_model = LabelModel(cardinality=3, verbose=True)
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                                       | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.089]
 12%|█████████▎                                                                    | 60/500 [00:00<00:15, 27.97epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.008]
 39%|██████████████████████████████▏                                              | 196/500 [00:00<00:04, 72.55epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.001]
 60%|█████████████████████████████████████████████▌                              | 300/500 [00:00<00:01, 128.62epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 75%|█████████████████████████████████████████████████████████                   | 375/500 [00:00<00:00, 170.97epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 481.48epoch/s]
INFO:root:Finished 

In [47]:
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=L_train.copy(), L=L_train
)


In [48]:
from snorkel.utils import probs_to_preds

preds_train_filtered = probs_to_preds(probs=probs_train_filtered)


In [49]:
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)


LogisticRegression(C=1000.0, solver='liblinear')

In [50]:
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=y_test) * 100:.1f}%")

Test Accuracy: 40.0%
