# Preparing data

In [1]:
import pandas as pd

In [2]:
# Load the overlap-feature tables: the QQP train/val/dev files plus the PAWS file.
data, data_val, data_dev, data_paws = (
    pd.read_csv(csv_name)
    for csv_name in (
        "qqp_train_overlapping_features.csv",
        "qqp_val_overlapping_features.csv",
        "qqp_dev_overlapping_features.csv",
        "qqp_paws_dev_and_test_overlapping_features.csv",
    )
)

# Preview the first rows of the training table.
data.head()

Unnamed: 0.1,Unnamed: 0,is_duplicate,sentence1,sentence2,id,pair_label,lexical_overlap,word_swapping,hypo_len,overlapping score
0,0,1,"What are the best books for IBPS PO, SBI SO, S...",What are the best books for Bank P.O./IBPS pre...,296291,"('What are the best books for IBPS PO, SBI SO,...",0,Not Swap,0.033898,0.666667
1,1,1,What are some of the results of The Congress o...,What were the results of the Congress of Vienna?,206762,('What are some of the results of The Congress...,1,Not Swap,0.033898,0.888889
2,2,1,Is our PM Modi doing the correct thing with 50...,What do you think about Modi's new policy on t...,397289,('Is our PM Modi doing the correct thing with ...,0,Swap,0.072034,0.388889
3,3,0,How can I use induction motor as generator?,How can we operate induction motor as a induct...,348835,('How can I use induction motor as generator?'...,0,Not Swap,0.038136,0.7
4,4,0,What are the differences between CRF and N-gra...,What is the difference between batch-mode and ...,137366,('What are the differences between CRF and N-g...,0,Not Swap,0.042373,0.545455


# Train model

In [7]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import random
import matplotlib.pyplot as plt
from scipy import optimize
import time
import seaborn as sns

import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [8]:
def define_clfs_params(grid_size):
    """Build the default classifiers and the parameter grid for a sweep.

    Four grid presets are available:
      * ``'test'``  -- a single combination per model, for smoke-testing code
      * ``'small'`` -- a small sweep
      * ``'large'`` -- a much larger sweep with many more combinations
      * ``'gam'``   -- a reduced set of models with a fixed LR configuration

    Args:
        grid_size: name of the preset to return.

    Returns:
        ``(clfs, grid)`` where ``clfs`` maps a model key to an estimator
        instance and ``grid`` maps a model key to a dict of parameter lists.
        For an unrecognised ``grid_size`` the pair ``(0, 0)`` is returned
        (kept for backward compatibility with existing callers).
    """

    # NOTE: the LR default uses penalty='l1'; every grid below overrides it
    # with penalty='l2' before the estimator is fitted.
    clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3)
            }

    large_grid = {'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'class_weight': ['balanced', None]},
        'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
        'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB' : {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
        'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }

    small_grid = {'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'LR': {'penalty': ['l2'], 'C': [0.00001,0.001,0.1,1,10], 'class_weight': ['balanced', None]},
    'SGD': {'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': {'n_estimators': [10,100], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }

    # Reduced preset: no RF / ET / KNN entries, and LR pinned to one setting.
    gam_grid = {
        'LR': {'penalty': ['l2'], 'C': [0.01], 'class_weight': ['balanced']},
        'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB' : {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
           }

    test_grid = {'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'LR': {'penalty': ['l2'], 'C': [0.01], 'class_weight': ['balanced', None]},
    'SGD': {'loss': ['perceptron'], 'penalty': ['l1', 'elasticnet']},
    'ET': {'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'AB': {'algorithm': ['SAMME'], 'n_estimators': [1]},
    'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]},
    'NB' : {},
    'DT': {'criterion': ['gini'], 'max_depth': [1],'min_samples_split': [10]},
    'SVM' :{'C' :[0.01],'kernel':['linear']},
    'KNN' :{'n_neighbors': [5], 'weights': ['uniform'], 'algorithm': ['auto']}
           }

    if (grid_size == 'gam'):
        return clfs, gam_grid
    elif (grid_size == 'small'):
        return clfs, small_grid
    elif (grid_size == 'test'):
        return clfs, test_grid
    elif (grid_size == 'large'):
        # Previously the documented 'large' preset was built but never returned.
        return clfs, large_grid
    else:
        # Unknown preset: keep the historical sentinel return value.
        return 0, 0

In [9]:
NOTEBOOK = 0

def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):
    """Fit each requested model on each parameter combination and score it.

    Args:
        models_to_run: keys into ``clfs`` and ``grid`` naming the models to run.
        clfs: mapping from model key to an estimator instance. Each instance
            is reconfigured in place via ``set_params`` for every combination.
        grid: mapping from model key to a parameter grid for ``ParameterGrid``.
        X_train, y_train: data each estimator is fitted on.
        X_test, y_test: data each fitted estimator is evaluated on.

    Returns:
        A DataFrame with one row per (model, parameter combination) and the
        columns ``model_type``, ``clf``, ``parameters``, ``auc-roc``, ``acc``,
        ``classification report`` and ``confusion matrix``.

    Notes:
        * The ``clf`` column holds a reference to the shared estimator object,
          so rows of the same model all point at the same instance.
        * Only ``IndexError`` is caught and reported; any other exception
          propagates to the caller.
    """
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters', 'auc-roc', 'acc', 'classification report', 'confusion matrix'))

    # Resolve all estimators up front so an unknown model key fails before
    # any fitting starts.
    selected = [(model_name, clfs[model_name]) for model_name in models_to_run]

    for model_name, clf in selected:
        print(model_name)
        for p in ParameterGrid(grid[model_name]):
            try:
                clf.set_params(**p)
                # Fit once and reuse the fitted model for both the hard
                # predictions and the ranking scores, so the two are
                # guaranteed to come from the same model.
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                if hasattr(clf, 'predict_proba'):
                    # Column 1 is taken as the positive-class score.
                    roc_scores = clf.predict_proba(X_test)[:, 1]
                else:
                    # Estimators without probability output (e.g. hinge-loss
                    # SGD) still expose a usable ranking score.
                    roc_scores = clf.decision_function(X_test)

                # NOTE(review): labels are fixed to [0, 1, 2]; a class that
                # never occurs simply yields an all-zero row/column.
                confusion = pd.DataFrame(confusion_matrix(y_test, y_pred, labels = [0, 1, 2]), index = [0, 1, 2], columns = [0, 1, 2])

                # Only the metrics are stored; the model itself, feature
                # importances and raw scores are not persisted.
                results_df.loc[len(results_df)] = [model_name, clf, p,
                                                   roc_auc_score(y_test, roc_scores),
                                                   sklearn.metrics.accuracy_score(y_test, y_pred),
                                                   classification_report(y_test, y_pred, output_dict=True),
                                                   confusion.to_dict(orient="list")]

                if NOTEBOOK == 1:
                    # NOTE(review): plot_precision_recall_n is not defined in
                    # the visible part of this notebook, and it receives the
                    # hard predictions as before -- confirm what it expects.
                    plot_precision_recall_n(y_test, y_pred, clf)
            except IndexError as e:
                print('Error:', e)
                continue

    return results_df

In [33]:
# Feature matrices: every column from a fixed position to the end of each table.
# NOTE(review): the start offset differs between files (9 vs 11) -- presumably
# the CSVs have different column layouts; verify against the actual headers.
X_train, X_paws = data.iloc[:, 9:].values, data_paws.iloc[:, 9:].values
X_val, X_dev = data_val.iloc[:, 11:].values, data_dev.iloc[:, 11:].values

# Label vectors: one positional column per table.
# NOTE(review): the label position also differs per file (1, 6, 6, 4) -- confirm
# each index really points at the label column.
y_train, y_paws = data.iloc[:, 1].values, data_paws.iloc[:, 4].values
y_val, y_dev = data_val.iloc[:, 6].values, data_dev.iloc[:, 6].values

In [34]:
def main():
    """Run the classifier sweep against the dev split and print the results.

    Uses the module-level ``X_train``/``y_train`` for fitting and
    ``X_dev``/``y_dev`` for evaluation.

    Returns:
        The results DataFrame produced by ``clf_loop``, so a caller can
        inspect or display it (previously a bare ``results_df`` expression
        inside the function had no effect).
    """
    # grid preset name understood by define_clfs_params (e.g. 'gam', 'small', 'test')
    grid_size = 'gam'
    clfs, grid = define_clfs_params(grid_size)

    # define models to run
    models_to_run = ['LR']

    # call clf_loop and store results in results_df
    results_df = clf_loop(models_to_run, clfs, grid, X_train, X_dev, y_train, y_dev)
    print(results_df)
    return results_df


if __name__ == '__main__':
    main()

LR
  model_type                                                clf  \
0         LR  LogisticRegression(C=0.01, class_weight='balan...   

                                          parameters   auc-roc     acc  \
0  {'C': 0.01, 'class_weight': 'balanced', 'penal...  0.739948  0.6794   

                               classification report  \
0  {'0': {'precision': 0.7965583173996176, 'recal...   

                                    confusion matrix  
0  {0: [2083, 532, 0], 1: [1071, 1314, 0], 2: [0,...  


In [35]:
# Fit a logistic regression on the training features; the settings match the
# LR entry of the 'gam' grid (penalty='l2', C=0.01, class_weight='balanced').
logmodel = LogisticRegression(
    C=0.01,
    penalty='l2',
    class_weight='balanced',
)
lr_model = logmodel.fit(X_train, y_train)

In [36]:
# Class-probability rows (as plain Python lists) for every split.
pred_train, pred_val, pred_dev, pred_paws = (
    lr_model.predict_proba(features).tolist()
    for features in (X_train, X_val, X_dev, X_paws)
)

In [37]:
def _with_bias_probs(frame, probs):
    """Copy the label and sentence columns of ``frame`` and attach ``probs``.

    The result has the columns ``is_duplicate``, ``sentence1``, ``sentence2``
    and ``bias_probs`` (one entry of ``probs`` per row).
    """
    out = pd.DataFrame()
    for column in ("is_duplicate", "sentence1", "sentence2"):
        out[column] = frame[column]
    out["bias_probs"] = probs
    return out


new_train_df = _with_bias_probs(data, pred_train)
new_val_df = _with_bias_probs(data_val, pred_val)
new_dev_df = _with_bias_probs(data_dev, pred_dev)
new_paws_df = _with_bias_probs(data_paws, pred_paws)

# Sanity check: row/column counts of the four new frames.
new_train_df.shape, new_val_df.shape, new_dev_df.shape, new_paws_df.shape

((399287, 4), (5000, 4), (5000, 4), (677, 4))

In [38]:
# Replace null sentence2 values in the training frame with the placeholder n\/a.
# A raw string keeps the backslash exactly as before while avoiding the
# invalid "\/" escape-sequence warning raised by newer Python versions.
# NOTE(review): if plain "n/a" was intended, drop the backslash -- confirm
# against whatever consumes these files.
new_train_df.sentence2 = new_train_df.sentence2.fillna(r'n\/a')
new_train_df.head()

Unnamed: 0,is_duplicate,sentence1,sentence2,bias_probs
0,1,"What are the best books for IBPS PO, SBI SO, S...",What are the best books for Bank P.O./IBPS pre...,"[0.3567053121189019, 0.6432946878810981]"
1,1,What are some of the results of The Congress o...,What were the results of the Congress of Vienna?,"[0.20908534776288057, 0.7909146522371194]"
2,1,Is our PM Modi doing the correct thing with 50...,What do you think about Modi's new policy on t...,"[0.5832793916113902, 0.4167206083886098]"
3,0,How can I use induction motor as generator?,How can we operate induction motor as a induct...,"[0.33163352365953647, 0.6683664763404635]"
4,0,What are the differences between CRF and N-gra...,What is the difference between batch-mode and ...,"[0.45372159187823935, 0.5462784081217607]"


In [39]:
import json

# Serialise each split as JSON Lines (one record per line).
train_json, val_json, dev_json, paws_json = (
    split_df.to_json(orient='records', lines=True)
    for split_df in (new_train_df, new_val_df, new_dev_df, new_paws_df)
)

# Write every payload to its own file, in the same order as before.
for payload, out_path in (
    (train_json, 'data/paraphrase_identification/train_prob_qqp_lr.jsonl'),
    (val_json, 'data/paraphrase_identification/val_prob_qqp_lr.jsonl'),
    (dev_json, 'data/paraphrase_identification/dev_prob_qqp_lr.jsonl'),
    (paws_json, 'data/paraphrase_identification/paws_dev_and_test_prob_qqp_lr.jsonl'),
):
    with open(out_path, 'w') as json_file:
        json_file.write(payload)

# Dataset maker reweight

In [40]:
def make_sample_weight(probs, label):
    """Return the inverse of the probability stored at ``probs[label]``.

    Args:
        probs: indexable collection of probabilities.
        label: index (or key) selecting the entry of ``probs`` to invert.

    Returns:
        ``1 / probs[label]``.
    """
    return 1 / probs[label]

def make_bias_prob(probs, label):
    """Return the entry of ``probs`` selected by ``label``.

    Args:
        probs: indexable collection of probabilities.
        label: index (or key) of the entry to pick.

    Returns:
        ``probs[label]`` unchanged.
    """
    selected = probs[label]
    return selected

In [50]:
# Add the reweighting columns to the training and PAWS frames, each computed
# from that frame's own rows:
#   sample_weight = 1 / probability the model gave to the row's label
#   bias_prob     = probability the model gave to the row's label
# Previously 'bias_prob' of the training frame was taken from the PAWS frame
# (rows aligned by index across two unrelated tables), and the PAWS frame that
# is displayed and exported next never received either column on a fresh run.
for frame in (new_train_df, new_paws_df):
    prob_label_pairs = list(zip(frame['bias_probs'], frame['is_duplicate']))
    frame['sample_weight'] = [make_sample_weight(probs, label) for probs, label in prob_label_pairs]
    frame['bias_prob'] = [make_bias_prob(probs, label) for probs, label in prob_label_pairs]


In [51]:
# Show the first rows of the PAWS frame.
new_paws_df.head()

Unnamed: 0,is_duplicate,sentence1,sentence2,bias_probs,sample_weight,bias_prob
0,0,What were the major effects of the cambodia ea...,What were the major effects of the Iquique ear...,"[0.1543575681999978, 0.8456424318000022]",6.478464,0.154358
1,0,The guy I 'm dating never texts me and I feel ...,The guy I 'm dating never wants me and I feel ...,"[0.16584672552959534, 0.8341532744704047]",6.029664,0.165847
2,0,How do I make my new phone number as group adm...,How do I make my old phone number as group adm...,"[0.1543575681999978, 0.8456424318000022]",6.478464,0.154358
3,0,Why ca n't countries afford high quality produ...,Why ca n't countries afford China 's high qual...,"[0.18940105962073162, 0.8105989403792684]",5.279802,0.189401
4,0,Do Mexican women like East Asian men ( Korean ...,Do East Asian women like Mexican men ( Korean ...,"[0.1543575681999978, 0.8456424318000022]",6.478464,0.154358


In [52]:
# Export the PAWS frame as JSON Lines (one record per line).
paws_json = new_paws_df.to_json(lines=True, orient='records')
with open(
    'data/paraphrase_identification/paws_dev_and_test_overlap_only_bias_weighted.jsonl',
    'w',
) as json_file:
    json_file.write(paws_json)