# Preparing data

In [None]:
import pandas as pd

In [None]:
# Load the pre-computed feature tables. Both paths are placeholders that must
# be replaced with real CSV files before running.
data = pd.read_csv("your-train-feature-file") # created by Build_features_extraction.ipynb
data_dev = pd.read_csv("your-dev-feature-file")  # presumably produced by the same notebook - confirm

In [None]:
# Report how many rows were loaded for each split.
print(f'train size: {len(data)} dev size: {len(data_dev)}')

In [None]:
# Convert the gold label to entailment (1) vs. non-entailment (0).
def convert_label(gold_label):
    """Map a gold label onto a binary entailment target.

    Returns 0 for 'contradiction' and 'neutral'. Every other value,
    including 'entailment' but also any unexpected or missing label,
    maps to 1.
    """
    non_entailment = ('contradiction', 'neutral')
    return 0 if gold_label in non_entailment else 1

In [None]:
# Train split: derive the binary target, then discard the raw label and the
# 'Unnamed: 0' column.
data['label'] = data['gold_label'].apply(convert_label)
data.drop(columns=['gold_label', 'Unnamed: 0'], inplace=True)

# Dev split: identical treatment.
data_dev['label'] = data_dev['gold_label'].apply(convert_label)
data_dev.drop(columns=['gold_label', 'Unnamed: 0'], inplace=True)

In [None]:
# Turn a categorical heuristic feature into a 0/1 integer flag.
def convert_feature(feature):
    """Return 0 for the "-" placeholder and 1 for any other value.

    Per the original note, the other values are expected to be "hard" or
    "easy"; both collapse to 1.
    """
    return 0 if feature == "-" else 1

In [None]:
# Raw categorical column -> short name of its binarised counterpart.
# Insertion order fixes the order in which the new columns are appended.
_BINARY_FEATURES = {
    'word_swapping': 'swapping',
    'negation': 'neg',
    'subsequence': 'sub',
    'constituent': 'cons',
    'antonym': 'ant',
}

# Apply the same conversion to the train split first, then to the dev split.
for _frame in (data, data_dev):
    for _raw, _short in _BINARY_FEATURES.items():
        _frame[_short] = _frame[_raw].apply(convert_feature)

In [None]:
# The raw categorical columns are no longer needed once their binarised
# versions exist; remove them from both splits.
_RAW_FEATURE_COLUMNS = ['word_swapping', 'negation', 'subsequence', 'constituent', 'antonym']

data.drop(columns=_RAW_FEATURE_COLUMNS, inplace=True)
data_dev.drop(columns=_RAW_FEATURE_COLUMNS, inplace=True)

In [None]:
# Boolean row filter over the train labels (displayed as the cell output).
# NOTE(review): 'label' is filled by convert_label, which only returns 0 or 1,
# so the comparison with '' looks like it is always True and filters nothing -
# confirm the intended condition.
mask = data['label'] != ''
mask

In [None]:
# Boolean row filter over the dev labels (displayed as the cell output).
# NOTE(review): 'label' is filled by convert_label, which only returns 0 or 1,
# so the comparison with '' looks like it is always True and filters nothing -
# confirm the intended condition.
mask_dev = data_dev['label'] != ''
mask_dev

In [None]:
# Show how many rows each mask keeps (True) or rejects (False).
mask.value_counts(), mask_dev.value_counts()

In [None]:
# Keep only the train rows selected by the train mask.
_data = data[mask]
# Full feature set (currently disabled):
# _columns_to_keep = ['label', 'neg', 'ant', 'sub', 'cons', 'swapping', 'overlapping score', 'hypo_len'] 
# Active selection: the target plus the overlap score only.
_columns_to_keep = ['label', 'overlapping score'] 

In [None]:
# Keep only the dev rows selected by the dev mask. The train mask is indexed
# by the train rows, so it must not be used to filter the dev frame.
_data_dev = data_dev[mask_dev]
# Full feature set (currently disabled):
# _columns_to_keep_dev = ['label', 'neg', 'ant', 'sub', 'cons', 'swapping', 'overlapping score', 'hypo_len']
# Active selection: the target plus the overlap score only.
_columns_to_keep_dev = ['label', 'overlapping score']

In [None]:
# Columns copied into the train score table (X_train_score); only the target.
columns_to_keep_X_train = ['label']

In [None]:
# Number of retained columns, target included.
print(len(_columns_to_keep))

In [None]:
# Project both splits onto their retained columns.
_data = _data[_columns_to_keep]
_data_dev = _data_dev[_columns_to_keep_dev]

In [None]:
# Score table for the train split. New columns are added to it later, so take
# an explicit copy: this keeps it independent of _data and avoids pandas'
# chained-assignment (SettingWithCopy) warning on those later writes.
X_train_score = _data[columns_to_keep_X_train].copy()

In [None]:
# Preview the first rows of the train score table.
X_train_score.head()

In [None]:
# Score table for the dev split: the retained columns plus the pair identifier.
# The pairID Series is matched to the rows by index label.
X_test_score = _data_dev[_columns_to_keep_dev].assign(pairID=data_dev.pairID)

In [None]:
# Preview the first rows of the dev score table.
X_test_score.head()

In [None]:
# Persist the reduced tables; the bias-model section reloads these files.
# The index is written too (to_csv default), so it comes back as an extra
# unnamed column when the files are read again.
_data.to_csv("train_data_edited.csv")
_data_dev.to_csv("dev_data_edited.csv")

# Bias model

In [None]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import random
import matplotlib.pyplot as plt
from scipy import optimize
import time
import seaborn as sns

import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid

import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [None]:
def define_clfs_params(grid_size):
    """Return the default classifiers and the parameter grid named by *grid_size*.

    Args:
        grid_size: Name of the grid to return. One of
            'test'  - a single setting per model, for checking the code runs;
            'small' - a small sweep;
            'large' - a larger sweep with many more parameter combinations;
            'gam'   - LR pinned to one setting, with no RF/ET/KNN entries.

    Returns:
        A ``(clfs, grid)`` tuple. ``clfs`` maps a short model name to an
        unfitted estimator; ``grid`` maps model names to a dict of parameter
        value lists. For an unrecognised *grid_size* the legacy sentinel
        ``(0, 0)`` is returned.
    """

    clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3),
    }

    large_grid = {
        'RF': {'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,5,10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'class_weight': ['balanced', None]},
        'SGD': {'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'ET': {'n_estimators': [1,10,100,1000,10000], 'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,5,10], 'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate': [0.001,0.01,0.05,0.1,0.5], 'subsample': [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'min_samples_split': [2,5,10]},
        'SVM': {'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1,5,10,25,50,100], 'weights': ['uniform','distance'], 'algorithm': ['auto','ball_tree','kd_tree']},
    }

    small_grid = {
        'RF': {'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l2'], 'C': [0.00001,0.001,0.1,1,10], 'class_weight': ['balanced', None]},
        'SGD': {'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'ET': {'n_estimators': [10,100], 'criterion': ['gini', 'entropy'], 'max_depth': [5,50], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,10], 'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [10,100], 'learning_rate': [0.001,0.1,0.5], 'subsample': [0.1,0.5,1.0], 'max_depth': [5,50]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'min_samples_split': [2,5,10]},
        'SVM': {'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1,5,10,25,50,100], 'weights': ['uniform','distance'], 'algorithm': ['auto','ball_tree','kd_tree']},
    }

    gam_grid = {
        'LR': {'penalty': ['l2'], 'C': [0.01], 'class_weight': ['balanced']},
        'SGD': {'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate': [0.001,0.01,0.05,0.1,0.5], 'subsample': [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'min_samples_split': [2,5,10]},
        'SVM': {'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'kernel': ['linear']},
    }

    test_grid = {
        'RF': {'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
        'LR': {'penalty': ['l2'], 'C': [0.01], 'class_weight': ['balanced', None]},
        'SGD': {'loss': ['perceptron'], 'penalty': ['l1', 'elasticnet']},
        'ET': {'n_estimators': [1], 'criterion': ['gini'], 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
        'AB': {'algorithm': ['SAMME'], 'n_estimators': [1]},
        'GB': {'n_estimators': [1], 'learning_rate': [0.1], 'subsample': [0.5], 'max_depth': [1]},
        'NB': {},
        'DT': {'criterion': ['gini'], 'max_depth': [1], 'min_samples_split': [10]},
        'SVM': {'C': [0.01], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [5], 'weights': ['uniform'], 'algorithm': ['auto']},
    }

    if grid_size == 'gam':
        return clfs, gam_grid
    elif grid_size == 'small':
        return clfs, small_grid
    elif grid_size == 'large':
        # Previously unreachable: 'large' fell through to the (0, 0) sentinel
        # even though the grid is defined and documented.
        return clfs, large_grid
    elif grid_size == 'test':
        return clfs, test_grid
    else:
        # Legacy behaviour kept for existing callers: unknown names yield (0, 0).
        return 0, 0

In [None]:
# Model inputs: every retained column except the target.
feat = [column for column in _columns_to_keep if column != 'label']
len(feat)

In [None]:
# Reload the reduced tables written by the data-preparation section.
df = pd.read_csv("train_data_edited.csv")
df_dev = pd.read_csv("dev_data_edited.csv")

# Feature columns fed to the bias model.
features = feat

# Split each table into its feature matrix and its label vector.
X_train, y_train = df[features], df['label']
X_test, y_test = df_dev[features], df_dev['label']

In [None]:
# Sanity-check the shapes of the feature matrices and label vectors.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Set to 1 to also draw a precision/recall plot for every fitted model.
NOTEBOOK = 0

def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):
    """Fit every requested model over its parameter grid and collect metrics.

    Args:
        models_to_run: Sequence of model names; each must be a key of both
            ``clfs`` and ``grid``.
        clfs: Mapping of model name to estimator instance.
        grid: Mapping of model name to a parameter grid for ``ParameterGrid``.
        X_train, y_train: Data used to fit each model.
        X_test, y_test: Data used to evaluate each model.

    Returns:
        A DataFrame with one row per (model, parameter setting) holding the
        model name, the estimator, the parameters, ROC-AUC, accuracy, the
        classification report (dict) and the confusion matrix (dict of lists).
        Settings that raise ``IndexError`` are reported and skipped.
    """
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters', 'auc-roc', 'acc', 'classification report', 'confusion matrix'))

    # Resolve every estimator up front so an unknown model name fails before
    # any fitting starts.
    selected = [(name, clfs[name]) for name in models_to_run]

    for model_name, clf in selected:
        print(model_name)
        for p in ParameterGrid(grid[model_name]):
            try:
                clf.set_params(**p)
                # Fit once and derive both hard labels and probabilities from
                # the same fitted model, so the metrics describe one model
                # even when the estimator is stochastic.
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                # Column 1 holds the score for the second entry of clf.classes_.
                y_score = clf.predict_proba(X_test)[:, 1]
                confusion = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]), index=[0, 1], columns=[0, 1])

                # Only the metrics are stored for now.
                # NOTE(review): `clf` is the same instance for every parameter
                # setting of a model, so the stored object reflects the last
                # setting applied - confirm whether a per-row copy is wanted.
                results_df.loc[len(results_df)] = [model_name, clf, p,
                                                   roc_auc_score(y_test, y_score),
                                                   sklearn.metrics.accuracy_score(y_test, y_pred),
                                                   classification_report(y_test, y_pred, output_dict=True),
                                                   confusion.to_dict(orient="list")]

                if NOTEBOOK == 1:
                    # plot_precision_recall_n is not defined in the visible
                    # part of this file; it must exist before enabling NOTEBOOK.
                    plot_precision_recall_n(y_test, y_pred, clf)
            except IndexError as e:
                print('Error:', e)
                continue

    return results_df

In [None]:
def main():
    """Run the configured model sweep, print the results and save them to results.csv."""

    # Name of the parameter grid to use; the accepted names are the ones
    # handled by define_clfs_params.
    grid_size = 'gam'
    clfs, grid = define_clfs_params(grid_size)

    # Models to evaluate; each name must be present in the chosen grid.
    models_to_run = ['LR']

    # Sweep the grid on the module-level train/test split and show the table.
    results_df = clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test)
    print(results_df)

    # Save to csv.
    results_df.to_csv('results.csv', index=False)


if __name__ == '__main__':
    main()

In [None]:
# Fit the bias model. The hyper-parameters match the LR entry of the 'gam'
# grid (penalty='l2', C=0.01, class_weight='balanced').
logmodel = LogisticRegression(penalty='l2', C=0.01, class_weight='balanced')
# `t` holds whatever fit() returns and is used for all predictions below.
t = logmodel.fit(X_train, y_train)

### Get predictions and probabilities for the training samples

In [None]:
# Predicted class and the second probability column for every train row.
predictions = t.predict(X_train)
predict_prob = t.predict_proba(X_train)[:,1]
# Stored next to the gold label in the train score table.
# assumes X_train rows are in the same order as X_train_score rows - TODO confirm
X_train_score['prediction'] = predictions
X_train_score['prob_score'] = predict_prob

In [None]:
# Display the train score table with the added prediction columns.
X_train_score

### Get predictions and probabilities for the test (dev) samples

In [None]:
# Predicted class and the second probability column for every test row.
predictions = t.predict(X_test)
predict_prob = t.predict_proba(X_test)[:,1]
# Stored in the test score table.
# assumes X_test rows are in the same order as X_test_score rows - TODO confirm
X_test_score['prediction'] = predictions
X_test_score['prob_score'] = predict_prob

In [None]:
# Display the test score table with the added prediction columns.
X_test_score