In [1]:
import os
import warnings
import random
import pandas as pd
from itertools import combinations
import sys
import networkx as nx
import spacy
import traceback
import pickle
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.ensemble import AdaBoostClassifier

# Silence library warnings globally so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Show at most 10 rows, but never truncate columns, line width or cell text.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
try:
    # None means "unlimited". The legacy -1 sentinel was deprecated in
    # pandas 1.0 and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here.
    pd.set_option('display.max_colwidth', -1)

In [2]:
def logistic_regression(x_train, x_test, y_train, y_test, mod_type):
    """Fit a logistic-regression classifier, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (coefficients, accuracy, precision, recall, F-score and the full
    classification report) is written into the ``Metrics/`` directory.
    Precision, recall and F-score are computed with ``average="binary"`` and
    ``pos_label=1``.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.

    Args:
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    print("--------Logistic Regression-----------")
    lr = LogisticRegression(n_jobs=10)
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"_lr_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(lr, model_file)

    # Compute every metric before opening the report so that a failing metric
    # does not leave a truncated file behind.
    report_lines = [
        'Logistic Regression - ' + mod_type,
        "Coefficients: " + str(lr.coef_),
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average="binary", pos_label=1)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average="binary", pos_label=1)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average="binary", pos_label=1)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_logistic_regression_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred

In [3]:
def logistic_regression_multiple_classes(x_train, x_test, y_train, y_test, mod_type):
    """Fit a multinomial logistic regression, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (coefficients, accuracy, precision, recall, F-score and the full
    classification report) is written into the ``Metrics/`` directory.

    Precision, recall and F-score are macro-averaged over the classes.
    ``average="binary"`` is rejected by scikit-learn for multiclass targets,
    so the previous setting raised ``ValueError`` part-way through the report.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.
        The output paths are the same ones ``logistic_regression`` uses, so
        calling both with the same ``mod_type`` overwrites the earlier files.

    Args:
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    print("--------Logistic Regression-----------")
    # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    # argument - confirm against the installed version.
    lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"_lr_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(lr, model_file)

    # Macro averaging weights every class equally; these values correspond to
    # the "macro avg" row of the classification report appended below.
    averaging = "macro"
    report_lines = [
        'Logistic Regression - ' + mod_type,
        "Coefficients: " + str(lr.coef_),
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average=averaging)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average=averaging)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average=averaging)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_logistic_regression_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred

In [4]:
def random_forest(x_train, x_test, y_train, y_test, mod_type):
    """Fit a random-forest classifier, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (feature importances, accuracy, precision, recall, F-score and the
    full classification report) is written into the ``Metrics/`` directory.
    Precision, recall and F-score are computed with ``average="binary"`` and
    ``pos_label=1``.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.

    Args:
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    print("--------Random Forest-----------")
    clf = RandomForestClassifier(n_estimators=10, class_weight='balanced', n_jobs=10, random_state=42)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"rf_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(clf, model_file)

    # Compute every metric before opening the report so that a failing metric
    # does not leave a truncated file behind.
    report_lines = [
        'Random Forest - ' + mod_type,
        # These are the forest's feature importances, not coefficients, so the
        # line is labelled accordingly.
        "Feature importances: " + str(clf.feature_importances_),
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average="binary", pos_label=1)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average="binary", pos_label=1)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average="binary", pos_label=1)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_random_forest_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred


In [5]:
def svm(kernel, x_train, x_test, y_train, y_test, mod_type):
    """Fit a support-vector classifier, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (accuracy, precision, recall, F-score and the full classification
    report) is written into the ``Metrics/`` directory. Precision, recall and
    F-score are computed with ``average="binary"`` and ``pos_label=1``.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.

    Args:
        kernel: Kernel name forwarded to ``SVC``; it is also concatenated into
            the banner, the filenames and the report header, so it must be a
            string.
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    print("--------Support Vector Classifier "+kernel+"-----------")
    # Named `clf` rather than `svm` so the local does not shadow this function.
    clf = SVC(kernel=kernel, C=0.1, class_weight='balanced')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"svm_"+kernel+"_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(clf, model_file)

    # Compute every metric before opening the report so that a failing metric
    # does not leave a truncated file behind.
    report_lines = [
        'SVM '+kernel+' - ' + mod_type,
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average="binary", pos_label=1)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average="binary", pos_label=1)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average="binary", pos_label=1)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_svm_'+kernel+'_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred

In [6]:
def xgboost(x_train, x_test, y_train, y_test, mod_type):
    """Fit a binary XGBoost classifier, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (accuracy, precision, recall, F-score and the full classification
    report) is written into the ``Metrics/`` directory. Precision, recall and
    F-score are computed with ``average="binary"`` and ``pos_label=1``.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.

    Args:
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    xgb_model.fit(x_train, y_train)
    y_pred = xgb_model.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"xgboost_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(xgb_model, model_file)

    # Compute every metric before opening the report so that a failing metric
    # does not leave a truncated file behind.
    report_lines = [
        'XGBoost - ' + mod_type,
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average="binary", pos_label=1)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average="binary", pos_label=1)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average="binary", pos_label=1)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_xgboost_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred


In [7]:
def xgboost_multiple_class(x_train, x_test, y_train, y_test, mod_type):
    """Fit a multi-class XGBoost classifier, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (accuracy, precision, recall, F-score and the full classification
    report) is written into the ``Metrics/`` directory.

    Precision, recall and F-score are macro-averaged over the classes.
    ``average="binary"`` is rejected by scikit-learn for multiclass targets,
    so the previous setting raised ``ValueError`` part-way through the report.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.
        The output paths are the same ones ``xgboost`` uses, so calling both
        with the same ``mod_type`` overwrites the earlier files.

    Args:
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    xgb_model = xgb.XGBClassifier(objective="multi:softmax", random_state=42)
    xgb_model.fit(x_train, y_train)
    y_pred = xgb_model.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"xgboost_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(xgb_model, model_file)

    # Macro averaging weights every class equally; these values correspond to
    # the "macro avg" row of the classification report appended below.
    averaging = "macro"
    report_lines = [
        'XGBoost - ' + mod_type,
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average=averaging)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average=averaging)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average=averaging)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_xgboost_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred


In [8]:
def adaboost(x_train, x_test, y_train, y_test, mod_type):
    """Fit an AdaBoost classifier, pickle it and write a metrics report.

    The fitted model is pickled into the ``Models/`` directory and a plain-text
    report (accuracy, precision, recall, F-score and the full classification
    report) is written into the ``Metrics/`` directory. Precision, recall and
    F-score are computed with ``average="binary"`` and ``pos_label=1``.

    Note:
        The model filename uses the module-level name ``dataset``, which is
        not a parameter; it must be defined before this function is called.

    Args:
        x_train: Training features passed to ``fit``.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: True labels for ``x_test``, used to compute the metrics.
        mod_type: String prefix used in the output filenames and report header.

    Returns:
        The fitted model's predictions for ``x_test``.
    """
    ada = AdaBoostClassifier(n_estimators=100, random_state=42)
    ada.fit(x_train, y_train)
    y_pred = ada.predict(x_test)

    # Context manager guarantees the pickle file is flushed and closed instead
    # of leaving the handle to the garbage collector.
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Models/"+mod_type+"adaboost_model_"+dataset+".sav", "wb") as model_file:
        pickle.dump(ada, model_file)

    # Compute every metric before opening the report so that a failing metric
    # does not leave a truncated file behind.
    report_lines = [
        'AdaBoost - ' + mod_type,
        "Accuracy:" + str(metrics.accuracy_score(y_test, y_pred)),
        "Precision: " + str(metrics.precision_score(y_test, y_pred, average="binary", pos_label=1)),
        "Recall: " + str(metrics.recall_score(y_test, y_pred, average="binary", pos_label=1)),
        "F-Score: " + str(metrics.f1_score(y_test, y_pred, average="binary", pos_label=1)),
        str(classification_report(y_test, y_pred)),
    ]
    with open("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Metrics/"+mod_type+'_adaboost_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines))
    return y_pred