In [17]:
import json
import spacy
import nltk
import numpy as np
from tqdm import tqdm, tqdm_notebook
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import seaborn as sns
from xgboost.sklearn import XGBClassifier 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
import pickle
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import warnings
warnings.filterwarnings('ignore')

In [None]:
def TrainXGBoost(X_train,Y_train,X_test,Y_test,feature_name=None):
    '''
    Trains a XGBoost Classifier, scores it on the test split and saves the
    fitted model together with a one-row performance report.

    Input:

    X_train, Y_train: training features and labels
    X_test, Y_test: test features and labels
    feature_name: str, prefix used for the output file names. When omitted,
                  the module-level `feature_name` variable is used (the
                  behaviour of the original 4-argument form).

    Output:

    Pickles the model to ../../models/<feature_name>_xgboost_model.sav,
    writes ../../Results/<feature_name>_XGBoost_report.csv and returns the
    report DataFrame (columns Accuracy, Precision, Recall).

    Raises:

    ValueError if feature_name is neither passed nor defined at module level.
    '''
    if feature_name is None:
        # Backward compatibility: earlier callers relied on a global variable.
        feature_name = globals().get("feature_name")
        if feature_name is None:
            raise ValueError("feature_name must be passed or defined at module level")

    xclas = XGBClassifier()
    xclas.fit(X_train, np.array(Y_train))

    y_pred = xclas.predict(X_test)

    # Each metric is a scalar; wrap it in a list because pd.DataFrame rejects
    # a dict made only of scalars unless an index is supplied.
    df = pd.DataFrame({
        'Accuracy': [accuracy_score(Y_test, y_pred)],
        'Precision': [precision_score(Y_test, y_pred)],
        'Recall': [recall_score(Y_test, y_pred)],
    })

    #Save model file (context manager guarantees the handle is flushed and closed)
    filename = "../../models/"+feature_name+'_xgboost_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(xclas, model_file)

    #Save performance_report
    df.to_csv("../../Results/"+ feature_name+ "_XGBoost_report.csv")

    return df
    
    

def TrainLogisticRegression(X_train,Y_train,X_test,Y_test,feature_name=None):
    '''
    Trains a Logistic Regression Classifier, scores it on the test split and
    saves the fitted model together with a one-row performance report.

    Input:

    X_train, Y_train: training features and labels
    X_test, Y_test: test features and labels
    feature_name: str, prefix used for the output file names. When omitted,
                  the module-level `feature_name` variable is used (the
                  behaviour of the original 4-argument form).

    Output:

    Pickles the model to ../../models/<feature_name>_logistic_model.sav,
    writes ../../Results/<feature_name>_Logistic_report.csv and returns the
    report DataFrame (columns Accuracy, Precision, Recall).

    Raises:

    ValueError if feature_name is neither passed nor defined at module level.
    '''
    if feature_name is None:
        # Backward compatibility: earlier callers relied on a global variable.
        feature_name = globals().get("feature_name")
        if feature_name is None:
            raise ValueError("feature_name must be passed or defined at module level")

    lgr = LogisticRegression()
    lgr.fit(X_train, np.array(Y_train))

    y_pred = lgr.predict(X_test)

    # Each metric is a scalar; wrap it in a list because pd.DataFrame rejects
    # a dict made only of scalars unless an index is supplied.
    df = pd.DataFrame({
        'Accuracy': [accuracy_score(Y_test, y_pred)],
        'Precision': [precision_score(Y_test, y_pred)],
        'Recall': [recall_score(Y_test, y_pred)],
    })

    #Save model file (context manager guarantees the handle is flushed and closed)
    filename = "../../models/"+feature_name+'_logistic_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(lgr, model_file)

    #Save performance_report
    df.to_csv("../../Results/"+ feature_name+ "_Logistic_report.csv")

    return df



def TrainNaiveBayes(X_train,Y_train,X_test,Y_test,feature_name=None):
    '''
    Trains a Gaussian Naive Bayes Classifier, scores it on the test split and
    saves the fitted model together with a one-row performance report.

    Input:

    X_train, Y_train: training features and labels
    X_test, Y_test: test features and labels
    feature_name: str, prefix used for the output file names. When omitted,
                  the module-level `feature_name` variable is used (the
                  behaviour of the original 4-argument form).

    Output:

    Pickles the model to ../../models/<feature_name>_NaiveBayes_model.sav,
    writes ../../Results/<feature_name>_NaiveBayes_report.csv and returns the
    report DataFrame (columns Accuracy, Precision, Recall).

    Raises:

    ValueError if feature_name is neither passed nor defined at module level.
    '''
    if feature_name is None:
        # Backward compatibility: earlier callers relied on a global variable.
        feature_name = globals().get("feature_name")
        if feature_name is None:
            raise ValueError("feature_name must be passed or defined at module level")

    gnb = GaussianNB()
    gnb.fit(X_train, np.array(Y_train))

    y_pred = gnb.predict(X_test)

    # Each metric is a scalar; wrap it in a list because pd.DataFrame rejects
    # a dict made only of scalars unless an index is supplied.
    df = pd.DataFrame({
        'Accuracy': [accuracy_score(Y_test, y_pred)],
        'Precision': [precision_score(Y_test, y_pred)],
        'Recall': [recall_score(Y_test, y_pred)],
    })

    #Save model file (context manager guarantees the handle is flushed and closed)
    filename = "../../models/"+feature_name+'_NaiveBayes_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(gnb, model_file)

    #Save performance_report
    df.to_csv("../../Results/"+ feature_name+ "_NaiveBayes_report.csv")

    return df
    
    

In [None]:
def main(train,test,feature_name,XGBoost=False, NaiveBayes=False,LogisticRegression=False,doc2vec=False):
    '''
    Splits the inputs into features/labels and hands them to one trainer.

    Input:

    train: train features (with a 'label' column unless doc2vec is True)
    test: test features (with a 'label' column unless doc2vec is True)
    feature_name: name of the feature set, forwarded to the trainer
    XGBoost: bool, if True trains a XGBoost Classifier
    NaiveBayes: bool, if True trains a NaiveBayes Classifier
    LogisticRegression: bool, if True trains a LogisticRegression classifier
    doc2vec: bool, if True train/test are used as-is as the feature matrices

    Only the first enabled model (checked in the order XGBoost, NaiveBayes,
    LogisticRegression) is trained; with no flag set nothing is trained.

    Output:

    None. Saving the model and its report is left to the selected trainer.
    '''
    if doc2vec:
        X_train, X_test = train, test
        # NOTE(review): labels here are constant placeholders (1 for every
        # train row, 0 for every test row) -- confirm where the real labels
        # for the embeddings should come from.
        Y_train = [[1]] * len(X_train)
        Y_test = [[0]] * len(X_test)
    else:
        # Everything except the 'label' column is treated as a feature.
        train_feature_cols = train.columns != 'label'
        test_feature_cols = test.columns != 'label'
        X_train = train.loc[:, train_feature_cols]
        X_test = test.loc[:, test_feature_cols]
        Y_train = train['label']
        Y_test = test['label']

    trainer_args = (X_train, Y_train, X_test, Y_test, feature_name)

    if XGBoost:
        TrainXGBoost(*trainer_args)
        return

    if NaiveBayes:
        TrainNaiveBayes(*trainer_args)
        return

    if LogisticRegression:
        TrainLogisticRegression(*trainer_args)




In [None]:
if __name__ == "__main__":

    # Select the feature set by pointing both paths at the matching files.
    # Files are named '<train|test>_<features>' where features is one of
    # 'ner', 'lda' or 'Doc2vec_embeddings'.
    train_file = "../../data/feature/train_ner.csv"
    test_file = "../../data/feature/test_ner.csv"

    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)

    # Derive the feature-set name from the file name, e.g.
    # '.../train_ner.csv' -> 'ner': drop the directory, the extension and the
    # leading 'train_' prefix so the extension does not leak into the names
    # of the saved model/report files.
    feature_name = train_file.split("/")[-1].rsplit(".", 1)[0].split("_", 1)[-1]

    # For Doc2Vec, load the embeddings instead of the CSV files above and
    # pass doc2vec=True to main:
    #
    #     train = np.load("../../data/feature/train_Doc2vec_embeddings.npy")
    #     test = np.load("../../data/feature/test_Doc2vec_embeddings.npy")

    # Set the flag of the model to train to True. main trains only the first
    # enabled model, checked in the order XGBoost, NaiveBayes, LogisticRegression.
    main(train,test,feature_name,XGBoost=False, NaiveBayes=False,LogisticRegression=False,doc2vec=False)
    
    
    