##### This notebook contains implementations of SVM, RF and LR using the doc2vec and sent2vec embeddings!

In [None]:
import sys
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import os
from tqdm import trange
import tqdm
import time
import warnings

warnings.filterwarnings('ignore')
import pandas as pd

### Loading dataset

In [None]:
# Absolute, machine-specific location of the dataset CSV; adjust when running elsewhere.
path_to_dataset = r"F:\LJPE-Dataset\LJPE Dataset2.0.csv"
data = pd.read_csv(path_to_dataset)
# Display (n_rows, n_columns) as the cell output.
data.shape

In [None]:
# Work on a copy so the frame returned by read_csv stays untouched.
df = data.copy()
# Display the frame as the cell output.
df

In [None]:
# Distinct values of the 'split' column; later cells select the 'train', 'test' and 'val' groups.
df['split'].unique()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## doc2vec (dimension = 500)

In [None]:

# Folder that receives the *_results(doc500).txt reports written by the *_scores functions.
saved_path = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\results"
# exist_ok=True replaces the check-then-create pair: no race, and a no-op if the folder exists.
os.makedirs(saved_path, exist_ok=True)

# Pre-computed doc2vec embedding files, one .npy per split (500 per the file names).
path_train = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\doc2vec\train_LJPE_500_doc2vec.npy"
path_test = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\doc2vec\test_LJPE_500_doc2vec.npy"
path_dev = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\doc2vec\val_LJPE_500_doc2vec.npy"

x_train = np.load(path_train)
x_test = np.load(path_test)
x_dev = np.load(path_dev)

# Labels are taken from the dataset frame rather than from separate label files.
# NOTE(review): this assumes the rows of each .npy file are in the same order as the
# rows of the matching split in df -- confirm against the embedding-generation code.
grouped = df.groupby('split')
train_df = grouped.get_group('train')
test_df = grouped.get_group('test')
val_df = grouped.get_group('val')

y_train = train_df['encoded_judgment']
y_test = test_df['encoded_judgment']
y_dev = val_df['encoded_judgment']

# Fail fast on a row-count mismatch; otherwise a dev/test mismatch would only
# surface after the (slow) model fitting has already run.
for _split, _features, _labels in (
    ("train", x_train, y_train),
    ("test", x_test, y_test),
    ("val", x_dev, y_dev),
):
    if len(_features) != len(_labels):
        raise ValueError(
            f"{_split}: {len(_features)} embedding rows but {len(_labels)} labels"
        )


def metrics_calculator(preds, test_labels):
    """Compute macro/micro precision, recall and F1 from predicted and true labels.

    Per-class counts come from a confusion matrix built over every label that
    occurs in either input, so the function no longer fails when only one
    class is present. A per-class precision or recall whose denominator is
    zero contributes 0.0 to the macro average, and an F1 whose precision and
    recall are both zero is reported as 0.0 instead of NaN.

    ``macro_f1`` is the harmonic mean of the macro-averaged precision and
    recall (not the mean of per-class F1 scores).

    Args:
        preds: 1-D sequence of predicted labels.
        test_labels: 1-D sequence of ground-truth labels of the same length.

    Returns:
        Tuple of floats ``(macro_precision, macro_recall, macro_f1,
        micro_precision, micro_recall, micro_f1)``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    y_true = np.asarray(test_labels).ravel()
    y_pred = np.asarray(preds).ravel()
    n_samples = y_true.shape[0]
    if n_samples != y_pred.shape[0]:
        raise ValueError(
            f"preds and test_labels differ in length: {y_pred.shape[0]} vs {n_samples}"
        )
    if n_samples == 0:
        raise ValueError("metrics_calculator needs at least one sample")

    # Encode labels as 0..k-1 over the sorted union of true and predicted labels.
    classes, codes = np.unique(np.concatenate([y_true, y_pred]), return_inverse=True)
    true_idx, pred_idx = codes[:n_samples], codes[n_samples:]
    n_classes = classes.shape[0]

    # Confusion matrix: rows are true classes, columns are predicted classes.
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    np.add.at(cm, (true_idx, pred_idx), 1)

    tp = np.diag(cm).astype(float)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    predicted = tp + fp  # samples predicted as each class
    actual = tp + fn     # samples truly belonging to each class

    # Undefined ratios (zero denominator) stay at 0.0.
    precision = np.divide(tp, predicted, out=np.zeros_like(tp), where=predicted > 0)
    recall = np.divide(tp, actual, out=np.zeros_like(tp), where=actual > 0)

    def _f1(p, r):
        return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

    macro_precision = float(precision.mean())
    macro_recall = float(recall.mean())
    # Both totals equal n_samples (> 0), so these divisions are safe.
    micro_precision = float(tp.sum() / predicted.sum())
    micro_recall = float(tp.sum() / actual.sum())
    micro_f1 = _f1(micro_precision, micro_recall)
    macro_f1 = _f1(macro_precision, macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1


def RF_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Sweep RandomForest ``n_estimators`` and write dev/test metrics to a report file.

    One ``RandomForestClassifier`` is fitted per value in 50, 100, ..., 1000
    (the upper bound is inclusive, matching the header written to the report).
    For every fit, accuracy in percent and the six values returned by
    ``metrics_calculator`` are written for the dev and the test set to
    ``RF_results(doc500).txt`` inside ``saved_path``. An existing report is
    overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    n_estimators_grid = range(50, 1001, 50)
    file_path = os.path.join(saved_path, "RF_results(doc500).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    print('Fitting RandomForestClassifier')
    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write(f"Varying the n_estimators from {n_estimators_grid[0]} to {n_estimators_grid[-1]}\n\n")
        for n_est in tqdm.tqdm(n_estimators_grid):
            clf = RandomForestClassifier(n_estimators=n_est)
            clf.fit(train_avg, train_labels)
            f.write(
                f"For n_estimators: {n_est}\n"
                + "-------Dev set-------\n"
                + _split_report(clf.predict(dev_avg), dev_labels)
            )
            f.write(
                "-----Test set------\n"
                + _split_report(clf.predict(test_avg), test_labels)
                + "\n\n"
            )

def LR_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Sweep LogisticRegression ``max_iter`` and write dev/test metrics to a report file.

    One ``LogisticRegression(C=1)`` is fitted per ``max_iter`` value in
    50, 100, ..., 1000 (the upper bound is inclusive, matching the header
    written to the report). For every fit, accuracy in percent and the six
    values returned by ``metrics_calculator`` are written for the dev and the
    test set to ``LR_results(doc500).txt`` inside ``saved_path``. An existing
    report is overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    max_iter_grid = range(50, 1001, 50)
    file_path = os.path.join(saved_path, "LR_results(doc500).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    print('Fitting LogisticRegression')
    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write(f"Varying the max_iters from {max_iter_grid[0]} to {max_iter_grid[-1]}\n\n")
        for it in tqdm.tqdm(max_iter_grid):
            model = LogisticRegression(C=1, max_iter=it)
            model.fit(train_avg, train_labels)
            f.write(
                f"For max_iters: {it}\n"
                + "-----Dev set-----\n"
                + _split_report(model.predict(dev_avg), dev_labels)
            )
            f.write(
                "------Test set------\n"
                + _split_report(model.predict(test_avg), test_labels)
                + "\n\n"
            )

def SVM_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Fit an SVM per kernel and write dev/test metrics to a report file.

    Only the kernel is varied: ``svm.SVC(C=1)`` is fitted once each for the
    ``linear``, ``poly`` and ``rbf`` kernels. For every fit, accuracy in
    percent and the six values returned by ``metrics_calculator`` are written
    for the dev and the test set to ``SVM_results(doc500).txt`` inside
    ``saved_path``. An existing report is overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    file_path = os.path.join(saved_path, "SVM_results(doc500).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split. The same format is
        # used for both splits, which fixes the missing space in " |macro_f1".
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write("Varying the kernels: \n\n")
        for k in ("linear", "poly", "rbf"):
            print("Running for {0}".format(k))
            clf = svm.SVC(C=1, kernel=k)
            clf.fit(train_avg, train_labels)
            f.write(
                "For kernel: " + k + "\n"
                + "-------Dev set------\n"
                + _split_report(clf.predict(dev_avg), dev_labels)
            )
            f.write(
                "------Test set------\n"
                + _split_report(clf.predict(test_avg), test_labels)
                + "\n\n"
            )

# os.chdir(saved_path)

In [None]:
# Train and evaluate each model. This cell runs the logistic-regression max_iter sweep
# on the features and labels loaded in the preceding cell; the report goes to saved_path.

LR_scores(x_train, x_dev, x_test, y_train, y_dev, y_test, saved_path)

In [None]:
# Random-forest n_estimators sweep on the same features and labels; the report goes to saved_path.
RF_scores(x_train, x_dev, x_test, y_train, y_dev, y_test, saved_path)

In [None]:
# SVM kernel comparison on the same features and labels; the report goes to saved_path.
SVM_scores(x_train, x_dev, x_test, y_train, y_dev, y_test, saved_path)

## doc2vec (dimension = 1000)

In [None]:

# Folder that receives the *_results(doc1000).txt reports written by the *_scores functions.
saved_path = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\results"
# exist_ok=True replaces the check-then-create pair: no race, and a no-op if the folder exists.
os.makedirs(saved_path, exist_ok=True)

# Pre-computed doc2vec embedding files, one .npy per split (1000 per the file names).
path_train = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\doc2vec\train_LJPE_1000_doc2vec.npy"
path_test = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\doc2vec\test_LJPE_1000_doc2vec.npy"
path_dev = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\doc2vec\val_LJPE_1000_doc2vec.npy"

x_train = np.load(path_train)
x_test = np.load(path_test)
x_dev = np.load(path_dev)

# Labels are taken from the dataset frame rather than from separate label files.
# NOTE(review): this assumes the rows of each .npy file are in the same order as the
# rows of the matching split in df -- confirm against the embedding-generation code.
grouped = df.groupby('split')
train_df = grouped.get_group('train')
test_df = grouped.get_group('test')
val_df = grouped.get_group('val')

y_train = train_df['encoded_judgment']
y_test = test_df['encoded_judgment']
y_dev = val_df['encoded_judgment']

# Fail fast on a row-count mismatch; otherwise a dev/test mismatch would only
# surface after the (slow) model fitting has already run.
for _split, _features, _labels in (
    ("train", x_train, y_train),
    ("test", x_test, y_test),
    ("val", x_dev, y_dev),
):
    if len(_features) != len(_labels):
        raise ValueError(
            f"{_split}: {len(_features)} embedding rows but {len(_labels)} labels"
        )


def metrics_calculator(preds, test_labels):
    """Compute macro/micro precision, recall and F1 from predicted and true labels.

    Per-class counts come from a confusion matrix built over every label that
    occurs in either input, so the function no longer fails when only one
    class is present. A per-class precision or recall whose denominator is
    zero contributes 0.0 to the macro average, and an F1 whose precision and
    recall are both zero is reported as 0.0 instead of NaN.

    ``macro_f1`` is the harmonic mean of the macro-averaged precision and
    recall (not the mean of per-class F1 scores).

    Args:
        preds: 1-D sequence of predicted labels.
        test_labels: 1-D sequence of ground-truth labels of the same length.

    Returns:
        Tuple of floats ``(macro_precision, macro_recall, macro_f1,
        micro_precision, micro_recall, micro_f1)``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    y_true = np.asarray(test_labels).ravel()
    y_pred = np.asarray(preds).ravel()
    n_samples = y_true.shape[0]
    if n_samples != y_pred.shape[0]:
        raise ValueError(
            f"preds and test_labels differ in length: {y_pred.shape[0]} vs {n_samples}"
        )
    if n_samples == 0:
        raise ValueError("metrics_calculator needs at least one sample")

    # Encode labels as 0..k-1 over the sorted union of true and predicted labels.
    classes, codes = np.unique(np.concatenate([y_true, y_pred]), return_inverse=True)
    true_idx, pred_idx = codes[:n_samples], codes[n_samples:]
    n_classes = classes.shape[0]

    # Confusion matrix: rows are true classes, columns are predicted classes.
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    np.add.at(cm, (true_idx, pred_idx), 1)

    tp = np.diag(cm).astype(float)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    predicted = tp + fp  # samples predicted as each class
    actual = tp + fn     # samples truly belonging to each class

    # Undefined ratios (zero denominator) stay at 0.0.
    precision = np.divide(tp, predicted, out=np.zeros_like(tp), where=predicted > 0)
    recall = np.divide(tp, actual, out=np.zeros_like(tp), where=actual > 0)

    def _f1(p, r):
        return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

    macro_precision = float(precision.mean())
    macro_recall = float(recall.mean())
    # Both totals equal n_samples (> 0), so these divisions are safe.
    micro_precision = float(tp.sum() / predicted.sum())
    micro_recall = float(tp.sum() / actual.sum())
    micro_f1 = _f1(micro_precision, micro_recall)
    macro_f1 = _f1(macro_precision, macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1

def RF_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Sweep RandomForest ``n_estimators`` and write dev/test metrics to a report file.

    One ``RandomForestClassifier`` is fitted per value in 50, 100, ..., 500.
    The report header is derived from the grid actually used; previously it
    claimed "50 to 1000" while the loop stopped at 450.
    NOTE(review): if the 1000 in the old header was the intended bound, change
    the grid to ``range(50, 1001, 50)``.

    For every fit, accuracy in percent and the six values returned by
    ``metrics_calculator`` are written for the dev and the test set to
    ``RF_results(doc1000).txt`` inside ``saved_path``. An existing report is
    overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    n_estimators_grid = range(50, 501, 50)
    file_path = os.path.join(saved_path, "RF_results(doc1000).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros: macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    print('Fitting RandomForestClassifier')
    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write(f"Varying the n_estimators from {n_estimators_grid[0]} to {n_estimators_grid[-1]}\n\n")
        for n_est in tqdm.tqdm(n_estimators_grid):
            clf = RandomForestClassifier(n_estimators=n_est)
            clf.fit(train_avg, train_labels)
            f.write(
                f"For n_estimators: {n_est}\n"
                + "-----Dev set----\n"
                + _split_report(clf.predict(dev_avg), dev_labels)
            )
            f.write(
                "-----Test set-----\n"
                + _split_report(clf.predict(test_avg), test_labels)
                + "\n\n"
            )

def LR_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Sweep LogisticRegression ``max_iter`` and write dev/test metrics to a report file.

    One ``LogisticRegression(C=1)`` is fitted per ``max_iter`` value in
    50, 100, ..., 1000 (the upper bound is inclusive, matching the header
    written to the report). For every fit, accuracy in percent and the six
    values returned by ``metrics_calculator`` are written for the dev and the
    test set to ``LR_results(doc1000).txt`` inside ``saved_path``. An existing
    report is overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    max_iter_grid = range(50, 1001, 50)
    file_path = os.path.join(saved_path, "LR_results(doc1000).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    print('Fitting LogisticRegression')
    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write(f"Varying the max_iters from {max_iter_grid[0]} to {max_iter_grid[-1]}\n\n")
        for it in tqdm.tqdm(max_iter_grid):
            model = LogisticRegression(C=1, max_iter=it)
            model.fit(train_avg, train_labels)
            f.write(
                f"For max_iters: {it}\n"
                + "-----Dev set-----\n"
                + _split_report(model.predict(dev_avg), dev_labels)
            )
            f.write(
                "-----Test set-----\n"
                + _split_report(model.predict(test_avg), test_labels)
                + "\n\n"
            )

def SVM_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Fit an SVM per kernel and write dev/test metrics to a report file.

    Only the kernel is varied: ``svm.SVC(C=1)`` is fitted once each for the
    ``linear``, ``poly`` and ``rbf`` kernels. For every fit, accuracy in
    percent and the six values returned by ``metrics_calculator`` are written
    for the dev and the test set to ``SVM_results(doc1000).txt`` inside
    ``saved_path`` (not the working directory). An existing report is
    overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    file_path = os.path.join(saved_path, "SVM_results(doc1000).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write("Varying the kernels: \n\n")
        for k in ("linear", "poly", "rbf"):
            print("Running for {0}".format(k))
            clf = svm.SVC(C=1, kernel=k)
            clf.fit(train_avg, train_labels)
            f.write(
                "For kernel: " + k + "\n"
                + "-----Dev set-----\n"
                + _split_report(clf.predict(dev_avg), dev_labels)
            )
            f.write(
                "-----Test set-----\n"
                + _split_report(clf.predict(test_avg), test_labels)
                + "\n\n"
            )

In [None]:
# Train and evaluate each model. This cell runs the logistic-regression max_iter sweep
# on the features and labels loaded in the preceding cell; the report goes to saved_path.

LR_scores(x_train, x_dev, x_test, y_train, y_dev, y_test, saved_path)

In [None]:
# Random-forest n_estimators sweep on the same features and labels; the report goes to saved_path.
RF_scores(x_train, x_dev, x_test, y_train, y_dev, y_test, saved_path)

In [None]:
# SVM kernel comparison on the same features and labels; the report goes to saved_path.
SVM_scores(x_train, x_dev, x_test, y_train, y_dev, y_test, saved_path)

## sent2vec

In [None]:

# Folder that receives the sent2vec result reports.
saved_path = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\results"
# exist_ok=True replaces the check-then-create pair: no race, and a no-op if the folder exists.
os.makedirs(saved_path, exist_ok=True)


# Sentence-embedding files, one per split. Note the naming: path_x is the train
# split, path_y the test split and path_z the validation split.
path_x = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\sent2vec\train_LJPE_200_sent2vec.npy"
path_y = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\sent2vec\test_LJPE_200_sent2vec.npy"
path_z = r"F:\[LJPE] Current Models\Embeddings (Classical Models)\LJPE Embeddings\sent2vec\val_LJPE_200_sent2vec.npy"


# Each document is represented by the mean of its sentence embeddings.
# Max and min pooling were also tried but gave worse results; change the
# pooling line below if you want to experiment with those methods.

def average_out_embeddings(npy_file_path, dim=200):
    """Load per-document sentence embeddings and mean-pool them per document.

    Args:
        npy_file_path: Path to a ``.npy`` file whose first axis indexes
            documents; each entry is a sequence of sentence vectors.
        dim: Length of every sentence vector. Defaults to 200, the value that
            was previously hard-coded.

    Returns:
        ``numpy.ndarray`` of shape ``(n_documents, dim)`` holding the mean
        sentence vector of each document. A document without sentences yields
        an all-zero row (instead of the NaN row produced by 0/0), so the
        result stays aligned with the label rows and usable as model input.

    Raises:
        ValueError: If a document's sentence vectors do not have length ``dim``.
    """
    # NOTE(security): allow_pickle=True can execute arbitrary code while
    # unpickling -- only load embedding files from a trusted source.
    documents = np.load(npy_file_path, allow_pickle=True)
    pooled = np.zeros([documents.shape[0], dim])
    for row, sentences in enumerate(documents):
        n_sentences = len(sentences)
        if n_sentences == 0:
            continue  # keep the zero row for empty documents
        # One row per sentence; a (1, dim) sentence vector is flattened to (dim,).
        stacked = np.asarray(sentences, dtype=float).reshape(n_sentences, -1)
        if stacked.shape[1] != dim:
            raise ValueError(
                f"document {row}: sentence vectors have length {stacked.shape[1]}, expected {dim}"
            )
        pooled[row] = stacked.mean(axis=0)
    return pooled

# Prepare each split by averaging the sentence embeddings of every document
# (path_x is the train file, path_y the test file, path_z the validation file).
x_train = average_out_embeddings(path_x)
x_test = average_out_embeddings(path_y)
x_dev = average_out_embeddings(path_z)

# Labels are taken from the dataset frame.
# NOTE(review): this assumes the rows of each embedding file are in the same order
# as the rows of the matching split in df -- confirm against the embedding-generation code.
grouped = df.groupby('split')
train_df = grouped.get_group('train')
test_df = grouped.get_group('test')
val_df = grouped.get_group('val')


y_train1 = train_df['encoded_judgment']
y_test1 = test_df['encoded_judgment']
y_dev1 = val_df['encoded_judgment']

# Fail fast on a row-count mismatch; otherwise a dev/test mismatch would only
# surface after the (slow) model fitting has already run.
for _split, _features, _labels in (
    ("train", x_train, y_train1),
    ("test", x_test, y_test1),
    ("val", x_dev, y_dev1),
):
    if len(_features) != len(_labels):
        raise ValueError(
            f"{_split}: {len(_features)} embedding rows but {len(_labels)} labels"
        )


def metrics_calculator(preds, test_labels):
    """Compute macro/micro precision, recall and F1 from predicted and true labels.

    Per-class counts come from a confusion matrix built over every label that
    occurs in either input, so the function no longer fails when only one
    class is present. A per-class precision or recall whose denominator is
    zero contributes 0.0 to the macro average, and an F1 whose precision and
    recall are both zero is reported as 0.0 instead of NaN.

    ``macro_f1`` is the harmonic mean of the macro-averaged precision and
    recall (not the mean of per-class F1 scores).

    Args:
        preds: 1-D sequence of predicted labels.
        test_labels: 1-D sequence of ground-truth labels of the same length.

    Returns:
        Tuple of floats ``(macro_precision, macro_recall, macro_f1,
        micro_precision, micro_recall, micro_f1)``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    y_true = np.asarray(test_labels).ravel()
    y_pred = np.asarray(preds).ravel()
    n_samples = y_true.shape[0]
    if n_samples != y_pred.shape[0]:
        raise ValueError(
            f"preds and test_labels differ in length: {y_pred.shape[0]} vs {n_samples}"
        )
    if n_samples == 0:
        raise ValueError("metrics_calculator needs at least one sample")

    # Encode labels as 0..k-1 over the sorted union of true and predicted labels.
    classes, codes = np.unique(np.concatenate([y_true, y_pred]), return_inverse=True)
    true_idx, pred_idx = codes[:n_samples], codes[n_samples:]
    n_classes = classes.shape[0]

    # Confusion matrix: rows are true classes, columns are predicted classes.
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    np.add.at(cm, (true_idx, pred_idx), 1)

    tp = np.diag(cm).astype(float)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    predicted = tp + fp  # samples predicted as each class
    actual = tp + fn     # samples truly belonging to each class

    # Undefined ratios (zero denominator) stay at 0.0.
    precision = np.divide(tp, predicted, out=np.zeros_like(tp), where=predicted > 0)
    recall = np.divide(tp, actual, out=np.zeros_like(tp), where=actual > 0)

    def _f1(p, r):
        return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

    macro_precision = float(precision.mean())
    macro_recall = float(recall.mean())
    # Both totals equal n_samples (> 0), so these divisions are safe.
    micro_precision = float(tp.sum() / predicted.sum())
    micro_recall = float(tp.sum() / actual.sum())
    micro_f1 = _f1(micro_precision, micro_recall)
    macro_f1 = _f1(macro_precision, macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1


def SVM_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Fit an SVM per kernel and write dev/test metrics to a report file.

    Only the kernel is varied: ``svm.SVC(C=1)`` is fitted once each for the
    ``linear``, ``poly`` and ``rbf`` kernels. For every fit, accuracy in
    percent and the six values returned by ``metrics_calculator`` are written
    for the dev and the test set to ``SVM_avgi_results(sent200).txt`` inside
    ``saved_path`` (not the working directory). An existing report is
    overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    file_path = os.path.join(saved_path, "SVM_avgi_results(sent200).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write("Varying the kernels: \n\n")
        for k in ("linear", "poly", "rbf"):
            print("Running for {0}".format(k))
            clf = svm.SVC(C=1, kernel=k)
            clf.fit(train_avg, train_labels)
            f.write(
                "For kernel: " + k + "\n"
                + "------Dev set-----\n"
                + _split_report(clf.predict(dev_avg), dev_labels)
            )
            f.write(
                "-----Test set-----\n"
                + _split_report(clf.predict(test_avg), test_labels)
                + "\n\n"
            )

def RF_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Sweep RandomForest ``n_estimators`` and write dev/test metrics to a report file.

    One ``RandomForestClassifier`` is fitted per value in 50, 100, ..., 1000
    (the upper bound is inclusive, matching the header written to the report).
    For every fit, accuracy in percent and the six values returned by
    ``metrics_calculator`` are written for the dev and the test set to
    ``RF_results(sent200).txt`` inside ``saved_path``. An existing report is
    overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    n_estimators_grid = range(50, 1001, 50)
    file_path = os.path.join(saved_path, "RF_results(sent200).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split. The same format is
        # used for both splits, which fixes the missing space in "Macros ->macro_precision".
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    print('Fitting RandomForestClassifier')
    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write(f"Varying the n_estimators from {n_estimators_grid[0]} to {n_estimators_grid[-1]}\n\n")
        for n_est in tqdm.tqdm(n_estimators_grid):
            clf = RandomForestClassifier(n_estimators=n_est)
            clf.fit(train_avg, train_labels)
            f.write(
                f"For n_estimators: {n_est}\n"
                + "----Dev set----\n"
                + _split_report(clf.predict(dev_avg), dev_labels)
            )
            f.write(
                "----Test set----\n"
                + _split_report(clf.predict(test_avg), test_labels)
                + "\n\n"
            )

def LR_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels, saved_path):
    """Sweep LogisticRegression ``max_iter`` and write dev/test metrics to a report file.

    One ``LogisticRegression(C=1)`` is fitted per ``max_iter`` value in
    50, 100, ..., 1000 (the upper bound is inclusive, matching the header
    written to the report). For every fit, accuracy in percent and the six
    values returned by ``metrics_calculator`` are written for the dev and the
    test set to ``LR_results(sent200).txt`` inside ``saved_path``. An existing
    report is overwritten.

    Args:
        train_avg: Training feature matrix passed to ``fit``.
        dev_avg: Dev feature matrix passed to ``predict``.
        test_avg: Test feature matrix passed to ``predict``.
        train_labels: Labels for ``train_avg``.
        dev_labels: Labels for ``dev_avg``.
        test_labels: Labels for ``test_avg``.
        saved_path: Existing directory in which the report is created.

    Returns:
        None; results are only written to the report file.
    """
    max_iter_grid = range(50, 1001, 50)
    file_path = os.path.join(saved_path, "LR_results(sent200).txt")

    def _split_report(split_preds, split_labels):
        # Accuracy line plus the micro/macro lines for one split.
        macro_p, macro_r, macro_f, micro_p, micro_r, micro_f = metrics_calculator(split_preds, split_labels)
        return (
            f"Accuracy -> {accuracy_score(split_labels, split_preds) * 100}\n"
            f"Micros -> micro_precision {micro_p} | micro_recall {micro_r} | micro_f1 {micro_f} \n"
            f"Macros -> macro_precision {macro_p} | macro_recall {macro_r} | macro_f1 {macro_f}\n"
        )

    print('Fitting LogisticRegression')
    # The context manager closes (and flushes) the report even if a fit/predict call raises.
    with open(file_path, "w") as f:
        f.write(f"Varying the max_iters from {max_iter_grid[0]} to {max_iter_grid[-1]}\n\n")
        for it in tqdm.tqdm(max_iter_grid):
            model = LogisticRegression(C=1, max_iter=it)
            model.fit(train_avg, train_labels)
            f.write(
                f"For max_iters: {it}\n"
                + "-----Dev set-----\n"
                + _split_report(model.predict(dev_avg), dev_labels)
            )
            f.write(
                "----Test set----\n"
                + _split_report(model.predict(test_avg), test_labels)
                + "\n\n"
            )

In [None]:
# Train and evaluate each model. This cell runs the logistic-regression max_iter sweep
# on the averaged sentence embeddings with the y_*1 labels; the report goes to saved_path.
LR_scores(x_train, x_dev, x_test, y_train1, y_dev1, y_test1, saved_path)

In [None]:
# Random-forest n_estimators sweep on the same features and labels; the report goes to saved_path.
RF_scores(x_train, x_dev, x_test, y_train1, y_dev1, y_test1, saved_path)

In [None]:
# SVM kernel comparison on the same features and labels; the report goes to saved_path.
SVM_scores(x_train, x_dev, x_test, y_train1, y_dev1, y_test1, saved_path)