# Basic Setup

In [None]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle

from sklearn import model_selection
from sklearn import ensemble
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

from sklearn.model_selection import cross_val_score

# Load in Data

In [None]:
train = pd.read_csv("../../N2FL NLP/data/03a_sst5_train.csv", sep=",", encoding='utf-8').drop(columns=["Unnamed: 0", "text"])

In [None]:
test = pd.read_csv("../../N2FL NLP/data/03a_sst5_test.csv", sep=",", encoding='utf-8').drop(columns=["Unnamed: 0", "text"])

# Clean Data for Algorithm

### Make dummy variables from predictions

In [None]:
forest_train = pd.get_dummies(train, prefix=['yelp_', 'xlnet_', 'albert_', 'stanza_', 'bert_', 'twit_', 'imdb_'], columns=['label_predict_yelp', 'label_predict_xlnet', 'label_predict_albert', 'label_predict_stanza', 'label_predict_bert', 'label_predict_twit', 'label_predict_imdb'])

In [None]:
forest_train_x = forest_train.copy().drop(columns=["label"])

In [None]:
forest_train_y = forest_train["label"].copy()

In [None]:
forest_test = pd.get_dummies(test, prefix=['yelp_', 'xlnet_', 'albert_', 'stanza_', 'bert_', 'twit_', 'imdb_'], columns=['label_predict_yelp', 'label_predict_xlnet', 'label_predict_albert', 'label_predict_stanza', 'label_predict_bert', 'label_predict_twit', 'label_predict_imdb'])

In [None]:
forest_test_x = forest_test.copy().drop(columns=["label"])

In [None]:
forest_test_y = forest_test["label"].copy()

### Add additional predictors

In [None]:
def add_predictors(df):
    """Add probability-margin features to ``df`` in place and return it.

    For yelp, bert and twit the second-largest class probability is stored
    as ``label_prob2_<model>`` and its distance from ``label_prob_<model>``
    as ``label_prob_diff_<model>``. For xlnet and albert only
    ``label_prob_diff_<model>`` is added, computed as the absolute
    difference between the ``LABEL_0`` and ``LABEL_4`` probabilities.

    The elapsed wall-clock time is printed before returning.

    Args:
        df: DataFrame holding the ``LABEL_<k>_<model>`` probability columns
            and the ``label_prob_<model>`` columns read below.

    Returns:
        The same DataFrame object, with the new columns appended.
    """
    began = time.time()

    def add_margin(model, classes):
        # Row-wise ascending sort; the second-to-last entry is the runner-up.
        prob_cols = ["LABEL_{}_{}".format(k, model) for k in classes]
        ranked = np.sort(df[prob_cols].values)
        runner_up = "label_prob2_" + model
        df[runner_up] = ranked[:, -2]
        df["label_prob_diff_" + model] = df["label_prob_" + model] - df[runner_up]

    def add_extreme_gap(model):
        # Absolute gap between the two extreme classes.
        gap = df["LABEL_0_" + model] - df["LABEL_4_" + model]
        df["label_prob_diff_" + model] = gap.abs()

    # Order matters: it fixes the order of the appended columns.
    add_margin("yelp", range(5))
    add_extreme_gap("xlnet")
    add_extreme_gap("albert")
    add_margin("bert", range(5))
    add_margin("twit", (0, 2, 4))

    print("--- %s seconds ---" % (time.time() - began))

    return df

In [None]:
forest_train_x = add_predictors(forest_train_x)
forest_test_x = add_predictors(forest_test_x)

In [None]:
forest_train_x.head()

### Make SST2 dataset

In [None]:
# Make an SST2 dataset just for validation purposes: LABEL_1 is merged into
# LABEL_0, LABEL_3 into LABEL_4, and LABEL_2 rows are removed.
# NOTE(review): these frames start from forest_train / forest_test, which do
# not carry the columns add_predictors appended to forest_train_x /
# forest_test_x - confirm that is intended.
sst2_train = forest_train.copy()
sst2_train["label"] = sst2_train["label"].replace({"LABEL_1": "LABEL_0", "LABEL_3": "LABEL_4"})
sst2_train = sst2_train.loc[sst2_train["label"] != "LABEL_2"].copy()

sst2_train_x = sst2_train.drop(columns=["label"])
sst2_train_y = sst2_train["label"].copy()

sst2_test = forest_test.copy()
sst2_test["label"] = sst2_test["label"].replace({"LABEL_1": "LABEL_0", "LABEL_3": "LABEL_4"})
sst2_test = sst2_test.loc[sst2_test["label"] != "LABEL_2"].copy()

sst2_test_x = sst2_test.drop(columns=["label"])
sst2_test_y = sst2_test["label"].copy()

# Train the algorithm

In [None]:
# Candidate values for the randomized random-forest hyperparameter search.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was only an alias
# of 'sqrt' (so it duplicated that entry), and current scikit-learn releases
# reject it as an invalid value.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = ensemble.RandomForestClassifier()

In [None]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 30, cv = 4, verbose=2, random_state=42, n_jobs = -1)

In [None]:
# Fit the random search model
rf_random.fit(forest_train_x, forest_train_y)

In [None]:
rf_random.best_params_

In [None]:
rfc = ensemble.RandomForestClassifier(bootstrap= True, max_depth= 70, max_features= 'auto', min_samples_leaf= 4, min_samples_split= 2, n_estimators= 1800, random_state=12345)
rfc.fit(forest_train_x, forest_train_y)

In [None]:
rfc_predict_train = rfc.predict(forest_train_x)

In [None]:
rfc_predict_test = rfc.predict(forest_test_x)

# Evaluate performance

### Basic accuracy checks

In [None]:
def prediction_test(truevals, predictvals):
    """Draw a confusion-matrix heatmap and print a classification report.

    Quick visual check of the accuracy of the sentiment analysis algorithm.
    The heatmap is drawn on the current matplotlib figure (callers save it
    afterwards with ``plt.savefig``); the report goes to stdout. Calling
    this also applies ``sns.set`` with the font scale and figure size below.

    Args:
        truevals: Dataframe column of true values.
        predictvals: Dataframe column of predicted values.

    Returns:
        None.
    """
    sns.set(font_scale=1.2, rc={'figure.figsize': (11.7, 8.27)})

    # Sorted union of observed classes - the same set and order
    # confusion_matrix infers by default. Passing it explicitly lets the
    # heatmap ticks show real class labels instead of positional indices
    # (0..n-1), which are misleading when classes are e.g. 0, 2 and 4.
    labels = sorted(set(truevals) | set(predictvals))
    cm = confusion_matrix(truevals, predictvals, labels=labels)

    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, cmap='Greens', fmt='g',
                xticklabels=labels, yticklabels=labels)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')

    print(classification_report(truevals, predictvals))

In [None]:
# 5-class confusion matrix and report on the training data; savefig runs
# right after the plot so it captures the figure that was just drawn.
prediction_test(forest_train_y, rfc_predict_train)
plt.savefig('../../N2FL NLP/output/03b_forest_train_accuracy_5class.png')

In [None]:
# Same check on the test data, saved to a separate image.
prediction_test(forest_test_y, rfc_predict_test)
plt.savefig('../../N2FL NLP/output/03b_forest_test_accuracy_5class.png')

### Run some quick calculations to see other metrics of accuracy

In [None]:
# test one-off accuracy
test = {"truth": forest_test_y, "prediction": rfc_predict_test}
test = pd.DataFrame(test)

In [None]:
test["truth"] = test["truth"].str.slice(start=6).astype(float)
test["prediction"] = test["prediction"].str.slice(start=6).astype(float)

In [None]:
test.head()

In [None]:
test["error"] = abs(test["truth"] - test["prediction"])

In [None]:
test["error"][test["error"]<=1].shape[0] / 2210

In [None]:
test2 = test.copy()

In [None]:
test2["truth"][test2["truth"]==1] = 0
test2["truth"][test2["truth"]==3] = 4
test2["prediction"][test2["prediction"]==1] = 0
test2["prediction"][test2["prediction"]==3] = 4

In [None]:
test2["error"] = abs(test2["truth"] - test2["prediction"])

In [None]:
test2["error"][test2["error"]==0].shape[0] / 2210

In [None]:
prediction_test(test2["truth"], test2["prediction"])
plt.savefig('../../N2FL NLP/output/03b_forest_test_accuracy_3class.png')

### Output the model for use in next script

In [None]:
# Serialize the fitted classifier. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open("../../N2FL NLP/data/rfc.sav", "wb") as model_file:
    pickle.dump(rfc, model_file)