# Basic Setup

In [None]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle

from sklearn import model_selection
from sklearn import ensemble
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

from sklearn.model_selection import cross_val_score

# Load in Data

In [None]:
# Load the pickled model that is used further down via predict / predict_proba.
# The file is opened in a context manager so the handle is closed right after
# unpickling instead of being left to the garbage collector.
# NOTE(security): unpickling can execute arbitrary code embedded in the file, so
# this path must only ever point at a trusted, locally produced artifact.
with open("../../N2FL NLP/data/03b_rfc.sav", "rb") as model_file:
    rfc = pickle.load(model_file)

In [None]:
# Read the prepared sentiment table, then discard its "Unnamed: 0" and "index"
# columns so they do not travel further through the notebook.
data = (
    pd.read_csv(
        "../../N2FL NLP/data/03a_sentiment_prep_masked.csv",
        sep=",",
        encoding='utf-8',
    )
    .drop(columns=["Unnamed: 0", "index"])
)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded table.
data.shape

# Clean Data for Algorithm

### Make dummy variables from predictions

In [None]:
# One-hot encode every model's predicted label. Each source column is paired
# explicitly with the prefix its dummy columns receive, and the raw "text"
# column is dropped from the resulting frame.
data_x = pd.get_dummies(
    data,
    columns=[
        'label_predict_yelp',
        'label_predict_xlnet',
        'label_predict_albert',
        'label_predict_stanza',
        'label_predict_bert',
        'label_predict_twit',
        'label_predict_imdb',
    ],
    prefix={
        'label_predict_yelp': 'yelp_',
        'label_predict_xlnet': 'xlnet_',
        'label_predict_albert': 'albert_',
        'label_predict_stanza': 'stanza_',
        'label_predict_bert': 'bert_',
        'label_predict_twit': 'twit_',
        'label_predict_imdb': 'imdb_',
    },
).drop(columns=["text"])

### Add additional predictors

In [None]:
def add_predictors(df):
    """Add confidence-margin columns for several models to ``df``.

    For the yelp, bert and twit models the second-largest class probability
    of each row is stored as ``label_prob2_<model>``, and its distance from
    the existing ``label_prob_<model>`` column is stored as
    ``label_prob_diff_<model>``.  For xlnet and albert,
    ``label_prob_diff_<model>`` is the absolute difference between the
    ``LABEL_0`` and ``LABEL_4`` probabilities.

    The frame is modified in place and the same object is returned.  The
    elapsed wall-clock time is printed before returning.

    Args:
        df: DataFrame holding the ``LABEL_*_<model>`` probability columns and
            the ``label_prob_<model>`` columns read below.

    Returns:
        The same ``df`` object, with the new columns appended.
    """
    began = time.time()

    def second_largest(columns):
        # Sorting each row ascending puts the runner-up probability in the
        # next-to-last position.
        return np.sort(df[columns].values)[:, -2]

    # yelp: margin between the top probability and the runner-up.
    df["label_prob2_yelp"] = second_largest(
        ["LABEL_0_yelp", "LABEL_1_yelp", "LABEL_2_yelp", "LABEL_3_yelp", "LABEL_4_yelp"]
    )
    df["label_prob_diff_yelp"] = df["label_prob_yelp"] - df["label_prob2_yelp"]

    # xlnet / albert: absolute gap between the LABEL_0 and LABEL_4 columns.
    xlnet_gap = df["LABEL_0_xlnet"] - df["LABEL_4_xlnet"]
    df["label_prob_diff_xlnet"] = xlnet_gap.abs()

    albert_gap = df["LABEL_0_albert"] - df["LABEL_4_albert"]
    df["label_prob_diff_albert"] = albert_gap.abs()

    # bert: same top-vs-runner-up margin as yelp.
    df["label_prob2_bert"] = second_largest(
        ["LABEL_0_bert", "LABEL_1_bert", "LABEL_2_bert", "LABEL_3_bert", "LABEL_4_bert"]
    )
    df["label_prob_diff_bert"] = df["label_prob_bert"] - df["label_prob2_bert"]

    # twit: only the LABEL_0 / LABEL_2 / LABEL_4 columns are read here.
    df["label_prob2_twit"] = second_largest(
        ["LABEL_0_twit", "LABEL_2_twit", "LABEL_4_twit"]
    )
    df["label_prob_diff_twit"] = df["label_prob_twit"] - df["label_prob2_twit"]

    print("--- %s seconds ---" % (time.time() - began))

    return df

In [None]:
# Append the margin/gap predictor columns; add_predictors modifies the frame in
# place and returns the same object, so this rebinding keeps the name unchanged.
data_x = add_predictors(data_x)

In [None]:
# Preview the feature frame after the extra predictor columns were added.
data_x.head()

# Apply to analytic data

In [None]:
# Time the class predictions for every row of the feature frame. Wrapping the
# result in a Series gives it a default 0..n-1 index.
start_time = time.time()
rfc_predict_data = pd.Series(rfc.predict(data_x))
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Time the per-class probability estimates for the same feature frame; the
# result is indexed as [row, class] in the cells below.
start_time = time.time()
probabilities = rfc.predict_proba(data_x)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Collect the model output column by column. "merge" holds the default
# positional index of the prediction Series and is the join key used later;
# each LABEL_<k>_forest entry is column k of the probability matrix.
predictions = {
    'merge': rfc_predict_data.index,
    'label_predict_forest': rfc_predict_data,
    'LABEL_0_forest': probabilities[:, 0],
    'LABEL_1_forest': probabilities[:, 1],
    'LABEL_2_forest': probabilities[:, 2],
    'LABEL_3_forest': probabilities[:, 3],
    'LABEL_4_forest': probabilities[:, 4],
}

In [None]:
# Turn the dict of columns into a DataFrame (the name is reused for the frame).
predictions = pd.DataFrame(predictions)

In [None]:
# Positional row number 0..n-1, used as the join key against the predictions.
data_x['merge'] = np.arange(len(data_x))

In [None]:
# Keep the frame's current index as an ordinary column so it survives the merge.
data_x['index'] = data_x.index

In [None]:
# Restore the text column that was dropped when the dummy variables were built
# (pandas aligns the assignment on the index shared by data and data_x).
data_x['text'] = data['text']

In [None]:
# Attach the forest predictions to the features on the positional "merge" key.
# validate="1:1" makes pandas raise if the key is duplicated on either side.
output_prep = pd.merge(data_x, predictions, how="left", on="merge", validate="1:1")

In [None]:
# Inspect the first 20 merged rows.
output_prep.head(20)

In [None]:
# Write the merged table to disk.
# NOTE(review): to_csv writes the row index by default, which appears as an
# "Unnamed: 0" column when the file is read back; confirm downstream readers
# expect that before changing it.
output_prep.to_csv('../../N2FL NLP/data/03c_forest_output_masked.csv')