# Basic Setup

In [1]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle

from sklearn import model_selection
from sklearn import ensemble
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

from sklearn.model_selection import cross_val_score

# Load in Data

In [2]:
# Load the pickled classifier that the cells below call predict / predict_proba on.
# The context manager closes the file handle as soon as the object is read
# (the bare open(...) it replaces left the handle open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model artifacts that come from a trusted source.
with open("../../N2FL NLP/data/03b_rfc.sav", "rb") as model_file:
    rfc = pickle.load(model_file)

In [3]:
# Read the prepared sentiment-score table, then drop the "Unnamed: 0" and
# "index" columns that come with the file.
sentiment_csv = "../../N2FL NLP/data/03a_sentiment_prep_masked.csv"
data = pd.read_csv(sentiment_csv, sep=",", encoding='utf-8')
data = data.drop(columns=["Unnamed: 0", "index"])

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,text,LABEL_0_yelp,LABEL_1_yelp,LABEL_2_yelp,LABEL_3_yelp,LABEL_4_yelp,label_predict_yelp,label_prob_yelp,LABEL_0_xlnet,LABEL_4_xlnet,...,LABEL_3_bert,LABEL_4_bert,label_predict_bert,label_prob_bert,LABEL_0_twit,LABEL_2_twit,LABEL_4_twit,label_predict_twit,label_prob_twit,label_predict_imdb
0,"Hi, I'm a KCC advisor, & I'm here to support y...",0.166542,0.060218,0.059894,0.112819,0.600526,LABEL_4,0.600526,0.372121,0.627879,...,0.292423,0.418517,LABEL_4,0.418517,0.003276,0.202024,0.7947,LABEL_4,0.7947,LABEL_4
1,Hi. Fafsa.gov is open for the 2018-2019 school...,0.165248,0.081339,0.162222,0.241889,0.349302,LABEL_4,0.349302,0.464095,0.535905,...,0.213115,0.210243,LABEL_3,0.213115,0.008751,0.522877,0.468372,LABEL_2,0.522877,LABEL_4
2,Hi. I'm at Austin Community College & I'm here...,0.132794,0.056493,0.068621,0.154709,0.587383,LABEL_4,0.587383,0.181368,0.818632,...,0.25706,0.355172,LABEL_4,0.355172,0.002864,0.240926,0.756209,LABEL_4,0.756209,LABEL_4
3,Hi. Spring 2019 registration is open. Can I he...,0.239166,0.107717,0.229652,0.191002,0.232463,LABEL_0,0.239166,0.671333,0.328667,...,0.118283,0.076205,LABEL_0,0.381713,0.015357,0.880666,0.103976,LABEL_2,0.880666,LABEL_4
4,"Hi. Some students face financial hardships, in...",0.03895,0.082841,0.284229,0.349994,0.243986,LABEL_3,0.349994,0.041506,0.958494,...,0.157046,0.036299,LABEL_2,0.416131,0.180527,0.701489,0.117984,LABEL_2,0.701489,LABEL_4


In [5]:
# Size of the loaded frame as (rows, columns).
data.shape

(52164, 30)

# Clean Data for Algorithm

### Make dummy variables from predictions

In [6]:
# One-hot encode every model's predicted label and drop the raw message text.
# Each prefix already ends in "_" and pandas inserts its own "_" separator,
# so the generated columns carry a double underscore (e.g. "imdb__LABEL_4").
# NOTE(review): the loaded model presumably expects exactly these names —
# confirm before "tidying" the prefixes.
predicted_label_columns = [
    'label_predict_yelp',
    'label_predict_xlnet',
    'label_predict_albert',
    'label_predict_stanza',
    'label_predict_bert',
    'label_predict_twit',
    'label_predict_imdb',
]
dummy_prefixes = ['yelp_', 'xlnet_', 'albert_', 'stanza_', 'bert_', 'twit_', 'imdb_']
data_x = pd.get_dummies(data, prefix=dummy_prefixes, columns=predicted_label_columns)
data_x = data_x.drop(columns=["text"])

### Add additional predictors

In [7]:
def add_predictors(df):
    """Add runner-up probability and confidence-margin columns to ``df``.

    For the models with more than two class scores (yelp, bert, twit) two
    columns are written: ``label_prob2_<model>``, the second-highest class
    score in each row, and ``label_prob_diff_<model>``, the gap between
    ``label_prob_<model>`` and that runner-up.  For the two-score models
    (xlnet, albert) only ``label_prob_diff_<model>`` is written, as the
    absolute difference between the LABEL_0 and LABEL_4 scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already hold the ``LABEL_*_<model>`` score columns and, for
        yelp, bert and twit, the ``label_prob_<model>`` column.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in; the new columns are added to it
        in place, in a fixed order.

    Notes
    -----
    Prints the elapsed wall-clock time as a side effect.
    """
    started_at = time.time()

    def runner_up(score_columns):
        # Sort each row's scores in ascending order; the second-to-last
        # entry is then the second-highest score of that row.
        return np.sort(df[score_columns].values)[:, -2]

    # Five-class yelp scores.
    df["label_prob2_yelp"] = runner_up(
        ["LABEL_0_yelp", "LABEL_1_yelp", "LABEL_2_yelp", "LABEL_3_yelp", "LABEL_4_yelp"]
    )
    df["label_prob_diff_yelp"] = df["label_prob_yelp"] - df["label_prob2_yelp"]

    # Two-score models: the margin is simply the distance between both scores.
    df["label_prob_diff_xlnet"] = df["LABEL_0_xlnet"].sub(df["LABEL_4_xlnet"]).abs()
    df["label_prob_diff_albert"] = df["LABEL_0_albert"].sub(df["LABEL_4_albert"]).abs()

    # Five-class bert scores.
    df["label_prob2_bert"] = runner_up(
        ["LABEL_0_bert", "LABEL_1_bert", "LABEL_2_bert", "LABEL_3_bert", "LABEL_4_bert"]
    )
    df["label_prob_diff_bert"] = df["label_prob_bert"] - df["label_prob2_bert"]

    # Three-class twit scores.
    df["label_prob2_twit"] = runner_up(["LABEL_0_twit", "LABEL_2_twit", "LABEL_4_twit"])
    df["label_prob_diff_twit"] = df["label_prob_twit"] - df["label_prob2_twit"]

    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)

    return df

In [8]:
# add_predictors writes its new columns onto the frame it receives and returns
# that same frame, so this rebinding keeps data_x pointing at the same object.
data_x = add_predictors(data_x)

--- 0.03300762176513672 seconds ---


In [9]:
# Preview the first five rows of the feature frame, including the added columns.
data_x.head(n=5)

Unnamed: 0,LABEL_0_yelp,LABEL_1_yelp,LABEL_2_yelp,LABEL_3_yelp,LABEL_4_yelp,label_prob_yelp,LABEL_0_xlnet,LABEL_4_xlnet,label_prob_xlnet,LABEL_0_albert,...,imdb__LABEL_2,imdb__LABEL_4,label_prob2_yelp,label_prob_diff_yelp,label_prob_diff_xlnet,label_prob_diff_albert,label_prob2_bert,label_prob_diff_bert,label_prob2_twit,label_prob_diff_twit
0,0.166542,0.060218,0.059894,0.112819,0.600526,0.600526,0.372121,0.627879,0.627879,0.414869,...,0,1,0.166542,0.433985,0.255759,0.170262,0.292423,0.126095,0.202024,0.592675
1,0.165248,0.081339,0.162222,0.241889,0.349302,0.349302,0.464095,0.535905,0.535905,0.837673,...,0,1,0.241889,0.107413,0.071811,0.675346,0.212943,0.000173,0.468372,0.054505
2,0.132794,0.056493,0.068621,0.154709,0.587383,0.587383,0.181368,0.818632,0.818632,0.416723,...,0,1,0.154709,0.432674,0.637265,0.166555,0.25706,0.098112,0.240926,0.515283
3,0.239166,0.107717,0.229652,0.191002,0.232463,0.239166,0.671333,0.328667,0.671333,0.893488,...,0,1,0.232463,0.006703,0.342666,0.786975,0.213234,0.168478,0.103976,0.77669
4,0.03895,0.082841,0.284229,0.349994,0.243986,0.349994,0.041506,0.958494,0.958494,0.077866,...,0,1,0.284229,0.065765,0.916987,0.844268,0.290083,0.126048,0.180527,0.520963


# Apply to analytic data

In [10]:
# Predict one label per row with the loaded classifier and wrap the result in
# a Series (default 0..n-1 index); the wall-clock time is printed.
start_time = time.time()
rfc_predict_data = pd.Series(rfc.predict(data_x))
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 8.126530408859253 seconds ---


In [11]:
# Per-class probabilities for every row (one column per model class); the
# wall-clock time is printed.
start_time = time.time()
probabilities = rfc.predict_proba(data_x)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 8.166585445404053 seconds ---


In [12]:
# Collect the forest's predicted label and its per-class probabilities in one
# mapping, keyed for the DataFrame built in the next cell.
# predict_proba orders its columns like rfc.classes_, so the "<label>_forest"
# names are derived from the model instead of assuming LABEL_0..LABEL_4 sit in
# positions 0..4; hard-coded positions would mislabel the columns, or raise
# IndexError, if the model's class set or order were different.
predictions = {'merge': rfc_predict_data.index, 'label_predict_forest': rfc_predict_data}
predictions.update(
    {
        '%s_forest' % class_label: probabilities[:, class_position]
        for class_position, class_label in enumerate(rfc.classes_)
    }
)

In [13]:
# Turn the mapping of prediction columns into a DataFrame.
# Note that the name `predictions` is rebound from a dict to a DataFrame here.
predictions = pd.DataFrame(data=predictions)

In [14]:
# Positional join key (0..n-1) matching the 'merge' column of `predictions`.
# NOTE(review): from here on data_x carries non-feature columns; re-running
# the prediction cells after this point would pass them to the model.
row_count = data_x.shape[0]
data_x['merge'] = np.arange(row_count)

In [15]:
# Keep the row labels as an ordinary column so they are carried into the
# merged output.
data_x['index'] = data_x.index

In [16]:
# Re-attach the message text that was dropped when data_x was built.
# Assigning a Series aligns on index labels; data_x was derived from data, so
# the labels are expected to match — TODO confirm nothing reindexed either frame.
data_x['text'] = data['text']

In [17]:
# Left-join the forest predictions onto the feature frame via the positional
# 'merge' key; validate="1:1" makes pandas raise if the key is duplicated on
# either side.
output_prep = data_x.merge(predictions, how="left", on="merge", validate="1:1")

In [18]:
# Inspect the first twenty merged rows (features plus forest predictions).
output_prep.head(n=20)

Unnamed: 0,LABEL_0_yelp,LABEL_1_yelp,LABEL_2_yelp,LABEL_3_yelp,LABEL_4_yelp,label_prob_yelp,LABEL_0_xlnet,LABEL_4_xlnet,label_prob_xlnet,LABEL_0_albert,...,label_prob_diff_twit,merge,index,text,label_predict_forest,LABEL_0_forest,LABEL_1_forest,LABEL_2_forest,LABEL_3_forest,LABEL_4_forest
0,0.166542,0.060218,0.059894,0.112819,0.600526,0.600526,0.372121,0.627879,0.627879,0.414869,...,0.592675,0,0,"Hi, I'm a KCC advisor, & I'm here to support y...",LABEL_2,0.004474,0.060419,0.616688,0.290658,0.027761
1,0.165248,0.081339,0.162222,0.241889,0.349302,0.349302,0.464095,0.535905,0.535905,0.837673,...,0.054505,1,1,Hi. Fafsa.gov is open for the 2018-2019 school...,LABEL_2,0.04246,0.269455,0.610622,0.064863,0.0126
2,0.132794,0.056493,0.068621,0.154709,0.587383,0.587383,0.181368,0.818632,0.818632,0.416723,...,0.515283,2,2,Hi. I'm at Austin Community College & I'm here...,LABEL_2,0.003592,0.052618,0.651532,0.259046,0.033212
3,0.239166,0.107717,0.229652,0.191002,0.232463,0.239166,0.671333,0.328667,0.671333,0.893488,...,0.77669,3,3,Hi. Spring 2019 registration is open. Can I he...,LABEL_2,0.044704,0.225766,0.66047,0.065249,0.003811
4,0.03895,0.082841,0.284229,0.349994,0.243986,0.349994,0.041506,0.958494,0.958494,0.077866,...,0.520963,4,4,"Hi. Some students face financial hardships, in...",LABEL_2,0.001234,0.007653,0.479607,0.458919,0.052587
5,0.004992,0.00611,0.026793,0.17481,0.787294,0.787294,0.002489,0.997511,0.997511,0.093143,...,0.858425,5,5,Hi! Doing well in classes can bring you closer...,LABEL_3,0.000841,0.02178,0.197206,0.525291,0.254882
6,0.153771,0.385881,0.396226,0.054815,0.009307,0.396226,0.977339,0.022661,0.977339,0.970786,...,0.206855,6,6,"Hi, I know college can be expensive. Do you ne...",LABEL_1,0.06348,0.534085,0.397875,0.00456,0.0
7,0.446299,0.109859,0.082721,0.092623,0.268498,0.446299,0.912241,0.087759,0.912241,0.921018,...,0.615377,7,7,"Hi! Did you know 242,000+ Texans filed FAFSA o...",LABEL_1,0.107104,0.445973,0.415981,0.03041,0.000533
8,0.266856,0.472069,0.238851,0.017593,0.004631,0.472069,0.242447,0.757553,0.757553,0.908015,...,0.781866,8,8,"Hey. With finals coming up, I wanted to check ...",LABEL_2,0.030094,0.168785,0.760792,0.033816,0.006512
9,0.089156,0.142221,0.302377,0.266265,0.199981,0.302377,0.509135,0.490865,0.509135,0.834121,...,0.494318,9,9,"Hi, doing well on exams brings you closer to y...",LABEL_2,0.040178,0.311816,0.523808,0.120025,0.004173


In [19]:
# Persist the merged features and forest predictions.
# NOTE(review): to_csv writes the DataFrame index by default, which shows up
# as an "Unnamed: 0" column when the file is read back with read_csv.
forest_output_csv = '../../N2FL NLP/data/03c_forest_output_masked.csv'
output_prep.to_csv(forest_output_csv)