# Basic Setup


### Import libraries

In [None]:
import os
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn import model_selection
from sklearn import ensemble
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

from sklearn.model_selection import cross_val_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM
# likely will also require installation of the sentencepiece library

import stanza
# Will require downloading their English pre-trained models using the below command:
# stanza.download('en')

In [None]:
# PyTorch is imported under the module name "torch"; "import pytorch" fails
# with ModuleNotFoundError because no module of that name is provided.
import torch

### Check that GPU loaded correctly

In [None]:
# Ask TensorFlow which GPU it can see. Anything other than the first GPU
# device stops the notebook here with a SystemError.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# Begin creating NLP sentiment analysis pipeline

### Define various sentiment analysis models

In [None]:
# Yelp sentiment model: https://huggingface.co/gilf/english-yelp-sentiment
# device=0 runs the pipeline on the first GPU; return_all_scores=True asks for
# the score of every label instead of only the best one.
yelp_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained("gilf/english-yelp-sentiment"),
    tokenizer=AutoTokenizer.from_pretrained("gilf/english-yelp-sentiment"),
    device=0,
    return_all_scores=True,
)

In [None]:
# Multilingual BERT sentiment model:
# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
# Model and tokenizer are loaded from the same checkpoint and run on GPU 0.
bert_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment"),
    tokenizer=AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment"),
    device=0,
    return_all_scores=True,
)

In [None]:
# Twitter RoBERTa sentiment model:
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
# The tokenizer is created with model_max_length=500 and truncation=True.
twit_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
    tokenizer=AutoTokenizer.from_pretrained(
        "cardiffnlp/twitter-roberta-base-sentiment",
        model_max_length=500,
        truncation=True,
    ),
    device=0,
    return_all_scores=True,
)

In [None]:
# XLNet (base, cased) fine-tuned on SST-2, which is the checkpoint loaded below:
# https://huggingface.co/textattack/xlnet-base-cased-SST-2
xlnet_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained("textattack/xlnet-base-cased-SST-2"),
    tokenizer=AutoTokenizer.from_pretrained("textattack/xlnet-base-cased-SST-2"),
    device=0,
    return_all_scores=True,
)

In [None]:
# T5 fine-tuned on IMDB: https://huggingface.co/mrm8488/t5-base-finetuned-imdb-sentiment
# This is a text2text-generation pipeline, so it produces generated text
# rather than per-label scores.
imdb_all = pipeline(
    task="text2text-generation",
    model=AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment"),
    tokenizer=AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment"),
    device=0,
)

In [None]:
# ALBERT fine-tuned on SST-2: https://huggingface.co/textattack/albert-base-v2-SST-2
# The tokenizer is loaded with use_fast=False.
albert_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained("textattack/albert-base-v2-SST-2"),
    tokenizer=AutoTokenizer.from_pretrained("textattack/albert-base-v2-SST-2", use_fast=False),
    device=0,
    return_all_scores=True,
)

In [None]:
# English Stanza pipeline restricted to the tokenizer and sentiment processors
stanza_top = stanza.Pipeline(
    lang='en',
    processors='tokenize,sentiment',
)

### Define pipeline for running sentiment analysis and charting output

In [None]:
# Labels, in pipeline output order, for the models that return one score per
# class. The position of a label in the tuple is the position of its score in
# the pipeline output.
_SCORE_LABELS = {
    "yelp": ("LABEL_0", "LABEL_1", "LABEL_2", "LABEL_3", "LABEL_4"),
    "bert": ("LABEL_0", "LABEL_1", "LABEL_2", "LABEL_3", "LABEL_4"),
    "twit": ("LABEL_0", "LABEL_2", "LABEL_4"),
    "xlnet": ("LABEL_0", "LABEL_4"),
    "albert": ("LABEL_0", "LABEL_4"),
}

# Text generated by the T5 IMDB model -> label. Any other generated text is
# reported as LABEL_2.
_IMDB_LABELS = {"negative": "LABEL_0", "positive": "LABEL_4"}

# Stanza sentence sentiment value -> label.
_STANZA_LABELS = {0: "LABEL_0", 1: "LABEL_2", 2: "LABEL_4"}


# Return the module-level scoring pipeline for `model`. Explicit branches are
# used so that only the pipeline that is actually requested has to be defined.
def _score_pipeline(model):
    if model == "yelp":
        return yelp_all
    if model == "bert":
        return bert_all
    if model == "twit":
        return twit_all
    if model == "xlnet":
        return xlnet_all
    if model == "albert":
        return albert_all
    raise ValueError("no scoring pipeline for model %r" % (model,))


# Turn one text's list of {"score": ...} entries into a series holding one
# score per label followed by the winning label and its score.
def _scores_to_series(class_scores, labels):
    scores = pd.Series(
        {label: class_scores[position]["score"] for position, label in enumerate(labels)}
    )
    summary = pd.Series({'label_predict': scores.idxmax(), 'label_prob': scores.max()})
    # Series.append was removed in pandas 2.0; pd.concat builds the same series.
    return pd.concat([scores, summary])


# Function to run the sentiment analysis algorithm on a specific string of text
# and input the results into a tidy series
#
# ARGS:
# text - String of text to be analyzed
# model - one of "yelp", "bert", "twit", "xlnet", "imdb", "albert", or "stanza"
#
# RETURNS:
# pandas Series. For "yelp", "bert", "twit", "xlnet" and "albert" it holds one
# score per label plus 'label_predict' (highest-scoring label) and 'label_prob'
# (that label's score). For "imdb" and "stanza" it holds only 'label_predict'.
#
# RAISES:
# ValueError - if model is not one of the names above, or if Stanza returns no
#              sentence with a sentiment of 0, 1 or 2 for the text

def save_sentiment(text, model):
    # Models that return a score for every class
    if model in _SCORE_LABELS:
        class_scores = _score_pipeline(model)(text)[0]
        return _scores_to_series(class_scores, _SCORE_LABELS[model])

    # The T5 model generates a word instead of scores
    if model == "imdb":
        generated = imdb_all(text)[0]["generated_text"]
        return pd.Series({'label_predict': _IMDB_LABELS.get(generated, "LABEL_2")})

    if model == "stanza":
        # Stanza reports a sentiment per sentence; the last sentence with a
        # recognised value decides the label.
        # NOTE(review): for multi-sentence text the earlier sentences are
        # ignored - confirm that this is the intended aggregation.
        label = None
        for sentence in stanza_top(text).sentences:
            label = _STANZA_LABELS.get(sentence.sentiment, label)
        if label is None:
            raise ValueError("stanza returned no sentence sentiment for text %r" % (text,))
        return pd.Series({'label_predict': label})

    raise ValueError(
        'unknown model %r; expected one of "yelp", "bert", "twit", "xlnet", '
        '"imdb", "albert", or "stanza"' % (model,)
    )
  

In [None]:
# Function to score every row of a dataset with one sentiment model and
# print how long the scoring took
#
# ARGS:
# dataset - Dataframe holding the text to score
# text_column - Name of the column in dataset that contains the text
# model - Model name passed through to save_sentiment
#
# RETURNS:
# dataset with the columns produced by save_sentiment appended on the right

def get_sentiment(dataset, text_column, model):
    started = time.time()

    # save_sentiment returns a series per row, so apply yields one column per key
    scored = dataset[text_column].apply(save_sentiment, model=model)
    elapsed = time.time() - started
    print("--- %s seconds ---" % elapsed)

    return pd.concat([dataset, scored], axis=1)

In [None]:
# Function to draw the confusion matrix of a set of predictions as a heatmap
# and print the standard classification metrics for them
#
# ARGS:
# truevals - Dataframe column of true values
# predictvals - Dataframe column of predicted values
#
# RETURNS:
# Nothing; the heatmap is drawn on the current matplotlib figure and the
# classification report is printed

def prediction_test(truevals, predictvals):
    sns.set(font_scale=1.2, rc={'figure.figsize':(11.7,8.27)})

    matrix = confusion_matrix(truevals, predictvals)
    axes = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=axes, cmap='Greens', fmt='g')

    axes.set_xlabel('Predicted labels')
    axes.set_ylabel('True labels')
    axes.set_title('Confusion Matrix')

    report = classification_report(truevals, predictvals)
    print(report)

# Apply to Stanford Sentiment Treebank and Benchmark

### Load in Stanford Sentiment Treebank to benchmark performance

In [None]:
# Tab-separated SST train and test splits, read as UTF-8
# source: https://raw.githubusercontent.com/brhkim/Albert-Sentiment-Analysis/master/data/train.tsv
sst5_train = pd.read_csv(
    "../../N2FL NLP/data/train.tsv",
    sep="\t",
    encoding='utf-8',
)
# source: https://raw.githubusercontent.com/brhkim/Albert-Sentiment-Analysis/master/data/test.tsv
sst5_test = pd.read_csv(
    "../../N2FL NLP/data/test.tsv",
    sep="\t",
    encoding='utf-8',
)

In [None]:
# Basic relabeling to mesh with algorithm output: each text label in the
# training split is swapped for the matching LABEL_n code
sst5_train["label"] = sst5_train["label"].replace({
    "very neg": "LABEL_0",
    "neg": "LABEL_1",
    "neu": "LABEL_2",
    "pos": "LABEL_3",
    "very pos": "LABEL_4",
})

In [None]:
# Same relabeling for the test split: text labels become LABEL_n codes
sst5_test["label"] = sst5_test["label"].replace({
    "very neg": "LABEL_0",
    "neg": "LABEL_1",
    "neu": "LABEL_2",
    "pos": "LABEL_3",
    "very pos": "LABEL_4",
})

In [None]:
# Preview five rows of the test dataset (fixed seed, so the same rows each run)
sst5_test.sample(n=5, random_state=1234)

In [None]:
# Make a 3-class SST dataset: LABEL_1 is folded into LABEL_0 and LABEL_3 into
# LABEL_4, leaving LABEL_0 / LABEL_2 / LABEL_4. The 5-class frames are copied
# first so they stay untouched.
sst3_train = sst5_train.copy()
sst3_train["label"] = sst3_train["label"].replace({"LABEL_1": "LABEL_0", "LABEL_3": "LABEL_4"})

sst3_test = sst5_test.copy()
sst3_test["label"] = sst3_test["label"].replace({"LABEL_1": "LABEL_0", "LABEL_3": "LABEL_4"})

In [None]:
# Preview five rows of the 3-class test dataset (fixed seed)
sst3_test.sample(n=5, random_state=1234)

In [None]:
# Make a 2-class SST dataset by removing the LABEL_2 rows from the 3-class
# splits; .copy() gives independent frames rather than views
sst2_train = sst3_train.loc[sst3_train["label"] != "LABEL_2"].copy()
sst2_test = sst3_test.loc[sst3_test["label"] != "LABEL_2"].copy()

# Run and evaluate various sentiment analysis pipelines on SST data

### Yelp BERT model

In [None]:
# Score the 5-class SST test split with the Yelp pipeline; the result is the
# test frame with the score and prediction columns appended
yelp_test = get_sentiment(sst5_test, "text", "yelp")

In [None]:
# Show five scored rows (fixed seed, so the same rows each run)
yelp_test.sample(5, random_state=1236)

In [None]:
# Draw the confusion matrix, print the classification report, then save the
# current figure to the output folder
prediction_test(yelp_test["label"], yelp_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_yelp_test.png')

### Stanza Sentiment Analysis CNN

In [None]:
# Score the 3-class SST test split with Stanza (save_sentiment maps Stanza's
# output onto LABEL_0 / LABEL_2 / LABEL_4)
stanza_test = get_sentiment(sst3_test, "text", "stanza")

In [None]:
# Show five scored rows (fixed seed)
stanza_test.sample(5, random_state=1237)

In [None]:
# Confusion matrix and classification report, then save the figure
prediction_test(stanza_test["label"], stanza_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_stanza_test.png')

### TextAttack Albert Model

In [None]:
# Score the 2-class SST test split with ALBERT (save_sentiment reports its two
# scores as LABEL_0 and LABEL_4)
albert_test = get_sentiment(sst2_test, "text", "albert")

In [None]:
# Show the first scored rows
albert_test.head()

In [None]:
# Confusion matrix and classification report, then save the figure
prediction_test(albert_test["label"], albert_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_albert_test.png')

### TextAttack XLNet Model

In [None]:
# Score the 2-class SST test split with XLNet (save_sentiment reports its two
# scores as LABEL_0 and LABEL_4)
xlnet_test = get_sentiment(sst2_test, "text", "xlnet")

In [None]:
# Show the first scored rows
xlnet_test.head()

In [None]:
# Confusion matrix and classification report, then save the figure
prediction_test(xlnet_test["label"], xlnet_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_xlnet_test.png')

### Multilingual Bert

In [None]:
# Score the 5-class SST test split with the multilingual BERT pipeline
bert_test = get_sentiment(sst5_test, "text", "bert")

In [None]:
# Show the first scored rows
bert_test.head()

In [None]:
# Confusion matrix and classification report, then save the figure
prediction_test(bert_test["label"], bert_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_bert_test.png')

### Twitter Roberta

In [None]:
# Score the 3-class SST test split with the Twitter RoBERTa pipeline
# (save_sentiment reports its three scores as LABEL_0 / LABEL_2 / LABEL_4)
twit_test = get_sentiment(sst3_test, "text", "twit")

In [None]:
# Show the first scored rows
twit_test.head()

In [None]:
# Confusion matrix and classification report, then save the figure
prediction_test(twit_test["label"], twit_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_twit_test.png')

### T5 IMDB

In [None]:
# Score the 2-class SST test split with the T5 IMDB pipeline; this model only
# yields a predicted label, no scores
imdb_test = get_sentiment(sst2_test, "text", "imdb")

In [None]:
# Show the first scored rows
imdb_test.head()

In [None]:
# Rows predicted as LABEL_2, i.e. where the generated text was neither
# "negative" nor "positive" (see save_sentiment)
imdb_test[imdb_test["label_predict"]=="LABEL_2"]

In [None]:
# Confusion matrix and classification report, then save the figure
prediction_test(imdb_test["label"], imdb_test["label_predict"])
plt.savefig('../../N2FL NLP/output/03a_imdb_test.png')

# Create datasets for training the ensemble classifier

### Create training data

In [None]:
# Each cell below scores the 5-class training split with one model, drops the
# original "text" and "label" columns so only that model's outputs remain, and
# tags every remaining column with the model name.
yelp_train = get_sentiment(sst5_train, "text", "yelp").drop(columns=["text", "label"]).add_suffix("_yelp")

In [None]:
xlnet_train = get_sentiment(sst5_train, "text", "xlnet").drop(columns=["text", "label"]).add_suffix("_xlnet")

In [None]:
albert_train = get_sentiment(sst5_train, "text", "albert").drop(columns=["text", "label"]).add_suffix("_albert")

In [None]:
stanza_train = get_sentiment(sst5_train, "text", "stanza").drop(columns=["text", "label"]).add_suffix("_stanza")

In [None]:
bert_train = get_sentiment(sst5_train, "text", "bert").drop(columns=["text", "label"]).add_suffix("_bert")

In [None]:
twit_train = get_sentiment(sst5_train, "text", "twit").drop(columns=["text", "label"]).add_suffix("_twit")

In [None]:
imdb_train = get_sentiment(sst5_train, "text", "imdb").drop(columns=["text", "label"]).add_suffix("_imdb")

#### Merge datasets together

In [None]:
# Start from the labelled training split and attach each model's columns in
# turn. Every merge joins on the row index of both frames, and validate="1:1"
# makes pandas raise if an index value is duplicated on either side.
forest_train = pd.merge(sst5_train, yelp_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_train = pd.merge(forest_train, xlnet_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_train = pd.merge(forest_train, albert_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_train = pd.merge(forest_train, stanza_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_train = pd.merge(forest_train, bert_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_train = pd.merge(forest_train, twit_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_train = pd.merge(forest_train, imdb_train, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
# Save the combined training features (the row index is written as the first column)
forest_train.to_csv('../../N2FL NLP/data/03a_sst5_train.csv')

### Create testing data

In [None]:
# Same feature construction as for the training data, applied to the 5-class
# test split. NOTE: these assignments rebind the *_test names used in the
# per-model evaluation cells earlier in the notebook.
yelp_test = get_sentiment(sst5_test, "text", "yelp").drop(columns=["text", "label"]).add_suffix("_yelp")

In [None]:
xlnet_test = get_sentiment(sst5_test, "text", "xlnet").drop(columns=["text", "label"]).add_suffix("_xlnet")

In [None]:
albert_test = get_sentiment(sst5_test, "text", "albert").drop(columns=["text", "label"]).add_suffix("_albert")

In [None]:
stanza_test = get_sentiment(sst5_test, "text", "stanza").drop(columns=["text", "label"]).add_suffix("_stanza")

In [None]:
bert_test = get_sentiment(sst5_test, "text", "bert").drop(columns=["text", "label"]).add_suffix("_bert")

In [None]:
twit_test = get_sentiment(sst5_test, "text", "twit").drop(columns=["text", "label"]).add_suffix("_twit")

In [None]:
imdb_test = get_sentiment(sst5_test, "text", "imdb").drop(columns=["text", "label"]).add_suffix("_imdb")

#### Merge datasets together

In [None]:
# Start from the labelled test split and attach each model's columns in turn.
# Every merge joins on the row index of both frames, and validate="1:1" makes
# pandas raise if an index value is duplicated on either side.
forest_test = pd.merge(sst5_test, yelp_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_test = pd.merge(forest_test, xlnet_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_test = pd.merge(forest_test, albert_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_test = pd.merge(forest_test, stanza_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_test = pd.merge(forest_test, bert_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_test = pd.merge(forest_test, twit_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
forest_test = pd.merge(forest_test, imdb_test, how="inner", left_index=True, right_index=True, validate="1:1")

In [None]:
# Save the combined test features (the row index is written as the first column)
forest_test.to_csv('../../N2FL NLP/data/03a_sst5_test.csv')

# Apply to the analytic dataset

### Load in and organize the analytic data

In [None]:
# Load the analytic dataset from the Stata file and keep the row index as an
# explicit "index" column; it is the key used later to join the scores back
data = pd.read_stata('../../N2FL NLP/data/02_advisornames_cleaned_unique.dta')
data["index"]=data.index
data.shape

In [None]:
# Show the first rows of the analytic data
data.head()

### Apply the Sentiment Analysis algorithms to the analytic data

In [None]:
# Each get_sentiment call returns the input frame plus that model's output
# columns; add_suffix then tags EVERY column (original ones included) with the
# model name, e.g. "text_yelp" and "index_yelp".
# NOTE(review): the later joins drop only the suffixed "text" and "index"
# columns, so any other column of `data` would be carried along once per model
# - confirm which columns `data` has.
data_yelp = get_sentiment(data, "text", "yelp").add_suffix("_yelp")

In [None]:
data_xlnet = get_sentiment(data, "text", "xlnet").add_suffix("_xlnet")

In [None]:
data_albert = get_sentiment(data, "text", "albert").add_suffix("_albert")

In [None]:
# Minor cleaning steps because Stanza can't take empty rows
# Only rows whose text is at least 2 characters long are kept; the helper
# length column is removed again afterwards
stanza_prep = data.copy()
stanza_prep["str_len"] = stanza_prep["text"].str.len()
stanza_prep = stanza_prep[stanza_prep['str_len'] >= 2].drop(columns=["str_len"])

In [None]:
data_stanza = get_sentiment(stanza_prep, "text", "stanza").add_suffix("_stanza")

In [None]:
data_bert = get_sentiment(data, "text", "bert").add_suffix("_bert")

In [None]:
# Minor cleaning steps because there's a max string length
# Only rows whose text is at most 400 characters long are kept; the helper
# length column is removed again afterwards
twit_prep = data.copy()
twit_prep["str_len"] = twit_prep["text"].str.len()
twit_prep = twit_prep[twit_prep['str_len'] <= 400].drop(columns=["str_len"])

In [None]:
data_twit = get_sentiment(twit_prep, "text", "twit").add_suffix("_twit")

In [None]:
# Minor cleaning steps because imdb can't take non-ascii characters apparently
# Encoding to ASCII with errors="ignore" silently removes every non-ASCII
# character from the text before it is decoded back to a string
imdb_prep = data.copy()
imdb_prep["text"] = imdb_prep["text"].str.encode("ascii", "ignore").str.decode("ascii")

In [None]:
data_imdb = get_sentiment(imdb_prep, "text", "imdb").add_suffix("_imdb")

### Join sentiment scores back into the main dataset

In [None]:
# Join each model's scores back onto the analytic data by matching the "index"
# column with the model's suffixed copy of it, then drop the duplicated text
# and key columns. validate="1:1" makes pandas raise if a key is duplicated on
# either side.
# NOTE(review): the joins are inner joins, so rows filtered out before scoring
# (text shorter than 2 characters for Stanza, longer than 400 characters for
# the Twitter model) disappear from data_analyze from that merge onward -
# confirm this row loss is intended.
data_analyze = pd.merge(data, data_yelp, how="inner", left_on="index", right_on="index_yelp", validate="1:1").drop(columns=["text_yelp", "index_yelp"])

In [None]:
data_analyze = pd.merge(data_analyze, data_xlnet, how="inner", left_on="index", right_on="index_xlnet", validate="1:1").drop(columns=["text_xlnet", "index_xlnet"])

In [None]:
data_analyze = pd.merge(data_analyze, data_albert, how="inner", left_on="index", right_on="index_albert", validate="1:1").drop(columns=["text_albert", "index_albert"])

In [None]:
data_analyze = pd.merge(data_analyze, data_stanza, how="inner", left_on="index", right_on="index_stanza", validate="1:1").drop(columns=["text_stanza", "index_stanza"])

In [None]:
data_analyze = pd.merge(data_analyze, data_bert, how="inner", left_on="index", right_on="index_bert", validate="1:1").drop(columns=["text_bert", "index_bert"])

In [None]:
data_analyze = pd.merge(data_analyze, data_twit, how="inner", left_on="index", right_on="index_twit", validate="1:1").drop(columns=["text_twit", "index_twit"])

In [None]:
data_analyze = pd.merge(data_analyze, data_imdb, how="inner", left_on="index", right_on="index_imdb", validate="1:1").drop(columns=["text_imdb", "index_imdb"])

In [None]:
# Row and column counts after all joins (compare with data.shape above)
data_analyze.shape

In [None]:
# Show ten joined rows (fixed seed, so the same rows each run)
data_analyze.sample(10, random_state=1234)

### Output the dataset

In [None]:
# Destination of the scored analytic dataset
filepath = '../../N2FL NLP/data/03a_sentiment_prep_masked.csv'

In [None]:
# Write the joined dataset as CSV (the row index is written as the first column)
data_analyze.to_csv(filepath)