# Basic Setup


### Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn import model_selection
from sklearn import ensemble
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

from sklearn.model_selection import cross_val_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM

import stanza
# stanza.download('en')

2021-12-28 13:23:49.897509: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Check that GPU loaded correctly

In [2]:
# Abort early unless TensorFlow reports the first GPU; the pipelines below are
# all created with device = 0.
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
    raise SystemError('GPU device not found')
else:
    print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


2021-12-28 13:23:55.777340: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-28 13:23:55.779490: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-28 13:23:55.781067: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-28 13:23:55.847353: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:84:00.0 name: Tesla K80 computeCapability: 3.7
coreClock: 0.8235GHz coreCount: 13 deviceMemorySize: 11.17GiB deviceMemoryBandwidth: 223.96GiB/s
2021-12-28 13:23:55.847390: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully

### Grab environment variables

In [3]:
# Job configuration is passed in through environment variables.
# os.environ[...] is used rather than os.getenv(...) so that an unset variable
# fails right here with a KeyError naming the variable, instead of an opaque
# int(None) TypeError or a silent "None" string ending up inside file paths.
totalarrays = int(os.environ['arraymaxbk'])       # total number of array jobs
currentarray = int(os.environ['arraycurrentbk'])  # 1-based number of this array job
filenameset = os.environ['filenameset']           # name of the input file inside rootset
suffixset = os.environ['suffixset']               # run suffix
rootset = os.environ['rootset']                   # directory holding the input file; the output CSV is written here too
datagroupset = os.environ['datagroupset']         # data group tag used in the output file name

In [3]:
#totalarrays=160
#currentarray=1
#filenameset="03_test_texts_sentence_21-12-20.dta"
#suffixset="21-12-20"
#rootset="/project/commonappteacherrec/teacher_rec_full_replication/data/build"
#datagroupset="test"

### Load in and organize the teacher recommendation text data

In [4]:
# Full path of the input file: <rootset>/<filenameset>
filename = "/".join((rootset, filenameset))

In [5]:
# Read the full Stata file into memory.
teacherrec = pd.read_stata(filename)
# Store each row's index label in an explicit "index" column; it is the key
# used later to merge every model's scores back onto the rows.
teacherrec["index"]=teacherrec.index
# Display (rows, columns) as a quick sanity check.
teacherrec.shape

(49792681, 3)

### Subset data into partitions for array job

In [6]:
# Total number of rows available to split across the array jobs
datalength = teacherrec.shape[0]

In [7]:
# Rows per array job, rounded to the nearest integer. The last job does not
# rely on this size for its upper bound; that is set to the dataset length.
partitionsize = int(round(datalength / totalarrays))
partitionsize

311204

In [8]:
# First row (inclusive) of this job's partition; currentarray counts from 1
bottombound = (currentarray - 1) * partitionsize
bottombound

0

In [9]:
# End row (exclusive) of this job's partition. The final job runs to the end
# of the dataset so that rows lost to rounding of partitionsize are included.
topbound = datalength if currentarray == totalarrays else partitionsize * currentarray
topbound

311204

In [10]:
# Take this job's rows. The explicit .copy() matters: a plain row slice is a
# view onto the full frame's data, so the full dataset would stay in memory
# even after `del teacherrec`. Copying also makes `subset` safe to modify
# without SettingWithCopy warnings.
subset = teacherrec[bottombound:topbound].copy()
#subset = teacherrec[1:100]

In [12]:
# Drop the name bound to the full dataset; only `subset` is used from here on.
del teacherrec

# Begin creating NLP sentiment analysis pipeline

### Define various sentiment analysis models

In [5]:
# Yelp sentiment model: https://huggingface.co/gilf/english-yelp-sentiment
# Model and tokenizer are pinned to the same revision hash; scores for every
# class are returned, not just the top one.
yelp_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(
        "gilf/english-yelp-sentiment",
        revision="5dede457b5f58f45edcd71ccfe10953371c630ef",
    ),
    tokenizer=AutoTokenizer.from_pretrained(
        "gilf/english-yelp-sentiment",
        revision="5dede457b5f58f45edcd71ccfe10953371c630ef",
    ),
    device=0,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Multilingual BERT sentiment model: https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
# Model and tokenizer are pinned to the same revision hash; scores for every
# class are returned, not just the top one.
bert_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(
        "nlptown/bert-base-multilingual-uncased-sentiment",
        revision="f4067398d9230016de89fc62c43e4ba42c349c72",
    ),
    tokenizer=AutoTokenizer.from_pretrained(
        "nlptown/bert-base-multilingual-uncased-sentiment",
        revision="f4067398d9230016de89fc62c43e4ba42c349c72",
    ),
    device=0,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Twitter RoBERTa sentiment model: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
# Model and tokenizer are pinned to the same revision hash; scores for every
# class are returned, not just the top one.
twit_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(
        "cardiffnlp/twitter-roberta-base-sentiment",
        revision="c8c5458081108134d5b2e5fc2ab4215b677ed0b4",
    ),
    tokenizer=AutoTokenizer.from_pretrained(
        "cardiffnlp/twitter-roberta-base-sentiment",
        revision="c8c5458081108134d5b2e5fc2ab4215b677ed0b4",
    ),
    device=0,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [8]:
# XLNet SST-2 sentiment model: https://huggingface.co/textattack/xlnet-base-cased-SST-2
# (the *base* checkpoint is the one loaded here, not xlnet-large)
# Model and tokenizer are pinned to the same revision hash; scores for every
# class are returned, not just the top one.
xlnet_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(
        "textattack/xlnet-base-cased-SST-2",
        revision="9ceeb077dcd5cf5ae790572b2bd6aec755a263be",
    ),
    tokenizer=AutoTokenizer.from_pretrained(
        "textattack/xlnet-base-cased-SST-2",
        revision="9ceeb077dcd5cf5ae790572b2bd6aec755a263be",
    ),
    device=0,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/469M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/202 [00:00<?, ?B/s]

In [9]:
# T5 IMDB sentiment model: https://huggingface.co/mrm8488/t5-base-finetuned-imdb-sentiment
# This is a text-to-text model: it generates the sentiment as text rather than
# returning per-class scores. Model and tokenizer share one pinned revision.
imdb_all = pipeline(
    task="text2text-generation",
    model=AutoModelForSeq2SeqLM.from_pretrained(
        "mrm8488/t5-base-finetuned-imdb-sentiment",
        revision="d9d412418ff1a359b7783eeebd5b318791f00765",
    ),
    tokenizer=AutoTokenizer.from_pretrained(
        "mrm8488/t5-base-finetuned-imdb-sentiment",
        revision="d9d412418ff1a359b7783eeebd5b318791f00765",
    ),
    device=0,
)

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [10]:
# ALBERT SST-2 sentiment model: https://huggingface.co/textattack/albert-base-v2-SST-2
# Model and tokenizer are pinned to the same revision hash (the tokenizer is
# loaded with use_fast=False); scores for every class are returned.
albert_all = pipeline(
    task="sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(
        "textattack/albert-base-v2-SST-2",
        revision="96d7dedb92b3679c4f1ae69e7e77440d058d8602",
    ),
    tokenizer=AutoTokenizer.from_pretrained(
        "textattack/albert-base-v2-SST-2",
        use_fast=False,
        revision="96d7dedb92b3679c4f1ae69e7e77440d058d8602",
    ),
    device=0,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/732 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [12]:
# Stanza English pipeline running the tokenizer followed by the sentiment processor
stanza_top = stanza.Pipeline(
    lang='en',
    processors='tokenize,sentiment',
)

2021-09-29 09:58:05 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2021-09-29 09:58:05 INFO: Use device: gpu
2021-09-29 09:58:05 INFO: Loading: tokenize
2021-09-29 09:58:05 INFO: Loading: sentiment
2021-09-29 09:58:05 INFO: Done loading processors!


### Define pipeline for running sentiment analysis and charting output

In [13]:
# Output label names, in pipeline score order, for each score-returning model.
# Models with fewer classes are placed on the same LABEL_0..LABEL_4 scale
# (LABEL_0 = first class, LABEL_4 = last class, LABEL_2 = middle class).
_FIVE_CLASS_LABELS = ('LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4')
_THREE_CLASS_LABELS = ('LABEL_0', 'LABEL_2', 'LABEL_4')
_TWO_CLASS_LABELS = ('LABEL_0', 'LABEL_4')

_SCORE_LABELS = {
    "yelp": _FIVE_CLASS_LABELS,
    "bert": _FIVE_CLASS_LABELS,
    "twit": _THREE_CLASS_LABELS,
    "xlnet": _TWO_CLASS_LABELS,
    "albert": _TWO_CLASS_LABELS,
}

# Lazy lookups: the pipeline name is only resolved when that model is actually
# requested, so pipelines that were never loaded do not cause a NameError.
_SCORE_PIPELINES = {
    "yelp": lambda: yelp_all,
    "bert": lambda: bert_all,
    "twit": lambda: twit_all,
    "xlnet": lambda: xlnet_all,
    "albert": lambda: albert_all,
}

# Text generated by the imdb model -> label (anything else becomes LABEL_2)
_IMDB_LABELS = {"negative": "LABEL_0", "positive": "LABEL_4"}

# Stanza sentence sentiment code -> label
_STANZA_LABELS = {0: "LABEL_0", 1: "LABEL_2", 2: "LABEL_4"}


def _scores_to_series(class_scores, labels):
    """Turn one text's per-class score dicts into a Series with the winning label appended."""
    scores = pd.Series({label: class_scores[pos]["score"] for pos, label in enumerate(labels)})
    winner = pd.Series({'label_predict': scores.idxmax(), 'label_prob': scores.max()})
    # pd.concat replaces Series.append, which was removed in pandas 2.0
    return pd.concat([scores, winner])


def save_sentiment(text, model):
    """Run one sentiment model on a single string and return the result as a tidy Series.

    Args:
        text: String of text to be analyzed.
        model: One of "yelp", "bert", "twit", "xlnet", "albert", "imdb" or "stanza".

    Returns:
        pd.Series. For "yelp", "bert", "twit", "xlnet" and "albert": one entry per
        class score (LABEL_* keys), followed by 'label_predict' (the key with the
        highest score) and 'label_prob' (that score). For "imdb" and "stanza":
        only 'label_predict'. For "stanza" the label of the last sentence with a
        recognized sentiment code is used; it is None when there is no such
        sentence.

    Raises:
        ValueError: If `model` is not one of the supported names.
    """
    if model in _SCORE_LABELS:
        classifier = _SCORE_PIPELINES[model]()
        # The pipeline returns one list of {label, score} dicts per input text
        return _scores_to_series(classifier(text)[0], _SCORE_LABELS[model])

    if model == "imdb":
        generated = imdb_all(text)[0]["generated_text"]
        return pd.Series({'label_predict': _IMDB_LABELS.get(generated, "LABEL_2")})

    if model == "stanza":
        label = None
        for sentence in stanza_top(text).sentences:
            # An unrecognized sentiment code leaves the previous label in place
            label = _STANZA_LABELS.get(sentence.sentiment, label)
        return pd.Series({'label_predict': label})

    raise ValueError(
        "Unknown model %r; expected one of: yelp, bert, twit, xlnet, albert, imdb, stanza" % (model,)
    )
  

In [14]:
def get_sentiment(dataset, text_column, model):
    """Score every row of `dataset[text_column]` with `save_sentiment`.

    Prints the time spent scoring, then returns `dataset` with the columns
    produced by `save_sentiment` attached on the right.
    """
    started = time.time()
    scores = dataset[text_column].apply(save_sentiment, model=model)
    elapsed = time.time() - started
    print("--- %s seconds ---" % elapsed)
    return pd.concat([dataset, scores], axis=1)

In [15]:
def prediction_test(truevals, predictvals):
    """Plot a confusion matrix heatmap and print the standard classification metrics.

    Args:
        truevals: Dataframe column of true values.
        predictvals: Dataframe column of predicted values.

    Returns:
        None. Draws on the current matplotlib figure and prints the
        classification report.
    """
    # Sorted union of the observed classes. This is the order confusion_matrix
    # uses by default; it is made explicit here so the heatmap ticks can show
    # the class names instead of bare positions 0..k-1.
    class_labels = sorted(set(truevals) | set(predictvals))

    sns.set(font_scale=1.2, rc={'figure.figsize':(11.7,8.27)})
    cm = confusion_matrix(truevals, predictvals, labels=class_labels)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, cmap='Greens', fmt='g',
                xticklabels=class_labels, yticklabels=class_labels)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')

    print(classification_report(truevals, predictvals))

# Apply to the teacher recommendation dataset

### Apply the Sentiment Analysis algorithms to masked N2FL text data

In [16]:
# Score every sentence with the yelp model, then tag all columns with "_yelp"
# so they remain distinguishable once the per-model results are merged.
teacherrec_yelp = get_sentiment(dataset=subset, text_column="sentence_text", model="yelp")
teacherrec_yelp = teacherrec_yelp.add_suffix("_yelp")

--- 0.7965748310089111 seconds ---


In [17]:
# Score every sentence with the albert model, then tag all columns with "_albert"
teacherrec_albert = get_sentiment(dataset=subset, text_column="sentence_text", model="albert")
teacherrec_albert = teacherrec_albert.add_suffix("_albert")

--- 0.9753954410552979 seconds ---


In [18]:
# Score every sentence with the xlnet model, then tag all columns with "_xlnet"
teacherrec_xlnet = get_sentiment(dataset=subset, text_column="sentence_text", model="xlnet")
teacherrec_xlnet = teacherrec_xlnet.add_suffix("_xlnet")

--- 1.2450222969055176 seconds ---


In [19]:
# Stanza can't take empty rows, so only sentences with at least one character
# are passed on to it (as an independent copy of those rows).
stanza_prep = subset[subset["sentence_text"].str.len() >= 1].copy()

In [20]:
# Score the non-empty sentences with stanza, then tag all columns with "_stanza"
teacherrec_stanza = get_sentiment(dataset=stanza_prep, text_column="sentence_text", model="stanza")
teacherrec_stanza = teacherrec_stanza.add_suffix("_stanza")

--- 0.7307009696960449 seconds ---


In [21]:
# Score every sentence with the bert model, then tag all columns with "_bert"
teacherrec_bert = get_sentiment(dataset=subset, text_column="sentence_text", model="bert")
teacherrec_bert = teacherrec_bert.add_suffix("_bert")

--- 0.7857115268707275 seconds ---


In [22]:
# Score every sentence with the twit model, then tag all columns with "_twit"
teacherrec_twit = get_sentiment(dataset=subset, text_column="sentence_text", model="twit")
teacherrec_twit = teacherrec_twit.add_suffix("_twit")

--- 0.8151977062225342 seconds ---


In [23]:
# The imdb model is fed ASCII-only text: non-ASCII characters are dropped from
# each sentence in a copy of the data, leaving `subset` itself untouched.
imdb_prep = subset.assign(
    sentence_text=subset["sentence_text"].str.encode("ascii", "ignore").str.decode("ascii")
)

In [24]:
# Score the ASCII-cleaned sentences with the imdb model, then tag all columns with "_imdb"
teacherrec_imdb = get_sentiment(dataset=imdb_prep, text_column="sentence_text", model="imdb")
teacherrec_imdb = teacherrec_imdb.add_suffix("_imdb")

--- 2.6876425743103027 seconds ---


### Join sentiment scores back into the main dataset

In [25]:
# Start the combined table: attach the yelp scores to the base rows through the
# "index" key (checked to be one-to-one), then drop the suffixed duplicates of
# the base columns that came along with the scores.
teacherrec_analyze = (
    subset
    .merge(teacherrec_yelp, how="inner", left_on="index", right_on="index_yelp", validate="1:1")
    .drop(columns=["sentence_text_yelp", "index_yelp", "letterid_yelp"])
)

In [26]:
# Add the xlnet scores (one-to-one on "index") and drop the duplicated base columns
teacherrec_analyze = (
    teacherrec_analyze
    .merge(teacherrec_xlnet, how="inner", left_on="index", right_on="index_xlnet", validate="1:1")
    .drop(columns=["sentence_text_xlnet", "index_xlnet", "letterid_xlnet"])
)

In [27]:
# Add the albert scores (one-to-one on "index") and drop the duplicated base columns
teacherrec_analyze = (
    teacherrec_analyze
    .merge(teacherrec_albert, how="inner", left_on="index", right_on="index_albert", validate="1:1")
    .drop(columns=["sentence_text_albert", "index_albert", "letterid_albert"])
)

In [28]:
# Add the stanza labels (one-to-one on "index") and drop the duplicated base
# columns. teacherrec_stanza only holds rows with non-empty text, so this
# inner join also removes the empty-text rows from the combined table.
teacherrec_analyze = (
    teacherrec_analyze
    .merge(teacherrec_stanza, how="inner", left_on="index", right_on="index_stanza", validate="1:1")
    .drop(columns=["sentence_text_stanza", "index_stanza", "letterid_stanza"])
)

In [29]:
# Add the bert scores (one-to-one on "index") and drop the duplicated base columns
teacherrec_analyze = (
    teacherrec_analyze
    .merge(teacherrec_bert, how="inner", left_on="index", right_on="index_bert", validate="1:1")
    .drop(columns=["sentence_text_bert", "index_bert", "letterid_bert"])
)

In [30]:
# Add the twit scores (one-to-one on "index") and drop the duplicated base columns
teacherrec_analyze = (
    teacherrec_analyze
    .merge(teacherrec_twit, how="inner", left_on="index", right_on="index_twit", validate="1:1")
    .drop(columns=["sentence_text_twit", "index_twit", "letterid_twit"])
)

In [31]:
# Add the imdb labels (one-to-one on "index") and drop the duplicated base columns
teacherrec_analyze = (
    teacherrec_analyze
    .merge(teacherrec_imdb, how="inner", left_on="index", right_on="index_imdb", validate="1:1")
    .drop(columns=["sentence_text_imdb", "index_imdb", "letterid_imdb"])
)

In [32]:
# Display (rows, columns) of the combined table as a sanity check after the merges
teacherrec_analyze.shape

(99, 32)

In [33]:
# Show a reproducible random sample of 10 rows (fixed seed) for a visual check
teacherrec_analyze.sample(n=10, random_state=1234)

Unnamed: 0,invitationid,sentence_text,index,LABEL_0_yelp,LABEL_1_yelp,LABEL_2_yelp,LABEL_3_yelp,LABEL_4_yelp,label_predict_yelp,label_prob_yelp,...,LABEL_3_bert,LABEL_4_bert,label_predict_bert,label_prob_bert,LABEL_0_twit,LABEL_2_twit,LABEL_4_twit,label_predict_twit,label_prob_twit,label_predict_imdb
39,4579.0,"In twenty years, <NAME> is someone we will be ...",40,0.002651,0.002115,0.006538,0.084025,0.904671,LABEL_4,0.904671,...,0.183271,0.783906,LABEL_4,0.783906,0.001134,0.019604,0.979262,LABEL_4,0.979262,LABEL_4
35,4579.0,"Her experiences in DECA, Speech Team, Student ...",36,0.002401,0.00114,0.002212,0.024717,0.96953,LABEL_4,0.96953,...,0.384946,0.49633,LABEL_4,0.49633,0.001699,0.256818,0.741482,LABEL_4,0.741482,LABEL_4
64,4598.0,"<NAME> loves learning and education, and whate...",65,0.001397,0.001359,0.006574,0.067656,0.923014,LABEL_4,0.923014,...,0.47821,0.432613,LABEL_3,0.47821,0.004148,0.072997,0.922855,LABEL_4,0.922855,LABEL_4
74,4616.0,His assignments were always completed on time ...,75,0.000594,0.000898,0.009597,0.172629,0.816282,LABEL_4,0.816282,...,0.315221,0.635724,LABEL_4,0.635724,0.005954,0.143479,0.850567,LABEL_4,0.850567,LABEL_4
55,4598.0,"For instance, one of her favorite projects was...",56,0.092135,0.219371,0.318005,0.231076,0.139413,LABEL_2,0.318005,...,0.325371,0.342058,LABEL_4,0.342058,0.001884,0.083034,0.915082,LABEL_4,0.915082,LABEL_4
87,4616.0,"He is certainly deserving, and I cannot imagin...",88,0.001734,0.001534,0.005647,0.049097,0.941989,LABEL_4,0.941989,...,0.246759,0.711338,LABEL_4,0.711338,0.032989,0.117567,0.849443,LABEL_4,0.849443,LABEL_4
57,4598.0,They had to communicate the idea primarily wit...,58,0.484267,0.382999,0.121419,0.008371,0.002944,LABEL_0,0.484267,...,0.067,0.012075,LABEL_1,0.41841,0.111573,0.833518,0.054909,LABEL_2,0.833518,LABEL_4
27,4579.0,"Needless to say, when she appeared on my Advan...",28,0.176622,0.160322,0.196781,0.174645,0.29163,LABEL_4,0.29163,...,0.187268,0.099072,LABEL_1,0.295806,0.003098,0.062289,0.934613,LABEL_4,0.934613,LABEL_4
75,4616.0,"Most impressively, he has taken and passed the...",76,0.001393,0.003233,0.03172,0.271351,0.692302,LABEL_4,0.692302,...,0.24831,0.712797,LABEL_4,0.712797,0.003804,0.105915,0.890281,LABEL_4,0.890281,LABEL_4
82,4616.0,<NAME> is also on our school's cross-country t...,83,0.051705,0.027155,0.060549,0.170278,0.690313,LABEL_4,0.690313,...,0.38154,0.458656,LABEL_4,0.458656,0.004868,0.749056,0.246076,LABEL_2,0.749056,LABEL_4


### Output the dataset

In [9]:
#filepath = rootset + '/output_%s' % suffixset

In [11]:
#folderchecker = os.path.exists(filepath)
#if folderchecker==False:
#    os.mkdir(filepath)

In [3]:
#filepath = filepath + '/06_sentiment_prep_%s.csv' % currentarray
#filepath = rootset + '/06_sentiment_prep_%s.csv' % currentarray

In [4]:
# Output path: <rootset>/06_<datagroupset>_sentiment_prep_<currentarray>.csv
# (one file per array job, all written to the root directory)
filepath = "".join([
    rootset,
    '/06_%s_' % datagroupset,
    'sentiment_prep_%s.csv' % currentarray,
])

In [47]:
# Write this job's combined sentiment results to its CSV file
teacherrec_analyze.to_csv(path_or_buf=filepath)