# Introduction
The purpose of this notebook is to compare several approaches to sentiment analysis, choose the best one, and save the scores from the best model for use in the subsequent network analysis.

In [None]:
# Make Google Drive visible to this Colab runtime; the data files read and
# written below all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Install and import libraries

Let's start by importing some libraries that will help with an analysis of Twitter data

In [None]:
!pip install transformers
#!pip install langdetect
#!pip install pycountry
#!pip install emoji
#!python -m spacy download en_core_web_sm
#!conda info

In [None]:
# Import Libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import nltk
# Download the VADER lexicon, presumably required by nltk's SentimentIntensityAnalyzer used below
# (other datasets may need additional nltk resources).
nltk.download('vader_lexicon')
import spacy
nlp = spacy.load('en_core_web_sm')           # NOTE(review): 'en_core_web_sm' appears to be spaCy's small English pipeline, not a larger word-vector model; `nlp` is not used in the cells visible here - confirm it is still needed
import re
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
print('imports complete')

Confirm GPU/high RAM 

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory of the runtime in (decimal) gigabytes. Note that this is
# `.total`, even though the message below words it as "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are treated as "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

# Load cleansed data
Use data already labelled with Piper typology predictions

In [None]:
# Load the cleansed tweets that already carry the Piper typology predictions.
# NOTE(review): read_pickle can execute arbitrary code; only load pickles from a trusted source.
typology_pickle_path = '/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/cleaned_tweets_large_Piper_typology.pickle'
offline_tweets_df = pd.read_pickle(typology_pickle_path)

# Columns shown in the preview below.
# NOTE(review): the preview shows 'text3' while the scoring cells read 'text2' - confirm which column is intended.
preview_columns = ['id','created_at','user_id','text3','tweet category','is_retweet','is_quote_status','user_descr']
display(offline_tweets_df[preview_columns])

# Sentiment Analysis

Sentiment analysis scoring using VADER and TextBlob

In [None]:
#Sentiment Analysis
# Scores every tweet in `offline_tweets_df.text2` with TextBlob (polarity) and
# VADER (neg / neu / pos / compound) and tallies a coarse category per tweet.
# Results are kept in parallel lists, one entry per tweet, in row order.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm.auto import tqdm

def percentage(part,whole):
    """Return `part` expressed as a percentage of `whole`."""
    return 100 * float(part)/float(whole)

tweet_texts = offline_tweets_df.text2.tolist()
num_tweets = len(tweet_texts)
positive = 0
negative = 0
neutral = 0
polarity = 0          # running sum of the TextBlob polarities
category = [''] * num_tweets
TBpol = [0] * num_tweets
neg = [0] * num_tweets
pos = [0] * num_tweets
neu = [0] * num_tweets
comp = [0] * num_tweets

# Build the VADER analyzer once; constructing it inside the loop re-reads the
# lexicon for every single tweet.
vader = SentimentIntensityAnalyzer()

# Iterate by position (enumerate) rather than by DataFrame index label, so the
# result lists are filled correctly even if the index is not 0..n-1.
for position, tweet_text in enumerate(tqdm(tweet_texts)):
    tb_polarity = TextBlob(tweet_text).sentiment.polarity
    TBpol[position] = tb_polarity
    polarity += tb_polarity

    Vpol = vader.polarity_scores(tweet_text)
    neg[position] = Vpol['neg']
    neu[position] = Vpol['neu']
    pos[position] = Vpol['pos']
    comp[position] = Vpol['compound']

    # Category is decided only by comparing the VADER neg and pos shares.
    if neg[position] > pos[position]:
        category[position] = 'neg'
        negative += 1
    elif pos[position] > neg[position]:
        category[position] = 'pos'
        positive += 1
    else:
        category[position] = 'neu'
        neutral += 1

#Number of Tweets (Total, Positive, Negative, Neutral)
print("Total Tweets: ", num_tweets)
print("positive number: ", positive)
print("negative number: ", negative)
print("neutral number: ", neutral)

In [None]:
# Side-by-side view of each tweet with its VADER category/scores and TextBlob polarity.
pd.DataFrame({
    'tweet': list(offline_tweets_df.text2),
    'cat': category,
    'neg': neg,
    'pos': pos,
    'neu': neu,
    'comp': comp,
    'TBpol': TBpol,
})


Now score using Hugging Face transformers base sentiment-analysis

In [None]:
from transformers import pipeline

In [None]:
# score in batches
# Runs the default Hugging Face "sentiment-analysis" pipeline over every tweet
# in `offline_tweets_df.text2` and collects the per-tweet results in `a`.
from tqdm.auto import tqdm
num_steps = offline_tweets_df.shape[0]
sz = 141
# 16638 / 141 = 118 exactly (the loop below also copes with a short last batch)

# Build the pipeline once; creating it inside the loop reloads the model for
# every batch. device=0 selects the first GPU.
hf_sentiment = pipeline("sentiment-analysis", device=0)

tweet_texts = offline_tweets_df.text2.tolist()
progress_bar = tqdm(total=num_steps)

b = []
for i in range(0, num_steps, sz):
    batch = tweet_texts[i:i + sz]          # slicing clamps at the end of the list
    b.extend(hf_sentiment(batch))
    progress_bar.update(len(batch))        # advance by the real batch size so the bar ends at 100%
progress_bar.close()

a = pd.DataFrame(b)

In [None]:
# Turn the pipeline confidence into a signed score: NEGATIVE rows become
# negative, all other rows stay positive. Working from the absolute value makes
# the cell idempotent - re-running it no longer flips NEGATIVE rows back to
# positive.
is_negative = a['label'] == 'NEGATIVE'
score_magnitude = a['score'].abs()
a['score'] = score_magnitude.where(~is_negative, -score_magnitude)
a

In [None]:
# Collect the VADER, TextBlob and Hugging Face scores for every tweet in one frame.
temp = pd.DataFrame({
    'tweet': list(offline_tweets_df.text2),
    'cat': category,
    'neg': neg,
    'pos': pos,
    'neu': neu,
    'comp': comp,
    'TBpol': TBpol,
    'hfs': a.score,
})
display(temp)

In [None]:
# Checkpoint the intermediate scores to Drive; a later cell reads this file back.
temp_sentiment_path = '/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/temp_sentiment.pickle'
temp.to_pickle(temp_sentiment_path)

Examine the sensitivity of the Hugging Face sentiment score to various inputs

In [None]:
# Probe how the default sentiment pipeline reacts to mentions, hashtags, links,
# "RT" prefixes and misspelt sentiment words. Each string is one separate input.
# (A comma was missing after the second "Follow up with ..." string, which made
# Python silently concatenate it with the following "RT thehill: ..." string.)
probe_texts = [
    'this really sux','really, why, isnt it working?',
    'Follow up with @senrobportman on how NOT to fight. He fights for bipartisanship when #GOP is in charge, but is SHOCKED when Democrats go it alone. This huge infrastructure bill could put Dems on defense, but #Swamp doesn’t want the fight.  https://t.co/jb4CY7SaAA',
    'Follow up with on how NOT to fight. He fights for bipartisanship when is in charge, but is SHOCKED when Democrats go it alone. This huge infrastructure bill could put Dems on defense, but doesn’t want the fight.',
    'RT thehill: Senate passes bipartisan $35 billion water infrastructure bill https://t.co/Cy2Be1ECXr https://t.co/YEdAd89eT5',
    'Senate passes bipartisan $35 billion water infrastructure bill  https://t.co/Cy2Be1ECXr https://t.co/YEdAd89eT5',
    'RT @thehill: Senate passes bipartisan $35 billion water infrastructure bill',
    'RT thehill: Senate passes bipartisan $35 billion water infrastructure bill',
    'RT Senate passes bipartisan $35 billion water infrastructure bill',
    'Senate passes bipartisan $35 billion water infrastructure bill',
    'Senate passes terrible bipartisan $35 billion water infrastructure bill',
    'Senate passes trrble bipartisan $35 billion water infrastructure bill',
    'Senate passes fantastic bipartisan $35 billion water infrastructure bill',
    'Senate passes fntastic bipartisan $35 billion water infrastructure bill',
    ]
pipeline("sentiment-analysis")(probe_texts)

# RoBERTa sentiment analysis

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

Substitute `@user` and `http` for user references and links

In [None]:
from bs4 import BeautifulSoup
# User mentions: '@' followed by letters, digits or underscores. The underscore
# is included so that a handle such as @john_doe is replaced as a whole instead
# of leaving a dangling "_doe" behind.
pat1 = r'@[A-Za-z0-9_]+'
# Links: http:// or https:// followed by letters, digits, '.' and '/'.
pat2 = r'https?://[A-Za-z0-9./]+'

def preprocess(text, pat1repl='@user', pat2repl='http'):
    """
    Replace user mentions and links in a tweet with placeholder tokens.
    :param text: the tweet text
    :param pat1repl: replacement for every user mention (default '@user')
    :param pat2repl: replacement for every link (default 'http')
    :return: the text with mentions and links replaced and newlines removed
    """
    stripped = re.sub(pat1, pat1repl, text)
    stripped = re.sub(pat2, pat2repl, stripped)
    # Newlines are deleted outright (not replaced by a space).
    stripped = re.sub(r'\n', '', stripped)
    return stripped

# Placeholder-substituted copy of every tweet, used as input to the RoBERTa model.
text = offline_tweets_df['text2'].map(preprocess)

Function to convert logits to probabilities

In [None]:
# from https://stackoverflow.com/questions/43290138/softmax-function-of-a-numpy-array-by-row/68350608#68350608
def arrsoftmax(a, axis=None):
    """
    Softmax of `a`, evaluated as exp(a - logsumexp(a)) via scipy's logsumexp.
    :param a: ndarray/tensor
    :param axis: axis along which to normalise; default (None) normalises over all elements
    :return: array shaped like `a` whose entries sum to 1 along `axis`
    """
    from scipy.special import logsumexp
    # Keep the reduced axis (as length 1) only when a specific axis was
    # requested, so the subtraction below broadcasts row by row.
    keep_reduced_axis = axis is not None
    log_normaliser = logsumexp(a, axis=axis, keepdims=keep_reduced_axis)
    return np.exp(a - log_normaliser)

In [None]:
# Identifier of the pretrained checkpoint to load.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# NOTE(review): presumably the class order of the model's three outputs - confirm against the model card.
labels = ['negative', 'neutral', 'positive']

#may need to delete existing cardifffnlp folder for this to run!
tokenizer = AutoTokenizer.from_pretrained(MODEL, num_labels=3)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Optional: keep a local copy of the weights.
#model.save_pretrained(MODEL)


In [None]:
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
# Score every preprocessed tweet with the RoBERTa model, in small batches, and
# collect the class probabilities in `res` (one row per tweet, columns 0..2).
from tqdm.auto import tqdm
num_tweets = len(text)

progress_bar = tqdm(total=num_tweets)

step = 10
score_batches = []

model.eval()
# Inference only: without no_grad() every forward pass records an autograd
# graph, which wastes time and GPU memory.
with torch.no_grad():
    for i in range(0, num_tweets, step):
        batch = text.iloc[i:i + step].to_list()     # positional slice; clamps at the end
        encoded_input = tokenizer(batch, return_tensors='pt', padding=True).to(device)
        output = model(**encoded_input)
        betas = output.logits.cpu().numpy()         # beta values
        scores = arrsoftmax(betas, axis=1)          # convert to 0-1 prob range, by row
        score_batches.append(scores)
        progress_bar.update(len(batch))             # real batch size, so the bar ends at 100%
progress_bar.close()

# Build the frame once instead of concatenating inside the loop (quadratic).
# The resulting index is a clean 0..n-1 range rather than 0..9 repeated per batch.
if score_batches:
    res = pd.DataFrame(np.vstack(score_batches))
else:
    res = pd.DataFrame(columns=range(model.config.num_labels), dtype=float)

## Combine results

In [None]:
# Label the three probability columns produced by the RoBERTa scoring loop.
roberta_score_columns = ['cdf_neg', 'cdf_neu','cdf_pos']
res.columns = roberta_score_columns
res

In [None]:
# Reload the checkpointed VADER / TextBlob / Hugging Face scores from Drive.
temp_sentiment_path = '/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/temp_sentiment.pickle'
temp = pd.read_pickle(temp_sentiment_path)

In [None]:
# Copy the RoBERTa probabilities into `temp` by position (as plain lists), so
# the index of `res` plays no role in the alignment.
for score_column in ('cdf_neg', 'cdf_neu', 'cdf_pos'):
    temp[score_column] = list(res[score_column])
display(temp)

In [None]:
# Join the sentiment scores onto the original tweets, matching rows by index label.
# NOTE(review): this is an inner join on the index - confirm that both frames share the same index.
new_df = offline_tweets_df.merge(temp, left_index=True, right_index=True)
new_df.tail()

In [None]:
# Save the tweets together with all sentiment scores.
sentiment_pickle_path = '/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/cleaned_tweets_large_Piper_sentiment.pickle'
new_df.to_pickle(sentiment_pickle_path)