## Imports

In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


## Get Model Info

In [2]:
#MODEL1 = "cardiffnlp/twitter-roberta-base-emoji"
MODEL2 = "cardiffnlp/twitter-roberta-base-emotion"
MODEL3 = "cardiffnlp/twitter-roberta-base-hate"
MODEL4 = "cardiffnlp/twitter-roberta-base-irony"
MODEL5 = "cardiffnlp/twitter-roberta-base-offensive"
MODEL6 = "cardiffnlp/twitter-roberta-base-sentiment-latest"

#model1 = AutoModelForSequenceClassification.from_pretrained(MODEL1)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL2)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL3)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL4)
model5 = AutoModelForSequenceClassification.from_pretrained(MODEL5)
model6 = AutoModelForSequenceClassification.from_pretrained(MODEL6)

#tokenizer1 = AutoTokenizer.from_pretrained(MODEL1)
tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
tokenizer3 = AutoTokenizer.from_pretrained(MODEL3)
tokenizer4 = AutoTokenizer.from_pretrained(MODEL4)
tokenizer5 = AutoTokenizer.from_pretrained(MODEL5)
tokenizer6 = AutoTokenizer.from_pretrained(MODEL6)

Downloading: 100%|██████████████████████████████| 768/768 [00:00<00:00, 113kB/s]
Downloading: 100%|███████████████████████████| 476M/476M [01:28<00:00, 5.65MB/s]
Downloading: 100%|██████████████████████████████| 588/588 [00:00<00:00, 170kB/s]
Downloading: 100%|███████████████████████████| 476M/476M [01:26<00:00, 5.76MB/s]
Downloading: 100%|██████████████████████████████| 589/589 [00:00<00:00, 213kB/s]
Downloading: 100%|███████████████████████████| 476M/476M [01:27<00:00, 5.69MB/s]
Downloading: 100%|██████████████████████████████| 593/593 [00:00<00:00, 227kB/s]
Downloading: 100%|███████████████████████████| 476M/476M [01:28<00:00, 5.65MB/s]
Downloading: 100%|██████████████████████████████| 929/929 [00:00<00:00, 300kB/s]
Downloading: 100%|███████████████████████████| 478M/478M [01:30<00:00, 5.56MB/s]
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'robe

In [3]:
mapping_link1 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emoji/mapping.txt"
with urllib.request.urlopen(mapping_link1) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels1 = [row[1] for row in csvreader if len(row) > 1]

mapping_link2 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/mapping.txt"
with urllib.request.urlopen(mapping_link2) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels2 = [row[1] for row in csvreader if len(row) > 1]

mapping_link3 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/hate/mapping.txt"
with urllib.request.urlopen(mapping_link3) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels3 = [row[1] for row in csvreader if len(row) > 1]

mapping_link4 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/irony/mapping.txt"
with urllib.request.urlopen(mapping_link4) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels4 = [row[1] for row in csvreader if len(row) > 1]

mapping_link5 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/offensive/mapping.txt"
with urllib.request.urlopen(mapping_link5) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels5 = [row[1] for row in csvreader if len(row) > 1]

mapping_link6 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
with urllib.request.urlopen(mapping_link6) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels6 = [row[1] for row in csvreader if len(row) > 1]

## Get Data

In [144]:
data = pd.read_csv("PATH TO RAW CSV FILE FROM SNSCRAPE")
#tweets_df = pd.DataFrame(data)
#tweets_df.head(5)

## Define Functions

In [3]:
def preprocess(df):
    tweets = [i for i in df["Text"]]
    preprocessed_tweets = []
    
    for tweet in tweets:
        new_text=[]
        
        for t in tweet.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            t = 'http' if '\nhttp' in t else t 
            new_text.append(t)
            
        preprocessed_tweets.append(" ".join(new_text))
    return preprocessed_tweets

In [127]:
def method2_processing(df, prep_df, append=False):
    score_list = []
    dofi = df
    
    for text in prep_df:
        encoded_input = tokenizer2(text, return_tensors='pt')
        output = model2(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score_list.append(scores)

    ranking = np.argsort(score_list[0])
    ranking = ranking[::-1]
    results = {"anger": [], "sadness": [], "optimism": [], "joy": []}

    for count, tweet_score in enumerate(score_list):
        for i in range(tweet_score.shape[0]):
            l = labels2[ranking[i]]
            s = tweet_score[ranking[i]]
            results[f"{l}"].append(np.round(float(s), 4))

    if append == True:
        
        dofi["joy"] = results["joy"]
        dofi["optimism"] = results["optimism"]
        dofi["anger"] = results["anger"]
        dofi["sadness"] = results["sadness"]
        
        return dofi
    
    return results                                 

In [128]:
def method3_processing(df, prep_df, append=False):
    score_list = []
    dofi = df
    
    for text in prep_df:
        encoded_input = tokenizer3(text, return_tensors='pt')
        output = model3(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score_list.append(scores)

    ranking = np.argsort(score_list[0])
    ranking = ranking[::-1]
    results = {"not-hate": [], "hate": []}

    for count, tweet_score in enumerate(score_list):
        for i in range(tweet_score.shape[0]):
            l = labels3[ranking[i]]
            s = tweet_score[ranking[i]]
            results[f"{l}"].append(np.round(float(s), 4))

    if append == True:
        
        dofi["not-hate"] = results["not-hate"]
        dofi["hate"] = results["hate"]
        
        return dofi
    
    return results                                 

In [129]:
def method4_processing(df, prep_df, append=False):
    score_list = []
    dofi = df
    
    for text in prep_df:
        encoded_input = tokenizer4(text, return_tensors='pt')
        output = model4(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score_list.append(scores)

    ranking = np.argsort(score_list[0])
    ranking = ranking[::-1]
    results = {"non_irony": [], "irony": []}

    for count, tweet_score in enumerate(score_list):
        for i in range(tweet_score.shape[0]):
            l = labels4[ranking[i]]
            s = tweet_score[ranking[i]]
            results[f"{l}"].append(np.round(float(s), 4))

    if append == True:
        
        dofi["not_irony"] = results["non_irony"]
        dofi["irony"] = results["irony"]
        
        return dofi
    
    return results 

In [130]:
def method5_processing(df, prep_df, append=False):
    score_list = []
    dofi = df
    
    for text in prep_df:
        encoded_input = tokenizer5(text, return_tensors='pt')
        output = model5(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score_list.append(scores)

    ranking = np.argsort(score_list[0])
    ranking = ranking[::-1]
    results = {"not-offensive": [], "offensive": []}

    for count, tweet_score in enumerate(score_list):
        for i in range(tweet_score.shape[0]):
            l = labels5[ranking[i]]
            s = tweet_score[ranking[i]]
            results[f"{l}"].append(np.round(float(s), 4))

    if append == True:
        
        dofi["not_offensive"] = results["not-offensive"]
        dofi["offensive"] = results["offensive"]
        
        return dofi
    
    return results 

In [131]:
def method6_processing(df, prep_df, append=False):
    score_list = []
    dofi = df
    
    for text in prep_df:
        encoded_input = tokenizer6(text, return_tensors='pt')
        output = model6(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score_list.append(scores)

    ranking = np.argsort(score_list[0])
    ranking = ranking[::-1]
    results = {"positive": [], "neutral": [], "negative": []}

    for count, tweet_score in enumerate(score_list):
        for i in range(tweet_score.shape[0]):
            l = labels6[ranking[i]]
            s = tweet_score[ranking[i]]
            results[f"{l}"].append(np.round(float(s), 4))

    if append == True:
        
        dofi["positive"] = results["positive"]
        dofi["neutral"] = results["neutral"]
        dofi["negative"] = results["negative"]
        
        return dofi
    
    return results 

In [132]:
def tweet_RoBERTa_processing(df):
    prep_df = preprocess(df)
    
    m2 = method2_processing(df, prep_df, True)
    m3 = method3_processing(m2, prep_df, True)
    m4 = method4_processing(m3, prep_df, True)
    m5 = method5_processing(m4, prep_df, True)
    m6 = method6_processing(m5, prep_df, True)
    
    return m6

## Process Your Data

In [145]:
processed_df = tweet_RoBERTa_processing(tweets_df)
processed_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dofi["joy"] = results["joy"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dofi["optimism"] = results["optimism"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dofi["anger"] = results["anger"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Like Count,joy,optimism,anger,sadness,not-hate,hate,not_irony,irony,not_offensive,offensive,positive,neutral,negative
0,0,2022-06-07 02:12:00+00:00,1533995157858795525,"Biden blasted for ‘monumental disaster’ as 11,...",Daily_Express,3,0.0147,0.0611,0.4864,0.4378,0.8953,0.1047,0.2895,0.7105,0.7107,0.2893,0.0131,0.3838,0.6031
1,1,2022-06-06 11:40:05+00:00,1533775732345823232,"More than 45,000 Americans apply to sponsor Uk...",MailOnline,17,0.3515,0.4267,0.1159,0.1059,0.9821,0.0179,0.082,0.918,0.8914,0.1086,0.2166,0.7758,0.0076
2,2,2022-06-05 03:29:56+00:00,1533289994584002561,Anger as hundreds of refugee children from Ukr...,MailOnline,16,0.009,0.0191,0.8268,0.1451,0.9758,0.0242,0.1763,0.8237,0.8,0.2,0.0048,0.1251,0.8701
3,3,2022-06-04 00:40:42+00:00,1532885015918411777,"More than 10,000 migrants have already crossed...",MailOnline,17,0.0449,0.2478,0.1538,0.5535,0.8377,0.1623,0.3815,0.6185,0.7943,0.2057,0.037,0.8017,0.1613
4,4,2022-06-03 23:30:07+00:00,1532867253988884485,This week marked one hundred days since Russia...,MailOnline,16,0.337,0.1641,0.3206,0.1783,0.9676,0.0324,0.2552,0.7448,0.7987,0.2013,0.0175,0.6905,0.292


In [5]:
pd.read_csv('18_19_all_tweets.csv')

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text
0,0,2019-07-27 23:42:06+00:00,1155262113839046661,Mother-to-be is 'heartbroken' after a private ...
1,1,2019-07-27 23:37:24+00:00,1155260929900195841,The Chinese village reclaimed by nature: Remot...
2,2,2019-07-27 23:34:56+00:00,1155260311345270784,Disney actress who voiced Minnie Mouse and mar...
3,3,2019-07-27 23:29:00+00:00,1155258816679690240,'My Birthday boy' Louise Redknapp reunites wit...
4,4,2019-07-27 23:23:00+00:00,1155257306784948225,'I miss him' Debbie McGee reveals how she ‘got...
...,...,...,...,...
247956,247956,2018-07-28 00:22:01+00:00,1023000589750464512,These are the WARNING signs your baby is dange...
247957,247957,2018-07-28 00:21:04+00:00,1023000351593652224,Mysterious tar decorations scrawled on the bon...
247958,247958,2018-07-28 00:09:00+00:00,1022997313021771776,Best supplements for high blood pressure: 2p a...
247959,247959,2018-07-28 00:02:00+00:00,1022995550994669569,Brit mum’s tearful call home as she swam for h...


In [6]:
pd.read_csv('19_20_all_tweets.csv')

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text
0,0,2020-07-27 23:52:00+00:00,1287898563687510016,#Outlander’s Sam Heughan announces new project...
1,1,2020-07-27 23:46:02+00:00,1287897064983220224,Megan Barton Hanson sends temperatures soaring...
2,2,2020-07-27 23:45:00+00:00,1287896801916399622,Emily Maitlis' Newsnight co-star admits they '...
3,3,2020-07-27 23:44:00+00:00,1287896550371258368,What is this 'black cube' seen near the Sun? I...
4,4,2020-07-27 23:42:50+00:00,1287896259408338948,Britain’s crumbling jails struggle to cope wit...
...,...,...,...,...
89155,89155,2020-04-07 22:11:00+00:00,1247648093102698496,🐰 COMPETITION ALERT 🐰\n\nOnly 1 day left to en...
89156,89156,2020-04-07 22:10:00+00:00,1247647841541070850,You won't be able to look away from this massi...
89157,89157,2020-04-07 22:07:00+00:00,1247647086448758784,‘It was bound to happen’ This Morning host Phi...
89158,89158,2020-04-07 22:02:48+00:00,1247646029769576449,Britney Spears shares sexy snaps of toned tumm...
