# Coronavirus Tweet Sentiment Classification

Data comes from Kaggle: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  return f(*args, **kwds)


# Load and Preprocess Data

In [9]:
# Read the tweet CSV and normalise every column name to lowercase in one chain.
data_fp = 'covid_tweets.csv'
df = pd.read_csv(data_fp).rename(columns=str.lower)
df.shape

(3798, 6)

In [10]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,username,screenname,location,tweetat,originaltweet,sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [11]:
# Class balance: how many tweets fall under each sentiment string.
df['sentiment'].value_counts()

Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: sentiment, dtype: int64

In [12]:
# Create a numeric label by collapsing the five sentiment strings into three
# classes: negative -> -1, neutral -> 0, positive -> 1.
sentiment_to_label = {
    'Extremely Negative': -1,
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1,
    'Extremely Positive': 1,
}
labels = df['sentiment'].map(sentiment_to_label)

# np.select falls back to 0 for rows matching no condition, which would silently
# label missing/unexpected sentiments as neutral. Fail loudly instead.
unmapped = df.loc[labels.isna(), 'sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"unexpected sentiment values: {list(unmapped)}")

df['label'] = labels.astype(int)
df.label.value_counts()

-1    1633
 1    1546
 0     619
Name: label, dtype: int64

In [13]:
# Preview again to check the `label` column next to `sentiment`.
df.head()

Unnamed: 0,username,screenname,location,tweetat,originaltweet,sentiment,label
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,-1
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive,1
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive,1
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative,-1
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,0


In [22]:
# Keep only the columns the classifier needs, exposing the tweet body as `text`.
clf_cols = ['originaltweet', 'label']
df2 = df.loc[:, clf_cols].rename(columns={'originaltweet': 'text'})
df2.head()

Unnamed: 0,text,label
0,TRENDING: New Yorkers encounter empty supermar...,-1
1,When I couldn't find hand sanitizer at Fred Me...,1
2,Find out how you can protect yourself and love...,1
3,#Panic buying hits #NewYork City as anxious sh...,-1
4,#toiletpaper #dunnypaper #coronavirus #coronav...,0


# BERT Classification Tutorial

This section follows the Hugging Face sequence classification guide: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [14]:
from transformers import AutoTokenizer
# Load the tokenizer published under the "distilbert-base-uncased" checkpoint name.
# NOTE(review): the first call appears to download the tokenizer files, so it
# needs network access — confirm whether a local cache is expected.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [51]:
def preprocess_function(tweet_text):
    """Tokenize tweet text with the notebook-level `tokenizer`.

    Args:
        tweet_text: Text forwarded unchanged as the tokenizer's first argument.

    Returns:
        Whatever `tokenizer` returns for that text when called with
        `truncation=True`.
    """
    encoded = tokenizer(tweet_text, truncation=True)
    return encoded

In [40]:
# Pick one tweet to inspect how the tokenizer encodes it. The draw is seeded so
# a fresh Restart & Run All shows the same sample every time (42 is arbitrary).
sample = df2['text'].sample(1, random_state=42).iloc[0]
# The tokenizer is called directly here, with no truncation argument passed.
tok_samp = tokenizer(sample)
print(f"sample:\n{sample}\n\ntokenized sample:\n{tok_samp}")

sample:
So, where was all this persecuting price gougers during the bitcoin mining gouging of video card prices? ?? #gaming #bitcoinmining #Covid_19 #coronavirus

tokenized sample:
{'input_ids': [101, 2061, 1010, 2073, 2001, 2035, 2023, 2566, 3366, 29163, 3976, 2175, 22890, 2869, 2076, 1996, 2978, 3597, 2378, 5471, 2175, 15916, 2075, 1997, 2678, 4003, 7597, 1029, 1029, 1029, 1001, 10355, 1001, 2978, 3597, 2378, 25300, 3070, 1001, 2522, 17258, 1035, 2539, 1001, 21887, 23350, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [41]:
# Number of token ids the tokenizer produced for the sample tweet.
len(tok_samp['input_ids'])

47

In [42]:
# Character length of the raw sample tweet, for comparison with its token count.
len(sample)

155

In [52]:
# Tokenize every tweet: each row of `tok_text` holds the result of
# `preprocess_function` for that row's text (input ids and attention mask).
df2['tok_text'] = df2['text'].apply(preprocess_function)
df2.head()

Unnamed: 0,text,label,tok_text
0,TRENDING: New Yorkers encounter empty supermar...,-1,"[input_ids, attention_mask]"
1,When I couldn't find hand sanitizer at Fred Me...,1,"[input_ids, attention_mask]"
2,Find out how you can protect yourself and love...,1,"[input_ids, attention_mask]"
3,#Panic buying hits #NewYork City as anxious sh...,-1,"[input_ids, attention_mask]"
4,#toiletpaper #dunnypaper #coronavirus #coronav...,0,"[input_ids, attention_mask]"
