In [None]:
# Mount Google Drive at /content/drive; later cells read the labelled tweet
# files from a folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Authenticate the current Colab user.
# NOTE(review): no cell visible in this notebook section uses these
# credentials — confirm this step is still required.
from google.colab import auth
auth.authenticate_user()

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
import os
import re
import string
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

nltk.download('stopwords')
nltk.download('vader_lexicon')

warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Module-level objects shared by the helper functions defined below.
# NLTK English stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Unigram bag-of-words counter; it is (re)fitted by split_dataset / assign_dataset.
vectorizer = CountVectorizer(ngram_range=(1, 1))
# Depth-3 decision tree used as the base learner of the ensemble models.
estimator = DecisionTreeClassifier(max_depth=3)

In [None]:
def load_tf_dataset(name,dir=None):
    """Load a dataset through the `datasets` library's `load_dataset`.

    Args:
        name: Dataset identifier, passed as the first argument to `load_dataset`.
        dir: Optional second positional argument for `load_dataset`; the calls
            in this notebook pass a configuration name such as "sentiment".
            The parameter shadows the builtin `dir`; the name is kept so
            existing keyword callers keep working.

    Returns:
        Whatever `load_dataset` returns for the given arguments.
    """
    # Identity check: only a genuinely omitted argument selects the
    # one-argument call, regardless of how the value implements __eq__.
    if dir is None:
        return load_dataset(name)
    return load_dataset(name, dir)

def dataset_to_train_test(dataset):
    """Convert the "train" and "test" entries of `dataset` into DataFrames.

    Args:
        dataset: Mapping-like object indexable by "train" and "test".

    Returns:
        A `(train_df, test_df)` tuple of pandas DataFrames.
    """
    return tuple(pd.DataFrame(dataset[split]) for split in ("train", "test"))

def load_csv_dataset(dir,filename):
    """Read a CSV file into a DataFrame.

    Args:
        dir: Directory containing the file, with or without a trailing
            path separator.
        filename: Name of the CSV file inside `dir`.

    Returns:
        The DataFrame produced by `pd.read_csv`.
    """
    # os.path.join inserts the separator only when it is missing, so both
    # "/data/" and "/data" resolve to the same file.
    return pd.read_csv(os.path.join(dir, filename))

def load_xlxs_dataset(dir,filename):
    """Read an Excel workbook into a DataFrame.

    Args:
        dir: Directory containing the file, with or without a trailing
            path separator.
        filename: Name of the workbook inside `dir`.

    Returns:
        The DataFrame produced by `pd.read_excel(..., index_col=None)`.
    """
    # os.path.join inserts the separator only when it is missing.
    return pd.read_excel(os.path.join(dir, filename), index_col=None)

def clean_tweet(tweet):
    """Strip URLs, @mentions, #hashtags and punctuation from a tweet.

    The patterns are applied one after another in the listed order; the order
    matters, so they are deliberately not merged into a single alternation.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with every match of each pattern removed. Whitespace is left
        as is, so removed tokens can leave consecutive spaces behind.
    """
    removal_patterns = (
        r'http\S+',   # links
        r'@\w+',      # user mentions
        r'#\w+',      # hashtags
        r'[^\w\s]',   # anything that is neither a word character nor whitespace
    )
    for pattern in removal_patterns:
        tweet = re.sub(pattern, '', tweet)
    return tweet

def remove_stopwords(tokens):
    """Drop tokens whose lower-cased form is in the module-level `stop_words`.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order and casing.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def df_cleaner(df):
    """Return a copy of `df` whose 'text' column has been normalised.

    Each text is passed through `clean_tweet`, split on whitespace, filtered
    with `remove_stopwords`, and re-joined with single spaces.

    The input frame is left untouched, so re-running a cell that displays or
    reuses the raw frame still sees the raw text.

    Args:
        df: DataFrame with a string 'text' column.

    Returns:
        A new DataFrame with the same columns and index as `df`.
    """
    def _normalize(text):
        # One pass per row instead of four passes over the whole column.
        tokens = clean_tweet(text).split()
        return ' '.join(remove_stopwords(tokens))

    return df.assign(text=df['text'].apply(_normalize))

def split_dataset(df):
    """Split `df` 80/20 and vectorise the text without test-set leakage.

    The rows are split first (test_size=0.2, random_state=42); the module-level
    `vectorizer` is then fitted on the training texts only and merely applied
    to the test texts, so the test vocabulary never influences the features.

    Args:
        df: DataFrame with 'text' and 'label' columns.

    Returns:
        `(X_train, X_test, y_train, y_test)`, where the X values are the
        vectorizer outputs and the y values are the matching 'label' slices.
    """
    text_train, text_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42
    )
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)
    return X_train, X_test, y_train, y_test

def assign_dataset(train_df,test_df):
    """Vectorise pre-split train/test frames with the shared `vectorizer`.

    The vectorizer is fitted on the training texts and then applied to the
    test texts.

    Args:
        train_df: DataFrame with 'text' and 'label' columns used for fitting.
        test_df: DataFrame with 'text' and 'label' columns used for evaluation.

    Returns:
        `(X_train, X_test, y_train, y_test)`.
    """
    train_matrix = vectorizer.fit_transform(train_df['text'])
    test_matrix = vectorizer.transform(test_df['text'])
    return train_matrix, test_matrix, train_df['label'], test_df['label']

def pred_model(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and predict the test split.

    Args:
        model: Object exposing `fit(X, y)` and `predict(X)`.
        X_train: Training features.
        X_test: Features to predict.
        y_train: Training labels.
        y_test: Accepted for a uniform call signature; not read here.

    Returns:
        The result of `model.predict(X_test)`.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

def report_gen(y_test, y_pred):
    """Build the classification report for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The value returned by `classification_report(y_test, y_pred)`.
    """
    return classification_report(y_test, y_pred)

In [None]:
def vader_sentiment(score):
    """Map a score dict to a class label using its 'compound' value.

    Args:
        score: Mapping with a numeric 'compound' entry.

    Returns:
        2 when compound > 0.1, 0 when compound < -0.1, otherwise 1.
    """
    cutoff = 0.1
    compound = score['compound']
    if compound > cutoff:
        return 2
    if compound < -cutoff:
        return 0
    return 1

def pred_VEDER(model, X, y):
    """Label every text in `X` with `model.polarity_scores` + `vader_sentiment`.

    Args:
        model: Object exposing `polarity_scores(text)`.
        X: pandas Series of texts.
        y: Accepted for a uniform call signature; not read here.

    Returns:
        A Series aligned with `X` holding the label for each text.
    """
    def _label(text):
        return vader_sentiment(model.polarity_scores(text))

    return X.apply(_label)

In [None]:
def textblob_sentiment(text):
    """Classify `text` from the polarity of `TextBlob(text).sentiment`.

    Args:
        text: Text to analyse.

    Returns:
        2 when polarity > 0.1, 0 when polarity < -0.1, otherwise 1.
    """
    cutoff = 0.1
    polarity = TextBlob(text).sentiment.polarity
    if polarity > cutoff:
        return 2
    if polarity < -cutoff:
        return 0
    return 1

def pred_TextBlob(X, y):
    """Apply `textblob_sentiment` to every text in `X`.

    Args:
        X: pandas Series of texts.
        y: Accepted for a uniform call signature; not read here.

    Returns:
        A Series aligned with `X` holding the label for each text.
    """
    return X.apply(textblob_sentiment)

# Models

In [None]:
# Model instances evaluated by get_all_report. The stochastic estimators are
# seeded so that repeated runs print the same reports.
LR = LogisticRegression()
MNB = MultinomialNB()
BAGG = BaggingClassifier(estimator=estimator, n_estimators=10, random_state=0)
# ADA is an AdaBoost ensemble over the shared depth-3 tree; it was previously
# built with BaggingClassifier, which made it a second bagging model.
ADA = AdaBoostClassifier(estimator=estimator, n_estimators=50, random_state=0)
GRAD = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
VADER = SentimentIntensityAnalyzer()
SGD = SGDClassifier(random_state=0)

# All Reports

In [None]:
def get_all_report(X_train, X_test, y_train, y_test, test_df):
    """Print a classification report for every model in the notebook.

    The trainable models (LR, MNB, BAGG, ADA, GRAD, SGD) are fitted on
    `X_train`/`y_train` and scored on `X_test`/`y_test`. VADER and TextBlob
    are not trained; they are scored on `test_df['text']` against
    `test_df['label']`.

    NOTE(review): the callers in this notebook pass frames that went through
    df_cleaner, so the lexicon models see text with punctuation and stop words
    already removed — confirm that is intended.

    Args:
        X_train: Vectorised training features.
        X_test: Vectorised test features.
        y_train: Training labels.
        y_test: Test labels for the vectorised features.
        test_df: Object indexable by 'text' and 'label' for the lexicon models.

    Returns:
        None; the reports are printed.
    """
    trainable = (
        ("Accuracy report of LR:", LR),
        ("Accuracy report of MNB:", MNB),
        ("Accuracy report of BAGG:", BAGG),
        ("Accuracy report of ADA:", ADA),
        ("Accuracy report of GRAD:", GRAD),
        ("Accuracy report of SGD:", SGD),
    )
    for heading, model in trainable:
        print(heading)
        predictions = pred_model(model, X_train, X_test, y_train, y_test)
        print(report_gen(y_test, predictions))

    print("Accuracy report of VADER:")
    predictions = pred_VEDER(VADER, test_df['text'], test_df['label'])
    print(report_gen(test_df['label'], predictions))

    print("Accuracy report of TextBlob:")
    predictions = pred_TextBlob(test_df['text'], test_df['label'])
    print(report_gen(test_df['label'], predictions))

# IMDB Dataset

In [None]:
# Load IMDB, clean the train and test frames (in that order), then vectorise.
dataset = load_tf_dataset("imdb")
train_df, test_df = map(df_cleaner, dataset_to_train_test(dataset))

X_train, X_test, y_train, y_test = assign_dataset(train_df, test_df)



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Fit every model on the IMDB split prepared above and print its report.
get_all_report(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    test_df=test_df,
)

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87     12500
           1       0.87      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84     12500
           1       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

Accuracy report of RF:
              precision    recall  f1-score   support

           0       0.85      0.86      0.85     12500
           1       0.86      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weig

# Tweet Eval Sentiment Dataset

In [None]:
# Load the "sentiment" configuration of tweet_eval, clean both frames
# (train first, then test) and vectorise them.
dataset = load_tf_dataset("tweet_eval","sentiment")
train_df, test_df = map(df_cleaner, dataset_to_train_test(dataset))

X_train, X_test, y_train, y_test = assign_dataset(train_df, test_df)



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Fit every model on the tweet_eval split prepared above and print its report.
get_all_report(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    test_df=test_df,
)

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.65      0.42      0.51      3972
           1       0.59      0.69      0.64      5937
           2       0.47      0.55      0.51      2375

    accuracy                           0.58     12284
   macro avg       0.57      0.55      0.55     12284
weighted avg       0.59      0.58      0.57     12284

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.62      0.49      0.55      3972
           1       0.61      0.62      0.62      5937
           2       0.48      0.61      0.53      2375

    accuracy                           0.58     12284
   macro avg       0.57      0.57      0.57     12284
weighted avg       0.59      0.58      0.58     12284

Accuracy report of RF:
              precision    recall  f1-score   support

           0       0.80      0.11      0.19      3972
           1       0.54      0.87      0.67      5937
     

# Manual Labeled Dataset with pretrain

In [None]:
# Data folder on the mounted Drive. It must be assigned before first use:
# otherwise, on a fresh kernel, `dir` is still the Python builtin and the
# path concatenation in load_csv_dataset raises a TypeError.
dir = "/content/drive/MyDrive/Research/SentimentAnalysis/MargedTweets/"
train_filename = "labeled-all-tweets-balanced-small.csv"
train_df = load_csv_dataset(dir, train_filename)
# Keep the tweet text and its label, and drop rows missing either.
train_df = train_df[['rawContent','label']]
train_df = train_df.dropna()
train_df=train_df.rename(columns={"rawContent": "text"})
# Encode labels: 'negative' -> 0, 'neutral' -> 1, anything else -> 2.
train_df['label'] = train_df['label'].apply(lambda x: 0 if x=='negative' else (1 if x=='neutral' else 2))
train_df = df_cleaner(train_df)
train_df

Unnamed: 0,text,label
0,Deputy defense minister Russia uses banned unt...,2
1,German Chancellor Olaf Scholzs statements foll...,2
2,Schools Russia ordered conduct patriotic class...,0
3,Breaking News President Biden propose making e...,1
4,Joe Biden wants another 33 billion top billion...,2
...,...,...
1495,message people Russia believe want bloody dest...,0
1496,war Horn much larger RussiaUkraine casualties ...,2
1497,Kevin McCarthy signaling Republicans take Hous...,0
1498,Finnish PM Sanna Marin end war Ukraine way con...,1


In [None]:
dir = "/content/drive/MyDrive/Research/SentimentAnalysis/MargedTweets/"
test_filename = "manual_label_small.xlsx"
# Read the manually labelled sheet, keep text + sentiment, drop incomplete
# rows, and encode labels: 'Negative' -> 0, 'Neutral' -> 1, anything else -> 2.
test_df = (
    load_xlxs_dataset(dir,test_filename)[['text','sentiment']]
    .dropna()
    .rename(columns={"text": "text", "sentiment": "label"})
    .assign(label=lambda frame: frame['label'].apply(
        lambda x: 0 if x=='Negative' else (1 if x=='Neutral' else 2)))
)
test_df = df_cleaner(test_df)
test_df

Unnamed: 0,text,label
0,Russias exile says referring Kremlin sources p...,1
1,Well didnt see coming vicepresident Gazpromban...,1
2,Oleh Psiuk frontman Eurovision winner Kalush O...,2
3,Lt Gen Mark Hertling Ret joins discuss US faci...,2
4,critical window time theyre going set stage ne...,2
...,...,...
284,returned Moscow first time nearly 3 months par...,1
285,UPDATE Reminder Navy still much increased pres...,1
286,Footage appears show evidence Russian soldiers...,2
287,goal west destroy Russia much Ukraine going le...,0


In [None]:
# Vectorise: fit on the pre-labelled tweets, transform the manually labelled ones.
X_train, X_test, y_train, y_test = assign_dataset(train_df=train_df, test_df=test_df)

In [None]:
# Train on the pre-labelled tweets, evaluate on the manually labelled tweets.
get_all_report(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    test_df=test_df,
)

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.60      0.38      0.47       126
           1       0.27      0.60      0.37        57
           2       0.41      0.31      0.36       105

    accuracy                           0.40       288
   macro avg       0.43      0.43      0.40       288
weighted avg       0.47      0.40      0.41       288

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.53      0.44      0.48       126
           1       0.23      0.39      0.29        57
           2       0.36      0.30      0.33       105

    accuracy                           0.38       288
   macro avg       0.38      0.38      0.37       288
weighted avg       0.41      0.38      0.39       288

Accuracy report of RF:
              precision    recall  f1-score   support

           0       0.61      0.34      0.44       126
           1       0.23      0.65      0.33        57
     

## Manual Label Only

In [None]:
dir = "/content/drive/MyDrive/Research/SentimentAnalysis/MargedTweets/"
filename = "manual_label_small.xlsx"
# Read the manually labelled sheet, keep text + sentiment, drop incomplete
# rows, and encode labels: 'Negative' -> 0, 'Neutral' -> 1, anything else -> 2.
df = (
    load_xlxs_dataset(dir,filename)[['text','sentiment']]
    .dropna()
    .rename(columns={"text": "text", "sentiment": "label"})
    .assign(label=lambda frame: frame['label'].apply(
        lambda x: 0 if x=='Negative' else (1 if x=='Neutral' else 2)))
)
df = df_cleaner(df)
df

Unnamed: 0,text,label
0,Russias exile says referring Kremlin sources p...,1
1,Well didnt see coming vicepresident Gazpromban...,1
2,Oleh Psiuk frontman Eurovision winner Kalush O...,2
3,Lt Gen Mark Hertling Ret joins discuss US faci...,2
4,critical window time theyre going set stage ne...,2
...,...,...
284,returned Moscow first time nearly 3 months par...,1
285,UPDATE Reminder Navy still much increased pres...,1
286,Footage appears show evidence Russian soldiers...,2
287,goal west destroy Russia much Ukraine going le...,0


In [None]:
# Vectorise the manually labelled tweets and hold out part of them for testing.
X_train, X_test, y_train, y_test = split_dataset(df=df)

In [None]:
# Score VADER and TextBlob on the same held-out rows as the trained models:
# y_test keeps the row labels of df, so df.loc[y_test.index] selects exactly
# the test split instead of every row (training rows included).
get_all_report(X_train, X_test, y_train, y_test, df.loc[y_test.index])

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.50      0.83      0.62        23
           1       0.33      0.09      0.14        11
           2       0.71      0.50      0.59        24

    accuracy                           0.55        58
   macro avg       0.51      0.47      0.45        58
weighted avg       0.55      0.55      0.52        58

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.50      0.70      0.58        23
           1       0.00      0.00      0.00        11
           2       0.65      0.62      0.64        24

    accuracy                           0.53        58
   macro avg       0.38      0.44      0.41        58
weighted avg       0.47      0.53      0.49        58

Accuracy report of RF:
              precision    recall  f1-score   support

           0       0.43      1.00      0.61        23
           1       0.00      0.00      0.00        11
     

# Pre-labeled only

In [None]:
filename = "labeled-all-tweets-balanced-small.csv"
# Read the pre-labelled tweets, keep content + label, drop incomplete rows,
# and encode labels: 'negative' -> 0, 'neutral' -> 1, anything else -> 2.
df = (
    load_csv_dataset(dir, filename)[['rawContent','label']]
    .dropna()
    .rename(columns={"rawContent": "text", "sentiment": "label"})
    .assign(label=lambda frame: frame['label'].apply(
        lambda x: 0 if x=='negative' else (1 if x=='neutral' else 2)))
)
df = df_cleaner(df)
df

Unnamed: 0,text,label
0,Deputy defense minister Russia uses banned unt...,2
1,German Chancellor Olaf Scholzs statements foll...,2
2,Schools Russia ordered conduct patriotic class...,0
3,Breaking News President Biden propose making e...,1
4,Joe Biden wants another 33 billion top billion...,2
...,...,...
1495,message people Russia believe want bloody dest...,0
1496,war Horn much larger RussiaUkraine casualties ...,2
1497,Kevin McCarthy signaling Republicans take Hous...,0
1498,Finnish PM Sanna Marin end war Ukraine way con...,1


In [None]:
# Vectorise the pre-labelled tweets and hold out part of them for testing.
X_train, X_test, y_train, y_test = split_dataset(df=df)

In [None]:
# Score VADER and TextBlob on the same held-out rows as the trained models:
# y_test keeps the row labels of df, so df.loc[y_test.index] selects exactly
# the test split instead of every row (training rows included).
get_all_report(X_train, X_test, y_train, y_test, df.loc[y_test.index])

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.53      0.51      0.52        99
           1       0.57      0.70      0.63       102
           2       0.58      0.47      0.52        99

    accuracy                           0.56       300
   macro avg       0.56      0.56      0.56       300
weighted avg       0.56      0.56      0.56       300

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.54      0.57      0.55        99
           1       0.60      0.49      0.54       102
           2       0.55      0.64      0.59        99

    accuracy                           0.56       300
   macro avg       0.57      0.56      0.56       300
weighted avg       0.57      0.56      0.56       300

Accuracy report of RF:
              precision    recall  f1-score   support

           0       0.58      0.35      0.44        99
           1       0.48      0.84      0.61       102
     

# cardiffnlp/tweet_sentiment_multilingual

In [None]:
# Load the "english" configuration of cardiffnlp/tweet_sentiment_multilingual,
# clean both frames (train first, then test) and vectorise them.
dataset = load_tf_dataset("cardiffnlp/tweet_sentiment_multilingual","english")
train_df, test_df = map(df_cleaner, dataset_to_train_test(dataset))

X_train, X_test, y_train, y_test = assign_dataset(train_df, test_df)

Downloading builder script:   0%|          | 0.00/4.14k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_sentiment_multilingual/english to /root/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/253k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset tweet_sentiment_multilingual downloaded and prepared to /root/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Pass this dataset's own test frame. `df` still holds the tweets loaded in an
# earlier section, so using it here would score VADER and TextBlob on the
# wrong data.
get_all_report(X_train, X_test, y_train, y_test, test_df)

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.46      0.50      0.48       290
           1       0.39      0.51      0.44       290
           2       0.58      0.34      0.43       290

    accuracy                           0.45       870
   macro avg       0.48      0.45      0.45       870
weighted avg       0.48      0.45      0.45       870

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.45      0.57      0.50       290
           1       0.38      0.31      0.34       290
           2       0.53      0.50      0.52       290

    accuracy                           0.46       870
   macro avg       0.45      0.46      0.45       870
weighted avg       0.45      0.46      0.45       870

Accuracy report of RF:
              precision    recall  f1-score   support

           0       0.58      0.16      0.25       290
           1       0.37      0.86      0.52       290
     

# Sentiment140

In [None]:
def fix_prob_140(df):
    """Reduce a Sentiment140 frame to 'text'/'label' with labels in {0, 2}.

    Keeps the 'text' and 'sentiment' columns, drops rows with missing values,
    renames 'sentiment' to 'label', removes rows whose label equals 2, and
    finally maps label 0 to 0 and every other remaining label to 2.

    NOTE(review): presumably 2 is the dataset's neutral class and the remaining
    non-zero value is the positive class — confirm against the dataset card.

    Args:
        df: DataFrame with 'text' and 'sentiment' columns.

    Returns:
        A new DataFrame with columns ['text', 'label']; `df` is not modified.
    """
    labelled = (
        df[['text', 'sentiment']]
        .dropna()
        .rename(columns={"sentiment": "label"})
    )
    # Take an explicit copy of the filtered rows so the assignment below writes
    # to an independent frame rather than to a slice of another one.
    labelled = labelled.loc[labelled['label'] != 2].copy()
    labelled['label'] = labelled['label'].apply(lambda x: 0 if x == 0 else 2)
    return labelled

# Load Sentiment140, clean the text of both splits, then normalise the labels
# (each step processes the train frame before the test frame).
dataset = load_tf_dataset("sentiment140")
train_df, test_df = map(df_cleaner, dataset_to_train_test(dataset))
train_df, test_df = map(fix_prob_140, (train_df, test_df))

Downloading builder script:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading and preparing dataset sentiment140/sentiment140 to /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997...


Downloading data:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/498 [00:00<?, ? examples/s]

Dataset sentiment140 downloaded and prepared to /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Show the per-class row counts of the train split, then of the test split.
for labels in (
    train_df['label'].value_counts().sort_index().reset_index(name='counts'),
    test_df['label'].value_counts().sort_index().reset_index(name='counts'),
):
    display(labels)

Unnamed: 0,index,counts
0,0,800000
1,2,800000


Unnamed: 0,index,counts
0,0,177
1,2,182


In [None]:
# Vectorise: fit on the Sentiment140 train frame, transform the test frame.
X_train, X_test, y_train, y_test = assign_dataset(train_df=train_df, test_df=test_df)

In [None]:
# Fit every model on the Sentiment140 split prepared above and print its report.
get_all_report(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    test_df=test_df,
)

Accuracy report of LR:
              precision    recall  f1-score   support

           0       0.84      0.78      0.81       177
           2       0.80      0.85      0.82       182

    accuracy                           0.82       359
   macro avg       0.82      0.82      0.82       359
weighted avg       0.82      0.82      0.82       359

Accuracy report of MNB:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       177
           2       0.82      0.82      0.82       182

    accuracy                           0.82       359
   macro avg       0.82      0.82      0.82       359
weighted avg       0.82      0.82      0.82       359

Accuracy report of BAGG:
              precision    recall  f1-score   support

           0       0.83      0.03      0.05       177
           2       0.51      0.99      0.68       182

    accuracy                           0.52       359
   macro avg       0.67      0.51      0.37       359
we