# <p style="background-color: #1DA1F2; font-family:calibri; color:#FFFFFF; font-size:140%; font-family:Monospace; text-align:center; border-radius:15px 50px;">Importing Necessary Libraries</p>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm

import re

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron

True

In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

# <p style="background-color: #1DA1F2; font-family:calibri; color:#FFFFFF; font-size:140%; font-family:Monospace; text-align:center; border-radius:15px 50px;">Loading and Preparing the Data</p>

In [4]:
# Define column names
columns = ["tweet id", "entity", "sentiment", "content"]

# Load data
train_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', names=columns)
val_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', names=columns)

# Combine training and validation dataframes
combined_df = pd.concat([train_df, val_df], ignore_index=True)

# Split combined data into train, validation, and test sets
train_val_df, test_df = train_test_split(combined_df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

In [5]:
train_df

Unnamed: 0,tweet id,entity,sentiment,content
59201,3347,Facebook,Irrelevant,of the white Americans invading my Facebook co...
51278,10401,RedDeadRedemption(RDR),Positive,I won Breakout in Red Dead Redeemtion 2 for 7 ...
54261,2121,CallOfDuty,Positive,@PlayApex was on nonstop until @CallofDuty sto...
19468,12533,WorldOfCraft,Neutral,I always get sad lurking in the troll starter ...
5878,212,Amazon,Irrelevant,@Airtel_Presence - we have been charged for ca...
...,...,...,...,...
54510,2163,CallOfDuty,Irrelevant,Join me and @GamingMansfield as we hold hands ...
59337,3370,Facebook,Neutral,@EntertainerToys Hi placed an application whic...
74690,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odys...
62815,5168,GrandTheftAuto(GTA),Neutral,"* i steal flying motorcycle in gta *. ""are you..."


In [6]:
val_df

Unnamed: 0,tweet id,entity,sentiment,content
10115,12941,Xbox(Xseries),Positive,PS5 launching even a penny later than the Xbox...
58752,3272,Facebook,Positive,"Facebook showed me this pic from 3 years ago, ..."
51919,10520,RedDeadRedemption(RDR),Neutral,* runs aimlessly online on Red Dead Redemption...
2430,1625,CallOfDutyBlackopsColdWar,Irrelevant,I’m retired. pic.twitter.com/dAduilmjlT
24507,4604,Google,Neutral,This means if I write a review of you business...
...,...,...,...,...
75092,12725,WorldOfCraft,Irrelevant,Oh boy I bet the responses to this in the comm...
12645,8573,NBA2K,Negative,Sorry @NBA2K I had a damn MyTeam game. I’ve co...
14410,2869,Dota2,Positive,IT WELCOME [UNK]
36571,8275,Microsoft,Positive,Really think


In [7]:
test_df

Unnamed: 0,tweet id,entity,sentiment,content
42246,10048,PlayerUnknownsBattlegrounds(PUBG),Irrelevant,Noob To Pro A Motivational Journey | Story Of ...
20182,12655,WorldOfCraft,Negative,But I hated everything about this.. But I... j...
42114,10026,PlayerUnknownsBattlegrounds(PUBG),Neutral,haven’t played pubg for more than a week. feel...
14603,2901,Dota2,Negative,the cute panda and dragon tress they just stea...
30347,7215,LeagueOfLegends,Irrelevant,I have no idea what he is for is positive and ...
...,...,...,...,...
48538,5932,HomeDepot,Negative,"please don ’ t come to no home on depot today,..."
51685,10477,RedDeadRedemption(RDR),Positive,That all the little things have formed an incr...
56340,11269,TomClancysRainbowSix,Negative,5 years old and it’s in worse condition than B...
9330,12804,Xbox(Xseries),Irrelevant,I might have to get an Xbox Series X just beca...


In [8]:
train_df.drop(columns = ["tweet id", "entity"], inplace = True)
val_df.drop(columns = ["tweet id", "entity"], inplace = True)
test_df.drop(columns = ["tweet id", "entity"], inplace = True)

In [9]:
train_df["sentiment"].value_counts()

sentiment
Negative      18489
Positive      17074
Neutral       15054
Irrelevant    10684
Name: count, dtype: int64

In [10]:
val_df["sentiment"].value_counts()

sentiment
Negative      2038
Positive      1908
Neutral       1659
Irrelevant    1207
Name: count, dtype: int64

In [11]:
test_df["sentiment"].value_counts()

sentiment
Negative      2281
Positive      2127
Neutral       1890
Irrelevant    1271
Name: count, dtype: int64

In [12]:
sentiment_to_number = {
    'Irrelevant' : 0,
    'Negative' : 1,
    'Neutral' : 2,
    'Positive' : 3
}

train_df['label_sentiment'] = train_df['sentiment'].map(sentiment_to_number)
val_df['label_sentiment'] = val_df['sentiment'].map(sentiment_to_number)
test_df['label_sentiment'] = test_df['sentiment'].map(sentiment_to_number)

In [13]:
train_df.isna().sum()

sentiment            0
content            548
label_sentiment      0
dtype: int64

In [14]:
val_df.isna().sum()

sentiment           0
content            62
label_sentiment     0
dtype: int64

In [15]:
test_df.isna().sum()

sentiment           0
content            76
label_sentiment     0
dtype: int64

In [16]:
train_df.dropna(inplace = True)
train_df.isna().sum()

sentiment          0
content            0
label_sentiment    0
dtype: int64

In [17]:
val_df.dropna(inplace = True)
val_df.isna().sum()

sentiment          0
content            0
label_sentiment    0
dtype: int64

In [18]:
test_df.dropna(inplace = True)
test_df.isna().sum()

sentiment          0
content            0
label_sentiment    0
dtype: int64

In [19]:
train_df

Unnamed: 0,sentiment,content,label_sentiment
59201,Irrelevant,of the white Americans invading my Facebook co...,0
51278,Positive,I won Breakout in Red Dead Redeemtion 2 for 7 ...,3
54261,Positive,@PlayApex was on nonstop until @CallofDuty sto...,3
19468,Neutral,I always get sad lurking in the troll starter ...,2
5878,Irrelevant,@Airtel_Presence - we have been charged for ca...,0
...,...,...,...
54510,Irrelevant,Join me and @GamingMansfield as we hold hands ...,0
59337,Neutral,@EntertainerToys Hi placed an application whic...,2
74690,Positive,my ass still knee-deep in Assassins Creed Odys...,3
62815,Neutral,"* i steal flying motorcycle in gta *. ""are you...",2


In [20]:
val_df

Unnamed: 0,sentiment,content,label_sentiment
10115,Positive,PS5 launching even a penny later than the Xbox...,3
58752,Positive,"Facebook showed me this pic from 3 years ago, ...",3
51919,Neutral,* runs aimlessly online on Red Dead Redemption...,2
2430,Irrelevant,I’m retired. pic.twitter.com/dAduilmjlT,0
24507,Neutral,This means if I write a review of you business...,2
...,...,...,...
75092,Irrelevant,Oh boy I bet the responses to this in the comm...,0
12645,Negative,Sorry @NBA2K I had a damn MyTeam game. I’ve co...,1
14410,Positive,IT WELCOME [UNK],3
36571,Positive,Really think,3


In [21]:
test_df

Unnamed: 0,sentiment,content,label_sentiment
42246,Irrelevant,Noob To Pro A Motivational Journey | Story Of ...,0
20182,Negative,But I hated everything about this.. But I... j...,1
42114,Neutral,haven’t played pubg for more than a week. feel...,2
14603,Negative,the cute panda and dragon tress they just stea...,1
30347,Irrelevant,I have no idea what he is for is positive and ...,0
...,...,...,...
50005,Negative,No danger???...... this list is big trash,1
48538,Negative,"please don ’ t come to no home on depot today,...",1
51685,Positive,That all the little things have formed an incr...,3
56340,Negative,5 years old and it’s in worse condition than B...,1


# <p style="background-color: #1DA1F2; font-family:calibri; color:#FFFFFF; font-size:140%; font-family:Monospace; text-align:center; border-radius:15px 50px;">Text Preprocessing</p>

In [22]:
nltk.download('wordnet')
nltk.download('stopwords')

wnl = WordNetLemmatizer()
stop_words = stopwords.words('english')

# Convert treebank POS tags to wordnet POS tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocessing_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub('[^A-Za-z]+', ' ', text)
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)

    lemmatized_words = [
        wnl.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
        if word.lower() not in stop_words and len(word) >= 3
    ]

    return ' '.join(lemmatized_words).lower()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
print(f"Before preprocessing : {train_df['content'][1500]}\n")
print(f"After preprocessing : {preprocessing_text(train_df['content'][1500])}")

Before preprocessing : There are tons and tons of quality of life improvements everywhere and it's so painful to see how the writing has taken such a massive hit and continues to disrespect both the characters of the Borderlands universe and the fans of said characters.

After preprocessing : ton ton quality life improvement everywhere painful see writing take massive hit continue disrespect character borderlands universe fan say character


In [24]:
train_df["content"] = train_df["content"].apply(preprocessing_text)
val_df["content"] = val_df["content"].apply(preprocessing_text)
test_df["content"] = test_df["content"].apply(preprocessing_text)

In [25]:
train_df

Unnamed: 0,sentiment,content,label_sentiment
59201,Irrelevant,white americans invade facebook complain black...,0
51278,Positive,breakout red dead redeemtion truemovement pt t...,3
54261,Positive,playapex nonstop callofduty stop,3
19468,Neutral,always get sad lurking troll starter area trol...,2
5878,Irrelevant,airtel presence charge carry amazon prime memb...,0
...,...,...,...
54510,Irrelevant,join gamingmansfield hold hand welcome new age...,0
59337,Neutral,entertainertoys place application turn zero cu...,2
74690,Positive,as still knee deep assassins creed odyssey way...,3
62815,Neutral,steal fly motorcycle gta new player yes need m...,2


In [26]:
val_df

Unnamed: 0,sentiment,content,label_sentiment
10115,Positive,launch even penny later xbox series excite,3
58752,Positive,facebook show pic year ago one red backpack fr...,3
51919,Neutral,run aimlessly online red dead redemption compl...,2
2430,Irrelevant,retired pic twitter com daduilmjlt,0
24507,Neutral,mean write review business establishment see w...,2
...,...,...,...
75092,Irrelevant,boy bet response comment level head barrage ou...,0
12645,Negative,sorry nba damn myteam game cool quick keeping ...,1
14410,Positive,welcome unk,3
36571,Positive,really think,3


In [27]:
test_df

Unnamed: 0,sentiment,content,label_sentiment
42246,Irrelevant,noob pro motivational journey story every noob...,0
20182,Negative,hat everything earn tumor youth achievement,1
42114,Neutral,play pubg week feel weird,2
14603,Negative,cute panda dragon tress steal much resource no...,1
30347,Irrelevant,idea positive obviously superm,0
...,...,...,...
50005,Negative,danger list big trash,1
48538,Negative,please come home depot today bother people lowe,1
51685,Positive,little thing form incredible whole,3
56340,Negative,year old bad condition black ice get operation...,1


# <p style="background-color: #1DA1F2; font-family:calibri; color:#FFFFFF; font-size:140%; font-family:Monospace; text-align:center; border-radius:15px 50px;">Modelling</p>

In [29]:
# Define the pipeline
pipeline = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('random_forest', RandomForestClassifier(min_samples_split=10, oob_score=True, criterion='entropy' ,max_features='log2', class_weight='balanced'))
])

# Fit the pipeline on the training data
pipeline.fit(train_df['content'], train_df['label_sentiment'])

# Get the predictions
y_pred_train = pipeline.predict(train_df['content'])
y_pred_val = pipeline.predict(val_df['content'])
y_pred_test = pipeline.predict(test_df['content'])

# Print Accuracy
print(f"Train Accuracy: {accuracy_score(train_df['label_sentiment'], y_pred_train)}")
print(f"Validation Accuracy: {accuracy_score(val_df['label_sentiment'], y_pred_val)}")
print(f"Test Accuracy: {accuracy_score(test_df['label_sentiment'], y_pred_test)}")

Train Accuracy: 0.9717051009826675
Validation Accuracy: 0.9312592592592592
Test Accuracy: 0.9279327372214067
