### Importing Python Libraries

In [1]:
import numpy as np
import pandas as pd
import neattext.functions as nfx
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import joblib

### Loading the Datasets and Merging Them into a Single Dataset

In [2]:
# Load the three GoEmotions CSV shards from the shared Datasets directory.
df1 = pd.read_csv('../../Datasets/goemotions_1.csv')
df2 = pd.read_csv('../../Datasets/goemotions_2.csv')
df3 = pd.read_csv('../../Datasets/goemotions_3.csv')

# Stack the shards row-wise. read_csv gives every shard its own 0-based index,
# so ignore_index=True renumbers the rows and keeps the row labels unique.
df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# Fixed seed so the preview shows the same rows on every run.
df.sample(10, random_state=42)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
35685,You d be surprised how “ 3rd”world slum people...,edc3u88,trueblueozguy,sydney,t3_acwmur,t1_edbvuor,1546729000.0,5,False,0,...,0,0,0,0,0,0,0,0,0,1
18150,"Wow, foot and car traffic downtown has changed...",eer4vq0,dweet,ripcity,t3_airzek,t3_airzek,1548229000.0,37,False,0,...,0,0,0,0,1,0,0,0,1,0
19996,It's seriously terrifying!,ef5pcyk,LDawnGrey,TalesFromYourServer,t3_akaq9x,t1_ef5jhgg,1548660000.0,34,False,0,...,0,0,0,0,0,0,0,0,0,0
70285,Yep. They'll declare a national emergency and ...,eduskcb,monkeiboi,Conservative,t3_af0n51,t3_af0n51,1547254000.0,46,False,0,...,0,0,0,0,0,0,0,0,0,0
41450,Does she not know how cups work?,eekkao4,Jimmy6shoes,holdmycosmo,t3_ahzwn8,t1_eeka3av,1548021000.0,61,False,0,...,0,0,0,0,0,0,0,0,0,0
58510,Everytime I see something in this sub that loo...,ed1qd4j,PhilosophyOfMe,yesyesyesyesno,t3_abgvdv,t3_abgvdv,1546389000.0,62,False,0,...,0,0,0,0,0,0,0,0,0,0
23162,That would be an unfamiliar feeling,ed6ty6b,ThrowawayAccount5950,traaaaaaannnnnnnnnns,t3_acc9m5,t3_acc9m5,1546562000.0,36,False,0,...,0,0,0,0,0,0,0,0,0,1
22113,"Never made it through, it makes me physically ...",efdmmkq,an_agreeing_dothraki,forwardsfromgrandma,t3_ald80f,t1_efdcdqu,1548873000.0,62,False,0,...,0,0,0,0,0,0,0,0,0,0
70918,What a fuckin n,ee7i7xe,D1ver_,sadcringe,t3_agnivt,t3_agnivt,1547661000.0,46,False,0,...,0,0,0,0,0,0,0,0,0,0
49093,I too am a vaginal flap surrounded by useless ...,eeznsow,DerpyUncleSteve,IncelTears,t3_ajok6p,t1_eeyhdcj,1548478000.0,57,False,0,...,0,0,0,0,0,0,0,0,0,0


### Listing the Emotion Columns by Type

In [3]:
# Emotion label columns grouped by coarse sentiment. Each name must stay a
# plain list so it can be used directly as a multi-column selector.
positive = (
    "admiration amusement approval caring curiosity desire excitement "
    "gratitude joy love optimism pride relief"
).split()
negative = (
    "anger annoyance confusion disappointment disapproval disgust "
    "embarrassment fear grief nervousness remorse sadness"
).split()
neutral = "realization surprise neutral".split()

### Assigning the Dataset Overall Emotion Labels

In [4]:
def Emotion_Labels(row):
    """Collapse one row of per-emotion flags into a single coarse label.

    The groups are checked in a fixed priority order: positive first, then
    negative, then neutral. The first group whose columns sum to more than
    zero decides the label, so a row flagged in several groups takes the
    earliest one.

    Parameters
    ----------
    row : a row indexable by the column-name lists ``positive``, ``negative``
        and ``neutral``.

    Returns
    -------
    "Positive", "Negative" or "Neutral"; ``pd.NA`` when no group has a
    positive sum.
    """
    sentiment_groups = (
        ("Positive", positive),
        ("Negative", negative),
        ("Neutral", neutral),
    )
    for sentiment, emotion_columns in sentiment_groups:
        if row[emotion_columns].sum() > 0:
            return sentiment
    return pd.NA

# Label every row (axis='columns' hands each row to the function) and preview.
df['Emotions'] = df.apply(Emotion_Labels, axis='columns')
df.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,Emotions
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,Negative
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,Neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,Positive
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,Neutral


### Checking for Null Values and Dropping Them if Any

In [5]:
# Count missing values per column; rows left unlabelled by Emotion_Labels show up under 'Emotions'.
df.isna().sum()

text                       0
id                         0
author                     0
subreddit                  0
link_id                    0
parent_id                  0
created_utc                0
rater_id                   0
example_very_unclear       0
admiration                 0
amusement                  0
anger                      0
annoyance                  0
approval                   0
caring                     0
confusion                  0
curiosity                  0
desire                     0
disappointment             0
disapproval                0
disgust                    0
embarrassment              0
excitement                 0
fear                       0
gratitude                  0
grief                      0
joy                        0
love                       0
nervousness                0
optimism                   0
pride                      0
realization                0
relief                     0
remorse                    0
sadness       

In [6]:
# Keep only rows with no missing value in any column (rebinding df instead of
# mutating it in place), then re-check the per-column null counts.
df = df.dropna()
df.isna().sum()

text                    0
id                      0
author                  0
subreddit               0
link_id                 0
parent_id               0
created_utc             0
rater_id                0
example_very_unclear    0
admiration              0
amusement               0
anger                   0
annoyance               0
approval                0
caring                  0
confusion               0
curiosity               0
desire                  0
disappointment          0
disapproval             0
disgust                 0
embarrassment           0
excitement              0
fear                    0
gratitude               0
grief                   0
joy                     0
love                    0
nervousness             0
optimism                0
pride                   0
realization             0
relief                  0
remorse                 0
sadness                 0
surprise                0
neutral                 0
Emotions                0
dtype: int64

### Finding out the Value Counts of Emotions in the Dataset

In [7]:
# Class distribution of the coarse sentiment label.
df['Emotions'].value_counts()

Emotions
Positive    90895
Neutral     63706
Negative    53213
Name: count, dtype: int64

### Preparing the Dataset for Training and Testing

In [8]:
# Build the three pieces of the model input separately, then join them.
comment_part = df['text'].astype(str)
author_part = ' | Author: ' + df['author'].astype(str)
subreddit_part = ' | Subreddit: ' + df['subreddit'].astype(str)

# Two-column modelling frame: combined text and its sentiment label.
dff = pd.DataFrame()
dff['Text'] = comment_part + author_part + subreddit_part
dff['Emotions'] = df['Emotions']
dff.head()

Unnamed: 0,Text,Emotions
0,That game hurt. | Author: Brdd9 | Subreddit: nrl,Negative
2,"You do right, if you don't care then fuck 'em!...",Neutral
3,Man I love reddit. | Author: MrsRobertshaw | S...,Positive
4,"[NAME] was nowhere near them, he was by the Fa...",Neutral
5,Right? Considering it’s such an important docu...,Positive


In [9]:
# Drop the names of the raw and merged frames; only dff is used from here on.
del df1,df2,df3,df

### Cleaning the Text

In [10]:
def clean(text):
    """Normalise one raw text value into lowercase alphanumeric tokens.

    Steps, in order:
      1. Missing values become an empty string; anything else is coerced to
         ``str`` and lowercased.
      2. ``r/<name>`` references and newlines are removed.
      3. E-mail addresses and URLs are removed while their punctuation is
         still intact.
      4. Brackets, pipes, colons and straight/curly quotes become spaces.
      5. The remaining neattext helpers run (stopwords, user handles, phone
         numbers, emojis, punctuation, repeated spaces).
      6. Every character outside ``a-z``, ``0-9`` and space becomes a space,
         whitespace runs collapse to one space, and the result is stripped.

    Parameters
    ----------
    text : value to clean; normally a string, may be a missing value.

    Returns
    -------
    str : the cleaned text, possibly empty.
    """
    if pd.isna(text):
        return ""
    # str() keeps a stray non-string value (e.g. a number) from breaking .lower().
    text = str(text).lower()
    text = re.sub(r'\br/\w+', '', text)
    text = text.replace('\n', ' ')

    # Pattern-based removals must see the raw punctuation: once ':' and the
    # brackets are blanked out below, "http://..." no longer looks like a URL
    # and its fragments would survive as ordinary tokens.
    text = nfx.remove_emails(text)
    text = nfx.remove_urls(text)

    # This class already removes every '|', so no separate pass for repeated
    # pipes is needed afterwards.
    text = re.sub(r'[{}\[\]()\|:\"\']', ' ', text)
    text = re.sub(r'[“”‘’]', ' ', text)

    text = nfx.remove_stopwords(text)
    text = nfx.remove_userhandles(text)
    text = nfx.remove_phone_numbers(text)
    text = nfx.remove_emojis(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_multiple_spaces(text)

    # Final safety net: anything that is still not a lowercase letter, digit
    # or space becomes a space, then whitespace is collapsed.
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

# Clean every combined text value element by element, then preview the result.
dff['Text'] = dff['Text'].map(clean)
dff.head(5)

Unnamed: 0,Text,Emotions
0,game hurt author brdd9 subreddit nrl,Negative
2,right care fuck em author labalool subreddit c...,Neutral
3,man love reddit author mrsrobertshaw subreddit...,Positive
4,near them falcon author americanfascist713 sub...,Neutral
5,right considering important document know damn...,Positive


### Splitting the Dataset into Training and Testing Sets

In [11]:
# Features are the cleaned text; targets are the coarse sentiment labels.
x, y = dff['Text'], dff['Emotions']

# Hold out 20% for testing, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Shape of the Training and Testing Sets

In [12]:
# Print a caption, then let the notebook echo the training-feature shape.
print('X-Train Shape :')
x_train.shape

X-Train Shape :


(166251,)

In [13]:
# Print a caption, then let the notebook echo the training-label shape.
print('Y-Train Shape :')
y_train.shape

Y-Train Shape :


(166251,)

In [14]:
# Print a caption, then let the notebook echo the test-feature shape.
print('X-Test Shape :')
x_test.shape

X-Test Shape :


(41563,)

In [15]:
# Print a caption, then let the notebook echo the test-label shape.
print('Y-Test Shape :')
y_test.shape

Y-Test Shape :


(41563,)

### Creating the Pipelines

In [16]:
# Logistic Regression: TF-IDF over word uni- and bi-grams feeding a
# class-balanced, L2-penalised logistic regression.
log_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.5,
    stop_words='english',
)
log_clf = LogisticRegression(
    C=1,
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    max_iter=300,
    random_state=42,
)
log_pipe = Pipeline(steps=[('tfidf', log_tfidf), ('clf', log_clf)])

In [17]:
# XGBoost: TF-IDF over word uni- and bi-grams feeding a gradient-boosted
# multi-class classifier. The classifier must be fitted on integer-encoded
# class labels.
# `use_label_encoder` is intentionally not passed: it is a deprecated argument
# that recent XGBoost releases ignore with a "parameter not used" warning.
xgb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, stop_words='english')),
    ('clf', XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1,
                         objective='multi:softmax', eval_metric='mlogloss',
                         random_state=42))
])

In [18]:
# LightGBM: TF-IDF over word uni- and bi-grams feeding a class-balanced
# gradient-boosted classifier with its logging silenced.
lgbm_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    stop_words='english',
)
lgbm_clf = LGBMClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42,
    verbosity=-1,
)
lgbm_pipe = Pipeline(steps=[('tfidf', lgbm_tfidf), ('clf', lgbm_clf)])

### Training the Models

In [19]:
# Logistic Regression
# Fits the vectorizer and the classifier on the training split, using the string labels directly.
log_pipe.fit(x_train,y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,300


In [20]:
# XGBoost
# Learn the label-to-integer mapping from the training labels, then encode
# them; the encoder object is kept so predictions can be decoded later.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)

# Train the pipeline on the integer-encoded targets.
xgb_pipe.fit(x_train, y_train_encoded)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
# LightGBM
# Fits the vectorizer and the classifier on the training split, using the string labels directly.
lgbm_pipe.fit(x_train,y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


### Saving the Pipelines

In [22]:
# Persist each fitted pipeline under its own file name.
pipeline_files = (
    (log_pipe, 'lor_pipe.joblib'),    # Logistic Regression
    (xgb_pipe, 'xgb_pipe.joblib'),    # XGBoost
    (lgbm_pipe, 'lgbm_pipe.joblib'),  # LightGBM
)
for fitted_pipeline, file_name in pipeline_files:
    written_files = joblib.dump(fitted_pipeline, file_name)

# Echo the list returned by the final dump as the cell output.
written_files

['lgbm_pipe.joblib']

### Saving the Label Encoder and Test Dataset

In [23]:
# Label Encoder
# Saved next to the pipelines so the XGBoost pipeline's integer predictions can be mapped back to label names.
joblib.dump(le, 'label_encoder.joblib')

['label_encoder.joblib']

In [24]:
# Write the held-out features and labels to CSV without the index column.
for held_out, csv_name in ((x_test, 'x_test.csv'), (y_test, 'y_test.csv')):
    held_out.to_csv(csv_name, index=False)