### Importing Python Libraries

In [1]:
import re
import warnings
from pathlib import Path

import joblib
import neattext.functions as nfx
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

### Loading the Datasets and Merging Them into a Single Dataset

In [2]:
# Read the three GoEmotions shards. The names df1..df3 are kept because a later
# cell deletes them explicitly.
df1 = pd.read_csv('../../../Datasets/goemotions_1.csv')
df2 = pd.read_csv('../../../Datasets/goemotions_2.csv')
df3 = pd.read_csv('../../../Datasets/goemotions_3.csv')

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it each
# shard keeps its own 0-based labels, so one label can refer to up to three
# different rows and label-based access (df.loc[i]) silently returns several.
df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
df.sample(10)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
18083,Yeah...when can the neckbeard asking this ques...,edconw4,Skankinzombie22,entertainment,t3_acuqzs,t1_edazts4,1546744000.0,26,False,0,...,0,0,0,0,0,0,0,0,0,1
62843,That really sucks. How old is your baby?,ed4coog,JMO130,breakingmom,t3_ac0a53,t3_ac0a53,1546480000.0,56,False,0,...,0,0,0,0,0,0,0,0,0,0
1833,Back at it with the crazy [RELIGION] conspirac...,eem0tlc,negev67,TrueReddit,t3_ai16xe,t1_eel9mdj,1548071000.0,46,False,0,...,0,0,0,0,0,0,0,0,0,0
41754,"Is that you, [NAME]?",edeh8k8,the_unseen_one,MensRights,t3_ad4vrh,t1_eddq72h,1546799000.0,70,False,0,...,0,0,0,0,0,0,0,0,0,0
68103,Omg I know. She honestly looks so uncomfortabl...,ed1uxk0,123itsbritneybitch,90DayFiance,t3_abjgb5,t1_ed1ttbv,1546393000.0,57,False,0,...,0,0,0,0,0,0,0,0,0,0
61892,Cat looks like it’s going to tell you about ho...,ed82o39,Krsto7,Justfuckmyshitup,t3_aciaq9,t3_aciaq9,1546610000.0,57,False,0,...,0,0,0,0,0,0,0,0,0,1
18301,It’s perfect!,edgi36m,licksipsick,adultery,t3_addzmu,t1_edg7fl7,1546850000.0,8,False,1,...,0,0,0,0,0,0,0,0,0,0
34877,"This is sad but not cringe, I hope he finds so...",ed5plq4,alexander220204,sadcringe,t3_ac6js5,t3_ac6js5,1546533000.0,72,False,0,...,0,0,0,0,0,0,0,0,0,1
15318,My crazy idea ... join the PAC-12 in a combine...,eej7icr,CreativeIronicHandle,CollegeBasketball,t3_ahv3u2,t1_eeimar7,1547994000.0,4,False,0,...,0,0,0,0,0,0,0,0,0,1
63826,Glad to see you were able to get this worked o...,edi72ov,Araxom,Overwatch,t3_ac298q,t1_edgh83v,1546894000.0,4,False,0,...,0,0,0,0,0,0,0,0,0,0


### Listing the Emotions Columns by Type

In [3]:
# Fine-grained emotion columns grouped into three coarse sentiment buckets.
# They stay plain lists so they can be used directly as column selectors.
positive = (
    "admiration amusement approval caring curiosity desire excitement "
    "gratitude joy love optimism pride relief"
).split()

negative = (
    "anger annoyance confusion disappointment disapproval disgust "
    "embarrassment fear grief nervousness remorse sadness"
).split()

neutral = "realization surprise neutral".split()

### Assigning an Overall Emotion Label to Each Row

In [4]:
def Emotion_Labels(row):
    """Collapse one row of fine-grained emotion flags into a single coarse label.

    The buckets are checked in a fixed order -- positive, then negative, then
    neutral -- and the first bucket with any flag set wins. A row flagged with
    both a positive and a negative emotion is therefore labelled "Positive".

    Returns "Positive", "Negative" or "Neutral", or ``pd.NA`` when no flag in
    any bucket is set.
    """
    buckets = (
        ("Positive", positive),
        ("Negative", negative),
        ("Neutral", neutral),
    )
    for label, columns in buckets:
        if row[columns].sum() > 0:
            return label
    return pd.NA

df['Emotions'] = df.apply(Emotion_Labels, axis=1)
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,Emotions
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,Negative
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,Neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,Positive
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,Neutral


### Checking for Null Values and Dropping Them

In [5]:
# Per-column count of missing values. Rows that matched no emotion bucket were
# assigned pd.NA and are counted under 'Emotions'.
df.isna().sum()

text                       0
id                         0
author                     0
subreddit                  0
link_id                    0
parent_id                  0
created_utc                0
rater_id                   0
example_very_unclear       0
admiration                 0
amusement                  0
anger                      0
annoyance                  0
approval                   0
caring                     0
confusion                  0
curiosity                  0
desire                     0
disappointment             0
disapproval                0
disgust                    0
embarrassment              0
excitement                 0
fear                       0
gratitude                  0
grief                      0
joy                        0
love                       0
nervousness                0
optimism                   0
pride                      0
realization                0
relief                     0
remorse                    0
sadness       

In [6]:
# Drop only the rows whose derived label is missing. A bare dropna() would also
# discard rows for a gap in any unrelated metadata column (author, parent_id,
# ...), which is not the intent of this step.
df = df.dropna(subset=['Emotions'])
df.isna().sum()

text                    0
id                      0
author                  0
subreddit               0
link_id                 0
parent_id               0
created_utc             0
rater_id                0
example_very_unclear    0
admiration              0
amusement               0
anger                   0
annoyance               0
approval                0
caring                  0
confusion               0
curiosity               0
desire                  0
disappointment          0
disapproval             0
disgust                 0
embarrassment           0
excitement              0
fear                    0
gratitude               0
grief                   0
joy                     0
love                    0
nervousness             0
optimism                0
pride                   0
realization             0
relief                  0
remorse                 0
sadness                 0
surprise                0
neutral                 0
Emotions                0
dtype: int64

### Finding out the Value Counts of Emotions in the Dataset

In [7]:
# Class distribution before balancing.
df['Emotions'].value_counts()

Emotions
Positive    90895
Neutral     63706
Negative    53213
Name: count, dtype: int64

### Balancing the Classes

In [8]:
# Every class is resampled to exactly `target` rows.
target = 53000
classes = ['Positive', 'Neutral', 'Negative']

balanced_frames = []
for cls in classes:
    cls_df = df.loc[df['Emotions'].eq(cls)]
    # Sample with replacement only when the class has fewer rows than the
    # target; otherwise draw a plain subsample without replacement.
    oversample = len(cls_df) < target
    cls_df_bal = cls_df.sample(n=target, replace=oversample, random_state=42)
    balanced_frames.append(cls_df_bal)

In [9]:
# Stack the per-class samples, shuffle them so the classes are interleaved,
# and renumber the rows from zero.
bdf = pd.concat(balanced_frames)
bdf = bdf.sample(frac=1, random_state=42)
bdf = bdf.reset_index(drop=True)
bdf['Emotions'].value_counts()

Emotions
Neutral     53000
Negative    53000
Positive    53000
Name: count, dtype: int64

### Preparing the Dataset for Training and Testing

In [10]:
# Model input: the comment text with its author and subreddit appended as
# extra tokens, next to the coarse emotion label.
dff = pd.DataFrame({
    'Text': (
        bdf['text'].astype(str)
        + ' | Author: ' + bdf['author'].astype(str)
        + ' | Subreddit: ' + bdf['subreddit'].astype(str)
    ),
    'Emotions': bdf['Emotions'],
})
dff.head()

Unnamed: 0,Text,Emotions
0,Some couples have a “cheat list” of celebritie...,Neutral
1,"I am 63, D cups, no sag. I must be a unicorn! ...",Neutral
2,Unfortunately no one usually fights for their ...,Negative
3,Is everybody forgetting that bills need to be ...,Negative
4,"Nice, congrats! | Author: sarcasmbunny | Subre...",Positive


In [11]:
# Release the large intermediate frames; the following cells only use dff.
# NOTE: re-running this cell on its own raises NameError because the names are gone.
del df1,df2,df3,df,bdf

### Cleaning the Text

In [12]:
def clean(text):
    """Normalise one raw document into lowercase alphanumeric tokens.

    Steps, in order: lowercase; remove URLs, e-mail addresses, @handles, phone
    numbers and ``r/<subreddit>`` mentions; replace brackets, pipes, colons and
    quotes with spaces; remove stop words, emojis and remaining punctuation
    (via neattext); collapse whitespace.

    Parameters
    ----------
    text : str or missing
        Raw text. A missing value (``pd.isna``) yields an empty string; any
        other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        Cleaned text made up only of ``a-z``, ``0-9`` and single spaces.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower().replace('\n', ' ')

    # Pattern-based removals have to see the original punctuation. Once ':' and
    # the brackets are replaced below, 'https://...' is no longer recognisable
    # as a URL and its pieces would survive as ordinary tokens.
    text = nfx.remove_urls(text)
    text = nfx.remove_emails(text)
    text = nfx.remove_userhandles(text)
    text = nfx.remove_phone_numbers(text)
    text = re.sub(r'\br/\w+', '', text)

    # Structural characters; this also strips the ' | Author: ' separators.
    # No separate pass for runs of '|' is needed: every '|' is replaced here.
    text = re.sub(r'[{}\[\]()\|:\"\']', ' ', text)
    text = re.sub(r'[“”‘’]', ' ', text)

    text = nfx.remove_stopwords(text)
    text = nfx.remove_emojis(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_multiple_spaces(text)

    # Safety net: anything still outside a-z / 0-9 / space becomes a space.
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

dff['Text'] = dff['Text'].apply(clean)
dff.head()

Unnamed: 0,Text,Emotions
0,couples cheat list celebrities entitled with b...,Neutral
1,63 cups sag unicorn author getoffmylawn subred...,Neutral
2,unfortunately usually fights rights martyr aut...,Negative
3,everybody forgetting bills need signed preside...,Negative
4,nice congrats author sarcasmbunny subreddit as...,Positive


In [13]:
# Class distribution of the cleaned frame (cleaning only rewrites 'Text').
dff['Emotions'].value_counts()

Emotions
Neutral     53000
Negative    53000
Positive    53000
Name: count, dtype: int64

### Splitting the Dataset into Training and Testing Sets

In [14]:
x, y = dff['Text'], dff['Emotions']

# 80/20 split, stratified on the label so both sets keep the class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Shape of the Training and Testing Sets

In [15]:
# Number of training documents.
print('X-Train Shape :')
x_train.shape

X-Train Shape :


(127200,)

In [16]:
# Number of training labels.
print('Y-Train Shape :')
y_train.shape

Y-Train Shape :


(127200,)

In [17]:
# Number of held-out test documents.
print('X-Test Shape :')
x_test.shape

X-Test Shape :


(31800,)

In [18]:
# Number of held-out test labels.
print('Y-Test Shape :')
y_test.shape

Y-Test Shape :


(31800,)

### Creating the Pipelines

In [19]:
# Logistic Regression
log_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.5, stop_words='english')),
    ('clf', LogisticRegression(C=1, penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=300, random_state=42))
])

In [20]:
# XGBosst
xgb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, stop_words='english')),
    ('clf', XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1,
                         objective='multi:softmax', eval_metric='mlogloss',
                         use_label_encoder=False, random_state=42))
])

In [21]:
# LightBGM
lgbm_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, stop_words='english')),
    ('clf', LGBMClassifier(n_estimators=200, max_depth=10, learning_rate=0.1,
                           class_weight='balanced', random_state=42, verbosity=-1))
])

### Training the Models

In [22]:
# Logistic Regression
log_pipe.fit(x_train,y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,300


In [23]:
# XGBoost
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

xgb_pipe.fit(x_train,y_train_encoded)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [24]:
# LightBGM
lgbm_pipe.fit(x_train,y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


### Saving the Pipelines, Label Encoder and Test Dataset

In [25]:
# Logistic Regression
joblib.dump(log_pipe, '../Models/log_pipe.joblib')

# XGBoost
joblib.dump(xgb_pipe, '../Models/xgb_pipe.joblib')

# LightBGM
joblib.dump(lgbm_pipe, '../Models/lgbm_pipe.joblib')

# Label Encoder
joblib.dump(le, '../Models/label_encoder.joblib')

# Test Dataset
x_test.to_csv('../Test Datasets/x_test.csv', index=False)
y_test.to_csv('../Test Datasets/y_test.csv', index=False)