In [1]:
# Mount Google Drive into the Colab runtime at /content/drive/; the dataset
# files read by later cells are addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# Fetch only the NLTK resources this notebook actually uses instead of the
# whole 'all' collection: the English stop-word list and the WordNet data
# behind WordNetLemmatizer ('omw-1.4' is required by WordNet in recent NLTK
# releases). Already-present resources are detected and skipped by NLTK.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [4]:
# Load the training split. Every line is kept whole in a single 'Message'
# column (e.g. "i didnt feel humiliated;sadness"); the label is separated
# from the text in a later cell.
# NOTE(review): read_csv still uses its default ',' separator here, so this
# relies on the raw lines containing no commas - confirm against the data.
train_path = '/content/drive/MyDrive/Colab Notebooks/NLP Datasets/Emotions dataset/train.txt'
training_dataset = pd.read_csv(
    train_path,
    names=['Message'],
    na_values=np.nan,
)
training_dataset

Unnamed: 0,Message
0,i didnt feel humiliated;sadness
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wro...
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy;anger
...,...
15995,i just had a very brief time in the beanbag an...
15996,i am now turning and i feel pathetic that i am...
15997,i feel strong and good overall;joy
15998,i feel like this was such a rude comment and i...


In [5]:
# Load the validation split the same way as the training split, then stack
# both into one frame with a fresh 0..n-1 index.
val_path = '/content/drive/MyDrive/Colab Notebooks/NLP Datasets/Emotions dataset/val.txt'
val_dataset = pd.read_csv(
    val_path,
    names=['Message'],
    na_values=np.nan,
)
dataset = pd.concat(objs=[training_dataset, val_dataset], ignore_index=True)
dataset

Unnamed: 0,Message
0,i didnt feel humiliated;sadness
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wro...
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy;anger
...,...
17995,im having ssa examination tomorrow in the morn...
17996,i constantly worry about their fight against n...
17997,i feel its important to share this info for th...
17998,i truly feel that if you are passionate enough...


In [6]:
# Each raw line is "<message>;<emotion>". Split once, from the right, and at
# most one time: the label is always the last field, so a message that itself
# contains ';' keeps its full text instead of leaking a text fragment into
# the 'Sentiment' column.
message_and_label = dataset['Message'].str.rsplit(';', n=1)
dataset['Sentiment'] = message_and_label.str[1]
dataset['Message'] = message_and_label.str[0]
dataset

Unnamed: 0,Message,Sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
17995,im having ssa examination tomorrow in the morn...,sadness
17996,i constantly worry about their fight against n...,joy
17997,i feel its important to share this info for th...,joy
17998,i truly feel that if you are passionate enough...,joy


In [7]:
# Class distribution of the emotion labels, most frequent first.
sentiment_counts = dataset['Sentiment'].value_counts()
sentiment_counts

joy         6066
sadness     5216
anger       2434
fear        2149
love        1482
surprise     653
Name: Sentiment, dtype: int64

In [8]:
emotions = ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise']

# Integer code of an emotion = its position in `emotions`.
emotion_to_id = {emotion: code for code, emotion in enumerate(emotions)}

# Assign the result back to the column instead of calling
# `dataset['Sentiment'].replace(..., inplace=True)`: that form mutates a
# temporary Series selected from the frame, which emits a chained-assignment
# warning in pandas 2.x and silently leaves `dataset` unchanged once
# Copy-on-Write is active. A single dict-based replace also avoids one full
# pass over the column per emotion, and leaves already-encoded values alone,
# so re-running the cell is harmless.
# `infer_objects()` turns the resulting object column of Python ints into a
# proper int64 column on pandas versions where `replace` no longer downcasts.
dataset['Sentiment'] = dataset['Sentiment'].replace(emotion_to_id).infer_objects()

dataset['Sentiment'].value_counts()

0    6066
1    5216
2    2434
3    2149
4    1482
5     653
Name: Sentiment, dtype: int64

In [9]:
# Remove rows whose 'Message' repeats an earlier row, keeping the first
# occurrence. One vectorised call replaces the previous loop that dropped a
# single row per iteration (each drop rebuilt the whole frame).
# The frame is modified in place and the original index labels of the
# surviving rows are kept, exactly as before.
dataset.drop_duplicates(subset='Message', keep='first', inplace=True)

In [10]:
lemmatization = WordNetLemmatizer()

# Build the stop-word set once; previously it was rebuilt from the corpus
# for every single token, which dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))


def preprocess_message(text):
    """Normalise one raw message into a space-joined string of lemmas.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stop words, lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; empty if no token survives filtering.
    """
    # The character class is 'A-Z' (the original 'A-z' range also matched
    # the ASCII characters between 'Z' and 'a': [ \ ] ^ _ `).
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        lemmatization.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )


final_text = [preprocess_message(message) for message in dataset['Message']]

dataset['Processed_Text'] = final_text

In [11]:
# The raw text is no longer needed once 'Processed_Text' exists; remove the
# column from the frame in place and show the result.
dataset.drop(columns=['Message'], inplace=True)
dataset

Unnamed: 0,Sentiment,Processed_Text
0,1,didnt feel humiliated
1,1,go feeling hopeless damned hopeful around some...
2,2,im grabbing minute post feel greedy wrong
3,4,ever feeling nostalgic fireplace know still pr...
4,2,feeling grouchy
...,...,...
17995,1,im ssa examination tomorrow morning im quite w...
17996,0,constantly worry fight nature push limit inner...
17997,0,feel important share info experience thing
17998,0,truly feel passionate enough something stay tr...


In [12]:
create_vectors = TfidfVectorizer()

# Keep the TF-IDF matrix in the sparse form returned by fit_transform.
# Calling .toarray() materialised a dense (n_documents x vocabulary) float64
# array - roughly 2 GB for the shape shown below - although almost every
# entry is zero. train_test_split, SMOTE and MultinomialNB, the only
# consumers in this notebook, all accept sparse input.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/validation split, so the vocabulary and IDF weights are learned from
# rows that later end up in the validation set. Fitting on the training rows
# only would require moving the split ahead of this cell.
text_vectors = create_vectors.fit_transform(dataset['Processed_Text'])

In [13]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
n_documents, n_features = text_vectors.shape
(n_documents, n_features)

(17962, 14295)

In [14]:
# Feature matrix and target vector for the modelling steps below.
X, y = text_vectors, dataset['Sentiment']

In [33]:
sample_size = 12600
val_size = 5400

# Step 1: draw a stratified sample of `sample_size` rows. train_test_split
# returns (X_rest, X_sample, y_rest, y_sample); only the "test" halves,
# i.e. the sample, are kept.
sampled_parts = train_test_split(
    X, y,
    test_size=sample_size,
    stratify=y,
    random_state=42,
)
X_train, y_train = sampled_parts[1], sampled_parts[3]

# Step 2: carve `val_size` rows out of that sample as the validation set,
# again stratified on the label.
(X_train_processed,
 X_val_processed,
 y_train_processed,
 y_val_processed) = train_test_split(
    X_train, y_train,
    test_size=val_size,
    stratify=y_train,
    random_state=21,
)

split_shapes = [
    part.shape
    for part in (X_train_processed, y_train_processed, X_val_processed, y_val_processed)
]
print(*split_shapes)

(7200, 14295) (7200,) (5400, 14295) (5400,)


In [44]:
# Oversample every class except the majority one on the training portion
# only; the validation rows are left untouched.
sampler = SMOTE(random_state=42, sampling_strategy='not majority')
resampled = sampler.fit_resample(X_train_processed, y_train_processed)
X_resampled, y_resampled = resampled
print(*(part.shape for part in (X_resampled, y_resampled)))

(14556, 14295) (14556,)


In [45]:
# Per-class row counts: resampled training labels first, then the
# validation labels.
resampled_counts = y_resampled.value_counts()
validation_counts = y_val_processed.value_counts()
print(resampled_counts, validation_counts)

3    2426
1    2426
0    2426
5    2426
2    2426
4    2426
Name: Sentiment, dtype: int64 0    1819
1    1568
2     730
3     645
4     443
5     195
Name: Sentiment, dtype: int64


In [46]:
# Fit a multinomial Naive Bayes classifier on the resampled training data;
# fit() returns the estimator itself, which is then shown as the cell output.
model = MultinomialNB().fit(X_resampled, y_resampled)
model

MultinomialNB()

In [47]:
# Predict a class label for every row of the validation matrix with the
# fitted model; the result is compared against y_val_processed below.
predicted_data = model.predict(X_val_processed)

In [48]:
# Score the validation predictions: per-class report, confusion matrix
# (rows = true label, columns = predicted label) and support-weighted F1.
classific_rep = metrics.classification_report(
    y_true=y_val_processed, y_pred=predicted_data
)
con_mat = metrics.confusion_matrix(
    y_true=y_val_processed, y_pred=predicted_data
)
f1 = metrics.f1_score(
    y_true=y_val_processed, y_pred=predicted_data, average='weighted'
)
print(classific_rep, con_mat, f1)

              precision    recall  f1-score   support

           0       0.89      0.82      0.85      1819
           1       0.88      0.82      0.85      1568
           2       0.80      0.82      0.81       730
           3       0.75      0.77      0.76       645
           4       0.64      0.76      0.69       443
           5       0.49      0.77      0.60       195

    accuracy                           0.81      5400
   macro avg       0.74      0.80      0.76      5400
weighted avg       0.82      0.81      0.81      5400
 [[1490   65   46   42  122   54]
 [  66 1293   65   63   39   42]
 [  32   45  601   36   12    4]
 [  19   36   27  499   13   51]
 [  62   18    9   12  337    5]
 [  13    6    6   15    5  150]] 0.8133846747350455
