In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [4]:
# Load the raw corpus: one sample per line, "<text>;<emotion>", with no header row.
df = pd.read_csv(
    "train.txt",
    sep=";",
    header=None,
    names=["text", "emotion"],
)

In [5]:
# Preview the first rows to confirm the file parsed into the `text` and `emotion` columns.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [6]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


In [7]:
# List the distinct emotion labels present in the data.
df["emotion"].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [8]:
# Number of rows per emotion label, shown to inspect class balance.
emotion_count = df['emotion'].value_counts()
emotion_count

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,5362
sadness,4666
anger,2159
fear,1937
love,1304
surprise,572


In [9]:
# Display the frame (pandas truncates the middle rows) to check the overall row count.
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [10]:
# Normalize case with the vectorized string accessor instead of a Python-level
# lambda; unlike `apply(lambda x: x.lower())` this leaves missing values as NaN
# rather than raising on them.
df["text"] = df["text"].str.lower()

In [11]:
import string

def remove_punc(txt):
  """Return `txt` with every character listed in `string.punctuation` deleted.

  Characters are removed, not replaced by spaces, so "well-known" becomes
  "wellknown".
  """
  # A translation table mapping a code point to None deletes that character.
  deletion_table = dict.fromkeys(map(ord, string.punctuation))
  return txt.translate(deletion_table)

In [12]:
# Strip punctuation from every row of the text column (overwrites the column in place).
df['text'] = df['text'].apply(remove_punc)

In [13]:
def remove_number(txt):
  """Return `txt` with every character for which `str.isdigit()` is true removed."""
  return "".join(ch for ch in txt if not ch.isdigit())

# Drop digits from every row of the text column (overwrites the column in place).
df['text'] = df['text'].apply(remove_number)

In [14]:
def remove_emojis(txt):
  """Return `txt` keeping only its ASCII characters.

  Despite the name, this drops every non-ASCII character (emojis, but also
  accented letters and other non-Latin text).
  """
  return "".join(filter(str.isascii, txt))

# Remove non-ASCII characters from every row of the text column (overwrites the column in place).
df['text'] = df['text'].apply(remove_emojis)

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [16]:
# Fetch the NLTK resources. 'stopwords' backs the stopwords.words('english') call below.
# NOTE(review): 'punkt' (the tokenizer data behind word_tokenize) does not appear to be
# used by any visible cell — confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
# Keep the English stop words in a set so remove_stopwords can do fast membership tests.
# NOTE(review): the NLTK list presumably contains negations such as "not"/"no";
# dropping them may blur emotion cues — confirm this is intended.
stop_words = set(stopwords.words('english'))

In [18]:
# Number of distinct stop words loaded.
len(stop_words)

198

In [19]:
# Sample sentence before stop-word removal. A single .loc[row, column] lookup
# replaces the chained df.loc[1]["text"], which first materializes the whole row.
df.loc[1, "text"]

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [20]:
def remove_stopwords(txt):
  """Return `txt` without the whitespace-separated tokens found in `stop_words`.

  The surviving tokens are re-joined with single spaces, so any run of
  whitespace in the input collapses to one space.
  """
  return " ".join(token for token in txt.split() if token not in stop_words)

In [21]:
# Drop stop words from every row of the text column (overwrites the column in place).
df['text'] = df['text'].apply(remove_stopwords)

In [22]:
# Same sample sentence after stop-word removal. A single .loc[row, column] lookup
# replaces the chained df.loc[1]["text"], which first materializes the whole row.
df.loc[1, "text"]

'go feeling hopeless damned hopeful around someone cares awake'

In [23]:
# Preview the text column after the cleaning steps above.
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger


In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [26]:
# Inspect the training texts; the original row index is preserved by the split.
X_train

Unnamed: 0,text
676,refers course though cant help feeling somehow...
12113,im starting feel im suffering fatigue
7077,feel like probably would liked book little bit...
13005,really feel awkward
12123,im feeling little grumpy today lame weather te...
...,...
13418,love leave reader feeling confused slightly de...
5390,feel delicate
860,starting feel little stressed
15795,feel stressed tired worn shape neglected


In [27]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Position in classes_ is the integer code assigned to each emotion.
print(le.classes_)


['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [28]:


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Bag-of-words baseline: the vocabulary is learned from the training split only
# and reused unchanged on the test split.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Multinomial Naive Bayes on raw term counts.
nb_model = MultinomialNB().fit(X_train_bow, y_train_enc)

pred_bow = nb_model.predict(X_test_bow)
bow_accuracy = accuracy_score(y_test_enc, pred_bow)
print(bow_accuracy)


0.768125


In [29]:
# TF-IDF features: the vectorizer is fitted on the training split only and
# reused unchanged on the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a separate Naive Bayes model on the TF-IDF features. Previously
# `nb_model2` was created but never used, and the bag-of-words `nb_model` was
# refit (and overwritten) instead.
nb_model2 = MultinomialNB()
nb_model2.fit(X_train_tfidf, y_train_enc)

# Keep the TF-IDF predictions under their own name so the bag-of-words
# predictions in `pred_bow` are not clobbered.
pred_tfidf = nb_model2.predict(X_test_tfidf)
print(accuracy_score(y_test_enc, pred_tfidf))


0.6609375


In [30]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the TF-IDF features.
# `multi_class` is deliberately not passed: the parameter is deprecated in recent
# scikit-learn releases and 'auto' was already the default behavior, so omitting
# it trains the same model without relying on a parameter slated for removal.
logistic_model = LogisticRegression(
    max_iter=2000,
    n_jobs=-1
)
logistic_model.fit(X_train_tfidf,y_train_enc)

# Accuracy on the held-out split.
log_pred = logistic_model.predict(X_test_tfidf)
print(accuracy_score(y_test_enc,log_pred ))

0.8628125


In [31]:
def predict_emotion(text):
    """Predict the emotion label for a single raw sentence.

    The input goes through the same cleaning steps that were applied to the
    training column (lowercasing, then punctuation, digit, non-ASCII and
    stop-word removal) before vectorization. Without this, tokens such as
    "don't" or "abc123" would be split or left unmatched at inference time even
    though their cleaned forms ("dont", "abc") are in the fitted vocabulary.

    Args:
        text: Raw input sentence (str).

    Returns:
        The predicted emotion as the original string label.
    """
    cleaned = text.lower()
    # Same order as the preprocessing cells that prepared the training text.
    for clean_step in (remove_punc, remove_number, remove_emojis, remove_stopwords):
        cleaned = clean_step(cleaned)

    text_vec = tfidf_vectorizer.transform([cleaned])
    pred = logistic_model.predict(text_vec)
    # Map the integer class code back to its emotion name.
    return le.inverse_transform(pred)[0]

# Smoke-test the classifier on a handful of hand-written sentences.
sample_sentences = (
    "I feel very happy today",
    "I am feeling very sad",
    "I am angry with you",
    "I love my family",
    "I am scared of exams",
)
for sentence in sample_sentences:
    print(predict_emotion(sentence))

joy
sadness
anger
joy
fear


In [32]:
import joblib

# Persist the three fitted objects needed for inference: the vectorizer, the
# classifier, and the label encoder. The text-cleaning functions are not saved,
# so any consumer of these files has to reproduce that cleaning itself.
artifacts = {
    "tfidf.joblib": tfidf_vectorizer,
    "emotion_model.joblib": logistic_model,
    "label_encoder.joblib": le,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("All files saved successfully using joblib")


All files saved successfully
