In [192]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 

In [193]:
# Load the raw corpus: train.txt is read as semicolon-separated with no header
# row, so the two column names are supplied explicitly.
df = pd.read_csv('train.txt' , sep=';' , header=None , names=['Text' , 'Emotion'])

In [194]:
# Preview the first rows to confirm the text and label columns parsed as expected.
df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [195]:
# Distinct emotion labels. Series.unique() keeps first-appearance order, so the
# integer codes derived from this array depend on the row order of the input file.
emotions_values = df['Emotion'].unique()

In [196]:
# Label -> integer code lookup; a label's code is its position in emotions_values.
emotions = {label: code for code, label in enumerate(emotions_values)}

In [197]:
# Show the mapping so the integer codes used below can be traced back to labels.
emotions

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [198]:
# Replace each emotion label with its integer code.
# NOTE: not idempotent - re-running this cell looks the integer codes up in the
# dict, finds no matching key and leaves NaN; reload df before running it again.
df['Emotion'] = df['Emotion'].map(emotions)

In [199]:
# Lower-case every text. The built-in .str accessor replaces the per-row Python
# lambda and passes missing values through instead of raising AttributeError.
df['Text'] = df['Text'].str.lower()

In [200]:
# remove punctuation
import string 
# Deletion table built once instead of on every call. The third argument of
# str.maketrans lists the characters to delete; the first two (1:1 replacement
# pairs) are intentionally empty.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(txt):
    """Return ``txt`` with every character in ``string.punctuation`` removed.

    Only the ASCII punctuation set is stripped; whitespace, letters, digits and
    non-ASCII symbols are left untouched.

    Args:
        txt: The string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    return txt.translate(_PUNCTUATION_TABLE)

In [201]:
# Strip ASCII punctuation from every text.
df['Text'] = df['Text'].apply(remove_punctuations)

In [202]:
# remove numbers (digit characters)
def remove_num(txt):
    """Return ``txt`` with every digit character dropped.

    A character counts as a digit when ``str.isdigit`` reports it as one; all
    other characters are kept in their original order.
    """
    return "".join(ch for ch in txt if not ch.isdigit())
# Drop digit characters from every text.
df["Text"] = df['Text'].apply(remove_num)

In [203]:
# remove emojis (done by dropping every non-ASCII character)
def remove_emoji(txt):
    """Return ``txt`` restricted to its ASCII characters.

    Emojis are removed as a consequence, but so is every other non-ASCII
    character (accented letters, typographic quotes, ...).
    """
    return "".join(filter(str.isascii, txt))

# Keep only the ASCII characters of every text.
df['Text'] = df['Text'].apply(remove_emoji)

In [204]:
import nltk 

In [205]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [206]:
# Fetch the NLTK resources used by this notebook; 'stopwords' backs the
# stopwords.words('english') call that builds the stop-word set.
# NOTE(review): 'punkt' presumably serves word_tokenize, which is imported but
# not called in the cells visible here - confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [207]:
# Stored as a set because remove_stopwords does one membership test per word.
stop_words = set(stopwords.words('english'))

In [208]:
def remove_stopwords(txt):
    """Return ``txt`` without the words found in the global ``stop_words``.

    The text is split on whitespace, so the surviving words come back joined
    by single spaces regardless of the original spacing.
    """
    kept = [word for word in txt.split() if word not in stop_words]
    return ' '.join(kept)


In [209]:
# Drop English stop words from every text.
df['Text'] = df['Text'].apply(remove_stopwords)

In [210]:
# Bag of words: fit a CountVectorizer on the whole cleaned corpus and inspect the
# learned vocabulary and the document-term counts. This fit is exploratory only;
# vc is fitted again on the training split before any model is trained.
from sklearn.feature_extraction.text import CountVectorizer
vc = CountVectorizer()
X = vc.fit_transform(df['Text'])

print(f'vocablary : {vc.get_feature_names_out()}')
# NOTE(review): toarray() builds the full dense documents x vocabulary matrix only
# to print a truncated view; X.shape or a small slice would be far cheaper.
print(f'\nBow matrix : {X.toarray()}')

vocablary : ['aa' 'aaaaaaand' 'aaaaand' ... 'zum' 'zumba' 'zz']

Bow matrix : [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [211]:
# TF-IDF (term frequency - inverse document frequency): fit a TfidfVectorizer on
# the whole cleaned corpus and inspect its vocabulary and weights. The name X is
# reused, so the earlier value of X is no longer available after this cell.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer() 

X = tfidf_vectorizer.fit_transform(df['Text'])

print(f'vocablary : {tfidf_vectorizer.get_feature_names_out()}')
# NOTE(review): the label below still says "Bow matrix" although X now holds the
# TF-IDF weights; toarray() also densifies the whole matrix just for printing.
print(f'\nBow matrix : {X.toarray()}')


vocablary : ['aa' 'aaaaaaand' 'aaaaand' ... 'zum' 'zumba' 'zz']

Bow matrix : [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [212]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Emotion'],
    test_size=0.2,
    random_state=42,
)

In [213]:
# Refit the count vectorizer on the training texts only, then encode the test
# texts with that same fitted vocabulary (transform only, no fitting on test data).
X_train_bow = vc.fit_transform(X_train)
X_test_bow = vc.transform(X_test)

In [214]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Baseline 1: multinomial Naive Bayes with default settings on bag-of-words counts.
nb_model= MultinomialNB()
nb_model.fit(X_train_bow , y_train)

In [215]:
# Predicted emotion codes for the held-out texts (bag-of-words Naive Bayes).
preb_nb = nb_model.predict(X_test_bow)

In [216]:
# Share of held-out texts whose emotion the bag-of-words Naive Bayes model got right.
accuracy_score(y_test , preb_nb)

0.768125

In [217]:
# Refit the TF-IDF vectorizer on the training texts only, then encode the test
# texts with it (transform only, no fitting on test data).
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [218]:
# Separate Naive Bayes instance for the TF-IDF features, so nb_model stays untouched.
nb_model_2 = MultinomialNB()

In [219]:
# Baseline 2: train Naive Bayes on the TF-IDF features, predict the held-out
# texts and report the accuracy as the cell output.
nb_model_2.fit(X_train_tfidf,y_train)
preb_nb_2 = nb_model_2.predict(X_test_tfidf)
accuracy_score(y_test, preb_nb_2)

0.6609375

In [220]:
from sklearn.linear_model import LogisticRegression
# Two independent logistic-regression models, one per feature set. max_iter is
# raised to 1000 (presumably so the solver converges on these features - TODO confirm).
model_lg_1 = LogisticRegression(max_iter=1000)
model_lg_2 = LogisticRegression(max_iter=1000)

In [221]:
# model_lg_1 learns from the bag-of-words counts, model_lg_2 from the TF-IDF weights.
model_lg_1.fit(X_train_bow, y_train)
model_lg_2.fit(X_train_tfidf , y_train)

In [222]:
# Each model predicts from the test features built by its own vectorizer.
lg_pred_1 = model_lg_1.predict(X_test_bow)
lg_pred_2 = model_lg_2.predict(X_test_tfidf)

In [223]:
# Held-out accuracy of logistic regression on bag-of-words counts.
accuracy_score(y_test, lg_pred_1)

0.8896875

In [224]:
# Held-out accuracy of logistic regression on TF-IDF weights.
accuracy_score(y_test,lg_pred_2)

0.8628125