**Step 1: Setting up the libraries**

In [31]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


**Step 2: Choosing our Dataset**


Importing dataset

In [32]:
data = pd.read_csv('text_emotion.csv')
data.head()

Unnamed: 0,tweet_id,emotion,author,content
0,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
1,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
2,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
3,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you
4,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?


 Dropping rows with other emotion labels

In [33]:
data = data.drop(data[data.emotion == 'anger'].index)
data = data.drop(data[data.emotion == 'boredom'].index)
data = data.drop(data[data.emotion == 'enthusiasm'].index)
data = data.drop(data[data.emotion == 'empty'].index)
data = data.drop(data[data.emotion == 'fun'].index)
data = data.drop(data[data.emotion == 'relief'].index)
data = data.drop(data[data.emotion == 'surprise'].index)
data = data.drop(data[data.emotion == 'love'].index)
data = data.drop(data[data.emotion == 'hate'].index)
data = data.drop(data[data.emotion == 'neutral'].index)
data = data.drop(data[data.emotion == 'worry'].index)

**Step 3: Preprocessing the Data**

Making all letters lowercase

In [34]:
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))


Removing Punctuation, Symbols

In [35]:
data['content'] = data['content'].str.replace('[^\w\s]',' ')

  data['content'] = data['content'].str.replace('[^\w\s]',' ')


 Removing Stop Words using NLTK

In [36]:
stop = stopwords.words('english')
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

Lemmatisation

In [37]:
data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

Correcting Letter Repetitions

In [38]:
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [39]:
data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))


Code to find the top 10,000 rarest words appearing in the data

In [40]:
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

Removing all those rarely appearing words from the data

In [41]:
freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))


**Step 4: Feature Extraction**


Encoding output labels 'sadness' as '1' & 'happiness' as '0'

In [42]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.emotion.values)

Splitting into training and testing data in 90:10 ratio

In [43]:
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)


 Extracting TF-IDF parameters

In [44]:
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

 Extracting Count Vectors Parameters

In [45]:
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

**Step 5: Training Our Models**


Model 1: Multinomial Naive Bayes Classifier

In [46]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes tfidf accuracy 0.48265895953757226


Model 2: Linear SVM

In [47]:
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

svm using tfidf accuracy 0.5096339113680154


Model 3: logistic regression

In [48]:
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val))

log reg tfidf accuracy 0.4951830443159923


Model 4: Random Forest Classifier

In [49]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_pred, y_val))

random forest tfidf accuracy 0.4913294797687861


**Building models using count vectors feature**

Model 1: Multinomial Naive Bayes Classifier

In [50]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.7745664739884393


Model 2: Linear SVM

In [51]:
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using count vectors accuracy 0.7832369942196532


Model 3: Logistic Regression

In [None]:
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

Model 4: Random Forest Classifier

In [None]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val))

Below are 8 random statements. The first 4 depict happiness. The last 4 depict sadness


In [27]:
tweets = pd.DataFrame(['I am very happy today! The atmosphere looks cheerful',
'Things are looking great. It was such a good day',
'Success is right around the corner. Lets celebrate this victory',
'Everything is more beautiful when you experience them with a smile!',
'Now this is my worst, okay? But I am gonna get better.',
'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
'This is quite depressing. I am filled with sorrow',
'His death broke my heart. It was a sad day'])


Doing some preprocessing on these tweets as done before

In [28]:
tweets[0] = tweets[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
stop = stopwords.words('english')
tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
from textblob import Word
tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

  tweets[0] = tweets[0].str.replace('[^\w\s]',' ')


 Extracting Count Vectors feature from our tweets

In [29]:
tweet_count = count_vect.transform(tweets[0])

Predicting the emotion of the tweet using our already trained linear SVM

In [30]:
tweet_pred = lsvm.predict(tweet_count)
print(tweet_pred)

[0 0 0 0 1 1 1 1]


Note:Remember our encodings for the output. ‘0' is for happiness and ‘1’ is for sadness. Our model detected the emotion correctly for all the 8 sentences.