In [2]:
import warnings
warnings.filterwarnings('ignore')

#Visualization libraries
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#For more info on arguments of seaborn.set()
#https://seaborn.pydata.org/generated/seaborn.set.html

#Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'

#Increase the default plot size and set the color scheme
plt.rcParams['figure.figsize'] = (8,5)
#This works as well - plt.rcParams['figure.figsize'] = 8,5
plt.rcParams['image.cmap'] = 'viridis'

import pandas as pd
import numpy as np

## Data Processing

In [3]:
data_dir = "data/text_emotion.csv"
data = pd.read_csv(data_dir)

In [5]:
data.shape

(40000, 4)

In [6]:
data.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [13]:
data.describe(include="all")

Unnamed: 0,tweet_id,sentiment,author,content
count,40000.0,40000,40000,40000
unique,,13,33871,39827
top,,neutral,MissxMarisa,I just received a mothers day card from my lov...
freq,,8638,23,14
mean,1845184000.0,,,
std,118857900.0,,,
min,1693956000.0,,,
25%,1751431000.0,,,
50%,1855443000.0,,,
75%,1962781000.0,,,


In [15]:
data.sentiment.value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [17]:
data= data[data.sentiment.isin(['happiness', 'sadness', 'hate', 'anger', 'fun', 'love'])]

In [18]:
data.describe(include="all")

Unnamed: 0,tweet_id,sentiment,author,content
count,17425.0,17425,17425,17425
unique,,6,15860,17345
top,,happiness,MiDesfileNegro,I just received a mothers day card from my lov...
freq,,5209,11,13
mean,1832988000.0,,,
std,117356500.0,,,
min,1693956000.0,,,
25%,1751289000.0,,,
50%,1753586000.0,,,
75%,1962267000.0,,,


#### Lower Case

In [20]:
data.content = data.content.apply(lambda x: " ".join(word.lower() for word in x.split()))

In [21]:
data.content.head()

1    layin n bed with a headache ughhhh...waitin on...
2                  funeral ceremony...gloomy friday...
6    i should be sleep, but im not! thinking about ...
8              @charviray charlene my love. i miss you
9            @kelcouch i'm sorry at least it's friday?
Name: content, dtype: object

#### Punctuation

In [22]:
data.content = data.content.str.replace("[^\w\s]", " ")

In [23]:
data.content

1        layin n bed with a headache ughhhh   waitin on...
2                      funeral ceremony   gloomy friday   
6        i should be sleep  but im not  thinking about ...
8                   charviray charlene my love  i miss you
9                 kelcouch i m sorry at least it s friday 
                               ...                        
39994                        succesfully following tayla  
39996                        happy mothers day all my love
39997    happy mother s day to all the mommies out ther...
39998     niariley wassup beautiful    follow me   peep...
39999     mopedronin bullet train from tokyo the gf and...
Name: content, Length: 17425, dtype: object

#### Remove stop words

In [29]:
from nltk.corpus import stopwords
stop_words= stopwords.words("english")

In [30]:
data.content= data.content.apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

In [31]:
data.content

1                  layin n bed headache ughhhh waitin call
2                           funeral ceremony gloomy friday
6        sleep im thinking old friend want married damn...
8                             charviray charlene love miss
9                              kelcouch sorry least friday
                               ...                        
39994                          succesfully following tayla
39996                               happy mothers day love
39997    happy mother day mommies woman man long momma ...
39998    niariley wassup beautiful follow peep new hit ...
39999    mopedronin bullet train tokyo gf visiting japa...
Name: content, Length: 17425, dtype: object

#### Lemmatize

In [34]:
from textblob import Word
data.content = data.content.apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

In [35]:
data.content

1                  layin n bed headache ughhhh waitin call
2                           funeral ceremony gloomy friday
6        sleep im thinking old friend want married damn...
8                             charviray charlene love miss
9                              kelcouch sorry least friday
                               ...                        
39994                          succesfully following tayla
39996                                happy mother day love
39997    happy mother day mommy woman man long momma so...
39998    niariley wassup beautiful follow peep new hit ...
39999    mopedronin bullet train tokyo gf visiting japa...
Name: content, Length: 17425, dtype: object

#### Remove rare words

In [50]:
rare_words = pd.Series(" ".join(data.content).split()).value_counts()[-10000:]

In [51]:
data.content = data.content.apply(lambda x: " ".join(word for word in x.split() if word not in rare_words))

In [52]:
data.content

1                        n bed headache ughhhh waitin call
2                           funeral ceremony gloomy friday
6        sleep im thinking old friend want married damn...
8                                                love miss
9                              kelcouch sorry least friday
                               ...                        
39994                          succesfully following tayla
39996                                happy mother day love
39997    happy mother day mommy woman man long momma so...
39998    wassup beautiful follow peep new hit single ww...
39999    mopedronin train tokyo gf visiting japan since...
Name: content, Length: 17425, dtype: object

#### Encode output variable

In [53]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y =encoder.fit_transform(data.sentiment.values)

In [54]:
y

array([5, 5, 5, ..., 4, 2, 4])

#### Dataset split

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.content.values, y, random_state = 100, 
                                                    test_size=0.1, shuffle=True, stratify=y)

### Feature Extraction

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(max_features=1000, analyzer="word", ngram_range=(1,3))
train_tf = tfidf.fit_transform(X_train)
test_tf= tfidf.fit_transform(X_test)

In [62]:
test_tf

<1743x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 9782 stored elements in Compressed Sparse Row format>

In [64]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(analyzer="word")
cvec.fit(data.content)
train_cvec = cvec.transform(X_train)
test_cvec=cvec.transform(X_test)

### Build Model

In [65]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.metrics import f1_score as f1
from sklearn.metrics import roc_curve

In [77]:
def calculate_metrics(model_name, y_test, y_pred):
    accuracy_nb = accuracy_score(y_test,y_pred)
    recall_nb = recall(y_test,y_pred, average="micro")
    precision_nb = precision(y_test,y_pred, average="micro")
    f1_nb = f1(y_test,y_pred, average="micro")
    print(f"{model_name} \naccuracy: {accuracy_nb}\nrecall: {recall_nb} \nprecision: {precision_nb}\nf1_score: {f1_nb}")

In [78]:
from sklearn.naive_bayes import MultinomialNB
nb= MultinomialNB()
nb.fit(train_tf, y_train)
y_pred= nb.predict(test_tf)
calculate_metrics("Naive Bayes Classifer", y_test, y_pred)

Naive Bayes Classifer 
accuracy: 0.33103843947217443
recall: 0.33103843947217443 
precision: 0.33103843947217443
f1_score: 0.33103843947217443
