In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [3]:
from sklearn.naive_bayes import MultinomialNB

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
datas = pd.read_csv('datasets/ISEAR.csv')

In [6]:
datas.head()

Unnamed: 0,0,joy,On days when I feel close to my partner and other friends. When I feel at peace with myself and also experience a close contact with people whom I regard greatly.
0,1,fear,Every time I imagine that someone I love or I ...
1,2,anger,When I had been obviously unjustly treated and...
2,3,sadness,When I think about the short time that we live...
3,4,disgust,At a gathering I found myself involuntarily si...
4,5,shame,When I realized that I was directing the feeli...


In [7]:
datas.columns

Index(['0', 'joy',
       'On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom I regard greatly.'],
      dtype='object')

In [8]:
datas.drop('0', axis=1, inplace=True)

In [9]:
datas.size

14890

In [10]:
datas.shape

(7445, 2)

In [11]:
column_name = datas.columns

In [12]:
datas = datas.rename(columns={column_name[0]: "Emotion",
                             column_name[1]: "Sentence"})

In [13]:
datas.head()

Unnamed: 0,Emotion,Sentence
0,fear,Every time I imagine that someone I love or I ...
1,anger,When I had been obviously unjustly treated and...
2,sadness,When I think about the short time that we live...
3,disgust,At a gathering I found myself involuntarily si...
4,shame,When I realized that I was directing the feeli...


Adding $joy$ back to the dataset

In [14]:
missing_data = {"Emotion": column_name[0],
                "Sentence": column_name[1]}

In [15]:
missing_data

{'Emotion': 'joy',
 'Sentence': 'On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom I regard greatly.'}

In [16]:
datas = datas.append(missing_data, ignore_index=True)

In [17]:
datas.isna().sum()

Emotion     0
Sentence    0
dtype: int64

In [18]:
datas.tail()

Unnamed: 0,Emotion,Sentence
7441,sadness,When I was ill and had to stay at the hospital...
7442,disgust,A few days back I was waiting for the bus at t...
7443,shame,A few days back I had a tutorial class and the...
7444,guilt,Once I quarrelled with my sister and after thi...
7445,joy,On days when I feel close to my partner and ot...


In [19]:
y = datas['Emotion']

In [20]:
y.head()

0       fear
1      anger
2    sadness
3    disgust
4      shame
Name: Emotion, dtype: object

In [21]:
X = datas['Sentence']

In [22]:
X.head()

0    Every time I imagine that someone I love or I ...
1    When I had been obviously unjustly treated and...
2    When I think about the short time that we live...
3    At a gathering I found myself involuntarily si...
4    When I realized that I was directing the feeli...
Name: Sentence, dtype: object

In [23]:
Counter(y)

Counter({'fear': 1063,
         'anger': 1069,
         'sadness': 1074,
         'disgust': 1059,
         'shame': 1059,
         'guilt': 1040,
         'joy': 1082})

In [24]:
tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize, stop_words='english', min_df=3, ngram_range=(1, 3))

In [25]:
X = tfidf.fit_transform(X)

In [26]:
tfidf.vocabulary_

{'time': 6373,
 'imagine': 3759,
 'love': 4228,
 'contact': 2007,
 'illness': 3755,
 ',': 119,
 'death': 2185,
 '.': 473,
 'illness ,': 3756,
 'death .': 2187,
 'obviously': 4729,
 'unjustly': 6629,
 'treated': 6488,
 'possibility': 5089,
 'think': 6337,
 'short': 5804,
 'live': 4152,
 'life': 4104,
 'did': 2263,
 'use': 6652,
 'short time': 5806,
 'think did': 6340,
 'time .': 6380,
 'short time .': 5807,
 'gathering': 3239,
 'sitting': 5863,
 'people': 4941,
 'expressed': 2726,
 'opinions': 4792,
 'considered': 2002,
 'low': 4246,
 'realized': 5315,
 'feelings': 2873,
 'discontent': 2388,
 'partner': 4885,
 'way': 6779,
 'trying': 6535,
 'blame': 1426,
 'instead': 3812,
 'feel': 2846,
 'guilty': 3474,
 'realize': 5314,
 'consider': 1999,
 'material': 4328,
 'things': 6329,
 'important': 3768,
 'relatives': 5413,
 'feel guilty': 2859,
 'relatives .': 5414,
 '. feel': 596,
 'girlfriend': 3289,
 'taken': 6212,
 'exam': 2664,
 'went': 6813,
 'parent': 4855,
 "'s": 23,
 'place': 5033,
 't

In [27]:
bayes_classification = MultinomialNB()

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [29]:
bayes_classification.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
bayes_pred = bayes_classification.predict(X_test)

In [31]:
accuracy_score(y_test, bayes_pred)

0.5569280343716434