# Modeling

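In this notebook, we train and evaluate a text classifier (a TF-IDF vectorizer followed by Multinomial Naive Bayes) that predicts which emotion a piece of text expresses.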

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Load the data

In [2]:
# Read the raw emotions file, labelling its three columns explicitly.
df = pd.read_csv(
    '../data/emotions.csv',
    names=['emotion', 'text', 'unnamed'],
)

In [3]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,emotion,text,unnamed
0,joy,On days when I feel close to my partner and ot...,
1,fear,Every time I imagine that someone I love or I ...,
2,anger,When I had been obviously unjustly treated and...,
3,sadness,When I think about the short time that we live...,
4,disgust,At a gathering I found myself involuntarily si...,


In [4]:
# Keep only the label and the text. `.copy()` makes the result an independent
# frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df[['emotion', 'text']].copy()

In [5]:
# Confirm that only the `emotion` and `text` columns remain.
df.head()

Unnamed: 0,emotion,text
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


### Mapping `emotion` to `target`

In [6]:
# Collect the distinct emotion labels present in the data.
emotions = pd.unique(df['emotion'])

In [7]:
# Give every distinct emotion label a consecutive integer id, starting at 0.
emotion_map = {label: idx for idx, label in enumerate(emotions)}

In [8]:
# Show the label -> integer id mapping used for the target column.
emotion_map

{'joy': 0,
 'fear': 1,
 'anger': 2,
 'sadness': 3,
 'disgust': 4,
 'shame': 5,
 'guilt': 6}

In [9]:
# Encode each emotion label as its integer class id via `emotion_map`.
df['target'] = df['emotion'].map(emotion_map)

In [10]:
# Check that the new `target` column lines up with the `emotion` labels.
df.head()

Unnamed: 0,emotion,text,target
0,joy,On days when I feel close to my partner and ot...,0
1,fear,Every time I imagine that someone I love or I ...,1
2,anger,When I had been obviously unjustly treated and...,2
3,sadness,When I think about the short time that we live...,3
4,disgust,At a gathering I found myself involuntarily si...,4


## 2. Modeling

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
from sklearn.naive_bayes import MultinomialNB

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
# Two-step text classifier: TF-IDF features (English stop words removed)
# feeding a Multinomial Naive Bayes model. The step names 'vect' and 'clf'
# are the prefixes used when addressing hyper-parameters of each step.
text_clf = Pipeline(
    steps=[
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
    ]
)

### Split into train and test sets

In [15]:
# Pull the raw texts (features) and integer class ids (labels) out as arrays.
x, y = df['text'].values, df['target'].values

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the samples for testing. Stratifying on `y` keeps the class
# proportions alike in both splits; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=1, stratify=y
)

### Grid search

In [18]:
# Hyper-parameter candidates, keyed as '<pipeline step>__<parameter>'.
param_grid = dict(
    vect__min_df=[1, 2, 3],
    vect__ngram_range=[(1, 1), (1, 2)],
    clf__alpha=[0.5, 0.75, 1.0],
)

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Exhaustive search over `param_grid`, scoring each candidate by accuracy
# under 5-fold cross-validation.
gs = GridSearchCV(text_clf, param_grid, scoring='accuracy', cv=5)

In [21]:
# Run the search on the training split only; the test split stays unseen.
gs.fit(X=train_x, y=train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'vect__min_df': [1, 2, 3], 'vect__ngram_range': [(1, 1), (1, 2)], 'clf__alpha': [0.5, 0.75, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [22]:
# Show the hyper-parameter combination the grid search selected.
gs.best_params_

{'clf__alpha': 0.75, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}

## 3. Testing the model

In [23]:
# Build the final pipeline from the grid-search winner instead of re-typing
# its values by hand, so the model cannot drift from `gs.best_params_` if the
# grid, the data or the seed changes. The pipeline is created unfitted; call
# `.fit` to train it.
final_model = Pipeline([('vect', TfidfVectorizer(stop_words="english",
                                                min_df=gs.best_params_['vect__min_df'],
                                                ngram_range=gs.best_params_['vect__ngram_range'])),
                         ('clf', MultinomialNB(alpha=gs.best_params_['clf__alpha']))
                       ])

In [24]:
# Train the final pipeline on the full training split.
final_model.fit(X=train_x, y=train_y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
...ue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.75, class_prior=None, fit_prior=True))])

In [25]:
# Smoke test on two hand-written sentences; the output is the integer class
# id of each sentence (see `emotion_map` for the label each id stands for).
final_model.predict(['I am ecstatic', 'I am sad'])

array([0, 3])

In [26]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [27]:
# Ground-truth class ids of the held-out test split.
test_y

array([3, 2, 0, ..., 2, 6, 3])

In [28]:
# Predict a class id for every text in the held-out test split.
preds = final_model.predict(X=test_x)

In [29]:
# Predicted class ids, for eyeballing against `test_y`.
preds

array([2, 2, 0, ..., 3, 6, 3])

In [30]:
# Fraction of test samples whose predicted class matches the true class.
accuracy_score(y_true=test_y, y_pred=preds)

0.5625

In [31]:
# Per-class breakdown: rows are true class ids, columns are predicted ids.
confusion_matrix(y_true=test_y, y_pred=preds)

array([[164,  13,   8,  17,   2,   9,   6],
       [ 17, 141,  10,  13,  11,  16,   7],
       [ 19,  16,  96,  20,  18,  17,  30],
       [ 21,   8,  19, 132,   4,  15,  18],
       [ 10,  18,  30,   9, 116,  18,  12],
       [ 17,  18,  19,  10,  19,  99,  32],
       [ 11,  17,  29,  20,   6,  29,  98]])