# Introducing the Naive Bayes Classifier

Now we will use annotated data to "learn" a sentiment classifier

In [25]:
# We first install the new dependency: nlpia (03_dit_coli_naivebayes.ipynb)
! pip3 install nlpia







In [26]:
# Loading the dependencies
import pandas as pd

from collections import Counter

from nlpia.data.loaders import get_data 

# The casual tokenizer can handle emoticons, unusual punctuation and slang better than the TreeBank tokenizer
from nltk.tokenize import casual_tokenize

## Setting up the "corpus"

Loading the Hutto movie-review corpus (`hutto_movies`), in which each text is annotated with a sentiment score

In [32]:
# Fetch the annotated corpus as a DataFrame
movies = get_data('hutto_movies')

# Preview the first five instances, with floats rounded to two decimals
movies.head(n=5).round(decimals=2)

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


### Getting a description of the data (look at the range)

In [28]:
# Summary statistics of the numeric column(s), rounded to two decimals
movies.describe().round(2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [33]:
# Limit the display width (in characters) so that wide DataFrames
# wrap nicely when shown in the console
pd.options.display.width = 75
movies['sentiment']

id
1        2.266667
2        3.533333
3       -0.600000
4        1.466667
5        1.733333
           ...   
10601   -0.062500
10602   -1.500000
10603   -0.625000
10604    1.437500
10605   -1.812500
Name: sentiment, Length: 10605, dtype: float64

### Loading the data into a DataFrame through a list of dictionaries

In [36]:
# One Counter (token -> frequency) per review text
bags_of_words = [Counter(casual_tokenize(review)) for review in movies['text']]

# from_records() is a DataFrame constructor.
# INPUT: a sequence (list) of dictionaries
# OUTPUT: a DF with one column per key and the associated values as cells.
# (A token that is absent from a review becomes NaN in that row!)
df_bows = pd.DataFrame.from_records(bags_of_words)
print(df_bows)

       The  Rock   is  destined   to   be  the  21st  Century's  new  \
0      1.0   1.0  1.0       1.0  2.0  1.0  1.0   1.0        1.0  1.0   
1      2.0   NaN  1.0       NaN  NaN  NaN  1.0   NaN        NaN  NaN   
2      NaN   NaN  NaN       NaN  NaN  NaN  NaN   NaN        NaN  NaN   
3      NaN   NaN  1.0       NaN  4.0  NaN  1.0   NaN        NaN  NaN   
4      NaN   NaN  NaN       NaN  NaN  NaN  NaN   NaN        NaN  NaN   
...    ...   ...  ...       ...  ...  ...  ...   ...        ...  ...   
10600  NaN   NaN  NaN       NaN  NaN  NaN  NaN   NaN        NaN  NaN   
10601  NaN   NaN  NaN       NaN  NaN  NaN  NaN   NaN        NaN  NaN   
10602  NaN   NaN  NaN       NaN  NaN  NaN  NaN   NaN        NaN  NaN   
10603  NaN   NaN  NaN       NaN  NaN  NaN  2.0   NaN        NaN  NaN   
10604  NaN   NaN  NaN       NaN  NaN  NaN  2.0   NaN        NaN  NaN   

       ...  Ill  slummer  Rashomon  dipsticks  Bearable  Staggeringly  \
0      ...  NaN      NaN       NaN        NaN       NaN       

In [37]:
# A missing token simply means "zero occurrences": replace NaN with 0 ...
df_bows = df_bows.fillna(value=0)
# ... and store the counts as integers again
df_bows = df_bows.astype(int)
print(df_bows)

       The  Rock  is  destined  to  be  the  21st  Century's  new  ...  \
0        1     1   1         1   2   1    1     1          1    1  ...   
1        2     0   1         0   0   0    1     0          0    0  ...   
2        0     0   0         0   0   0    0     0          0    0  ...   
3        0     0   1         0   4   0    1     0          0    0  ...   
4        0     0   0         0   0   0    0     0          0    0  ...   
...    ...   ...  ..       ...  ..  ..  ...   ...        ...  ...  ...   
10600    0     0   0         0   0   0    0     0          0    0  ...   
10601    0     0   0         0   0   0    0     0          0    0  ...   
10602    0     0   0         0   0   0    0     0          0    0  ...   
10603    0     0   0         0   0   0    2     0          0    0  ...   
10604    0     0   0         0   0   0    2     0          0    0  ...   

       Ill  slummer  Rashomon  dipsticks  Bearable  Staggeringly  ’  ve  \
0        0        0         0       

### Let us look at the shape

Spoiler: A BoW can explode in size; even more when no normalisation is applied at all


In [38]:
# (number of rows, number of columns) of the bag-of-words DataFrame
df_bows.shape

(10605, 20756)

Now, let us see the first instances (it is quite sparse)

In [39]:
# First rows of the bag-of-words DataFrame
df_bows.head()

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



**Homework**: Integrate the normalisation pipeline (lowercasing, stopwording and stemming or lemmatisation) and see how the dataframe gets affected

In [40]:
# write your code here
# (placeholder: the bare `None` keeps the cell valid and displays nothing)
None

In [42]:
# For the first five rows, show only the columns of the tokens that occur
# in the first review, then only those that occur in the second review
print(df_bows.head().loc[:, list(bags_of_words[0])])
print(df_bows.head().loc[:, list(bags_of_words[1])])

   The  Rock  is  destined  to  be  the  21st  Century's  new  ...  \
0    1     1   1         1   2   1    1     1          1    1  ...   
1    2     0   1         0   0   0    1     0          0    0  ...   
2    0     0   0         0   0   0    0     0          0    0  ...   
3    0     0   1         0   4   0    1     0          0    0  ...   
4    0     0   0         0   0   0    0     0          0    0  ...   

   Schwarzenegger  ,  Jean  Claud  Van  Damme  or  Steven  Segal  .  
0               1  1     1      1    1      1   1       1      1  1  
1               0  0     0      0    0      0   0       0      0  4  
2               0  0     0      0    0      0   0       0      0  0  
3               0  1     0      0    0      0   0       0      0  1  
4               0  1     0      0    0      0   0       0      0  1  

[5 rows x 33 columns]
   The  gorgeously  elaborate  continuation  of  '  Lord  the  Rings  \
0    1           0          0             0   0  4     0    1   

### Build the Naive Bayes classifier

All the data is now ready. Let us build a Multinomial NB.

Multinomial NB is suitable for discrete features (e.g., word counts for text classification). 

In [43]:
# Multinomial Naive Bayes: intended for discrete features such as word counts
from sklearn.naive_bayes import MultinomialNB
# Instantiate the classifier; no arguments are passed, so defaults apply
nb = MultinomialNB()

In [44]:
# Turning the continuous score into two classes:
# True for a sentiment above zero, False otherwise
movies['sentiment'].gt(0)

id
1         True
2         True
3        False
4         True
5         True
         ...  
10601    False
10602    False
10603    False
10604     True
10605    False
Name: sentiment, Length: 10605, dtype: bool

Now we can train ("fit") our model

In [45]:
# The classifier only supports discrete labels, so the float
# sentiment is converted into a Boolean class (positive or not)
nb = nb.fit(X=df_bows, y=movies['sentiment'] > 0)

### We have a model and we can predict!

In [46]:
# predict_proba() returns continuous-valued predictions: one probability per
# class for every review, i.e. one column per entry of nb.classes_.
# TODO: there seems to be an error in the book code. predict_proba() returns
# the scores for all (2) classes, whereas we only want to assign the score
# of the positive class. Hence the workaround below instead of the original:
# movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4

# Look up the column of the positive (True) class rather than hard-coding
# its position
positive_class_idx = list(nb.classes_).index(True)

# Multiply and subtract to map the probabilities from [0, 1] to [-4, 4]
predictions = nb.predict_proba(df_bows) * 8 - 4
movies['predicted_sentiment'] = predictions[:, positive_class_idx]

movies

Unnamed: 0_level_0,sentiment,text,predicted_sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.266667,The Rock is destined to be the 21st Century's ...,2.511515
2,3.533333,The gorgeously elaborate continuation of ''The...,3.999904
3,-0.600000,Effective but too tepid biopic,-3.655976
4,1.466667,If you sometimes like to go to the movies to h...,1.940954
5,1.733333,"Emerges as something rare, an issue movie that...",3.910373
...,...,...,...
10601,-0.062500,Well made but mush hearted.,-3.166489
10602,-1.500000,A real snooze.,-1.056805
10603,-0.625000,No surprises.,-1.481449
10604,1.437500,We’ve seen the hippie turned yuppie plot befor...,3.988988


In [None]:
# abs(n) returns the absolute value of n (its magnitude, without the sign).
# The examples are written as assertions so that the cell actually runs.
assert abs(5) == 5
assert abs(-34) == 34
assert abs(0) == 0

In [None]:
# Toy example: why the errors are averaged as absolute values.
# Two pairs of scores whose differences have opposite signs.
# NOTE(review): the original notes do not label the two columns; they are
# treated here as (gold, predicted) -- confirm.
toy_pairs = [(1.0, 0.5), (0.5, 1.0)]

toy_signed_errors = [predicted - gold for gold, predicted in toy_pairs]
toy_absolute_errors = [abs(error) for error in toy_signed_errors]

# Signed errors cancel each other out: (-0.5 + 0.5) / 2 -> 0
toy_mean_signed_error = sum(toy_signed_errors) / len(toy_signed_errors)
# Absolute errors do not: (0.5 + 0.5) / 2 -> 0.5
toy_mae = sum(toy_absolute_errors) / len(toy_absolute_errors)

toy_mean_signed_error, toy_mae


Now, we compute the Mean Absolute Error (MAE): "a measure of difference between two continuous variables".

In [47]:

# Per-review absolute difference between predicted and gold sentiment
movies['error'] = movies['predicted_sentiment'].sub(movies['sentiment']).abs()
# Averaging those differences gives the mean absolute error (MAE)
movies['error'].mean().round(1)


1.9

Now, let us see some gold and predicted sentiments, together with the binary classification

In [48]:
# 1 if the gold-standard sentiment is positive, 0 otherwise
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)

# 1 if the predicted sentiment is positive, 0 otherwise
movies['predicted_ispositive'] = movies['predicted_sentiment'].gt(0).astype(int)

# Side-by-side overview of gold standard vs prediction (first 8 reviews)
movies[[
    'sentiment',
    'predicted_sentiment',
    'sentiment_ispositive',
    'predicted_ispositive',
]].head(8)

Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,2.511515,1,1
2,3.533333,3.999904,1,1
3,-0.6,-3.655976,0,0
4,1.466667,1.940954,1,1
5,1.733333,3.910373,1,1
6,2.533333,3.995188,1,1
7,2.466667,3.960466,1,1
8,1.266667,-1.918701,1,0


In [49]:
# Fraction (between 0 and 1) of reviews whose predicted polarity matches
# the gold standard, counting both "thumbs up" and "thumbs down"
(movies['predicted_ispositive'] == movies['sentiment_ispositive']).mean()


0.9344648750589345

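## Not bad at all!

**Caveat**: this accuracy (like the MAE above) is measured on the very same reviews the model was trained on, so it is an optimistic estimate. A held-out test set is needed to judge how well the classifier handles unseen reviews.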