## Multinomial Naive Bayes

- Works very well with Text Data.
- The features are discrete word counts, which is what the multinomial model assumes; the target (`author`) is a discrete class label.

## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
# Read the training CSV from the working directory and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='horror-train.csv')
data.head(n=5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# (rows, columns) of the frame as loaded, before any column is dropped or added
data.shape

(19579, 3)

In [5]:
# Share of rows belonging to each author, expressed as a percentage
data['author'].value_counts(normalize=True).mul(100)

EAP    40.349354
MWS    30.869809
HPL    28.780837
Name: author, dtype: float64

In [6]:
# Drop the identifier column; the features built below come from the text only.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError because
# 'id' has already been removed.
data = data.drop(columns=['id'], errors='ignore')
data.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",EAP
1,It never once occurred to me that the fumbling...,HPL
2,"In his left hand was a gold snuff box, from wh...",EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,"Finding nothing else, not even gold, the Super...",HPL


## Text processing 

In [7]:
from nltk.tokenize import RegexpTokenizer

# A token is any run of ASCII letters; everything else (digits, punctuation,
# symbols, whitespace) is skipped.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')

In [12]:
# Raw text of the first row, shown in full for comparison with its tokens
data.loc[0, 'text']

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [9]:
# Tokenize the first row's text to see which pieces the regex keeps
tokenizer.tokenize(data.loc[0, 'text'])

['This',
 'process',
 'however',
 'afforded',
 'me',
 'no',
 'means',
 'of',
 'ascertaining',
 'the',
 'dimensions',
 'of',
 'my',
 'dungeon',
 'as',
 'I',
 'might',
 'make',
 'its',
 'circuit',
 'and',
 'return',
 'to',
 'the',
 'point',
 'whence',
 'I',
 'set',
 'out',
 'without',
 'being',
 'aware',
 'of',
 'the',
 'fact',
 'so',
 'perfectly',
 'uniform',
 'seemed',
 'the',
 'wall']

In [13]:
# Toy sentence mixing words with symbols, used to probe the tokenizer
sent = 'This is example @ regextokenizer ! #.'
sent

'This is example @ regextokenizer ! #.'

In [14]:
# '@', '!' and '#.' contain no letters, so the [A-Za-z]+ pattern yields no token for them
tokenizer.tokenize(sent)

['This', 'is', 'example', 'regextokenizer']

In [16]:
# One token list per row; the bound method is handed to apply directly,
# so no lambda wrapper is needed.
data['text_tokenized'] = data['text'].apply(tokenizer.tokenize)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[This, process, however, afforded, me, no, mea..."
1,It never once occurred to me that the fumbling...,HPL,"[It, never, once, occurred, to, me, that, the,..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[In, his, left, hand, was, a, gold, snuff, box..."
3,How lovely is spring As we looked from Windsor...,MWS,"[How, lovely, is, spring, As, we, looked, from..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[Finding, nothing, else, not, even, gold, the,..."


## Stemming

In [22]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer constructed with no arguments, i.e. the library's default settings
stemmer = PorterStemmer()

In [24]:
# Stem every token with the Porter stemmer.
# The stems are rebuilt from the raw `text` column rather than by overwriting
# `text_tokenized` with a function of itself: the self-referential form stemmed
# the already-stemmed tokens again on every re-run of this cell, and Porter
# stemming is not idempotent, so the features depended on how many times the
# cell had been executed.
data['text_tokenized'] = data['text'].apply(
    lambda text: [stemmer.stem(token) for token in tokenizer.tokenize(text)]
)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[thi, process, howev, afford, me, no, mean, of..."
1,It never once occurred to me that the fumbling...,HPL,"[it, never, onc, occur, to, me, that, the, fum..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[in, hi, left, hand, wa, a, gold, snuff, box, ..."
3,How lovely is spring As we looked from Windsor...,MWS,"[how, love, is, spring, as, we, look, from, wi..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, el, not, even, gold, the, superin..."


## Convert the tokenized text back into sentences

In [25]:
# Re-assemble each token list into a single space-delimited string,
# the input form the vectorizer is given below.
data['text_sent'] = data['text_tokenized'].apply(" ".join)
data.head()

Unnamed: 0,text,author,text_tokenized,text_sent
0,"This process, however, afforded me no means of...",EAP,"[thi, process, howev, afford, me, no, mean, of...",thi process howev afford me no mean of ascerta...
1,It never once occurred to me that the fumbling...,HPL,"[it, never, onc, occur, to, me, that, the, fum...",it never onc occur to me that the fumbl might ...
2,"In his left hand was a gold snuff box, from wh...",EAP,"[in, hi, left, hand, wa, a, gold, snuff, box, ...",in hi left hand wa a gold snuff box from which...
3,How lovely is spring As we looked from Windsor...,MWS,"[how, love, is, spring, as, we, look, from, wi...",how love is spring as we look from windsor ter...
4,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, el, not, even, gold, the, superin...",find noth el not even gold the superintend aba...


## Vectorization

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer constructed with no arguments, i.e. library defaults
vec = CountVectorizer()

In [29]:
# Preview of the document-term count matrix.
# NOTE(review): .toarray() materialises the whole matrix densely just for display.
vec.fit_transform(data['text_sent']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Save the vectorization

In [31]:
# Target labels first, then the dense document-term count matrix used as features.
# NOTE(review): the vectorizer is fitted on every row before the train/test split,
# so its vocabulary also covers the rows later held out for testing.
y = data['author']
X = vec.fit_transform(data['text_sent']).toarray()

## Split the data into train and test sets

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split, and therefore every metric computed
# from it, is identical on each Restart & Run All; without it the reported
# accuracy changed from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Apply Multinomial NB on X_train and y_train

In [33]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier constructed with no arguments, i.e. library defaults
mnb = MultinomialNB()

In [34]:
# Fit the classifier on the training portion only; the test rows stay unseen by the model
mnb.fit(X_train, y_train)

## Perform Predictions

In [37]:
# Predicted author label for every held-out row
y_pred = mnb.predict(X_test)
y_pred

array(['MWS', 'EAP', 'HPL', ..., 'EAP', 'HPL', 'EAP'], dtype='<U3')

In [36]:
# Held-out feature matrix that is passed to mnb.predict
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Model Evaluation

In [38]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted author equals the true author
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8323118828736806

## In order to increase the accuracy

1. Consider stopword removal
2. Lowercase conversion
3. TFIDF Vectorization
4. etc

In Text processing we did not explicitly perform lowercase conversion (the stemmed tokens shown above are nevertheless lowercase), and we did not remove stopwords.   
Without performing these steps we just vectorized and applied machine learning.   
Do you think that this is a wrong approach?


- No approach in Data Science is a wrong approach
### Why?
- Building a Data Science project is an iterative process