## Load the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
# Read the training set from the CSV in the working directory, then preview
# the first rows to see which columns were loaded.
data = pd.read_csv(filepath_or_buffer='horror-train.csv')
data.head(n=5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(19579, 3)

## The `id` column is of no use for modelling, so drop it

In [4]:
# Drop the `id` column; no later step in this notebook reads it.
# `errors='ignore'` keeps the cell re-runnable: without it, a second execution
# raises KeyError because `id` has already been removed from `data`.
data = data.drop(columns='id', errors='ignore')
data.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",EAP
1,It never once occurred to me that the fumbling...,HPL
2,"In his left hand was a gold snuff box, from wh...",EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
# Share of rows per author, expressed as a percentage, to check class balance.
author_share = data['author'].value_counts(normalize=True)
author_share.mul(100)

EAP    40.349354
MWS    30.869809
HPL    28.780837
Name: author, dtype: float64

## Text Processing

In [6]:
from nltk.tokenize import RegexpTokenizer
# Each token is a maximal run of ASCII letters (A-Z, a-z). Anything else --
# punctuation, digits, whitespace, non-ASCII characters -- is never part of a
# token and is therefore dropped from the output.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
# Echo the tokenizer so its configuration shows up in the cell output.
tokenizer

RegexpTokenizer(pattern='[A-Za-z]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [8]:
## Look at the raw text stored in the first row (index label 0)
data.loc[0, 'text']

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [9]:
# Tokenize the first row's text to see exactly what the tokenizer keeps.
tokenizer.tokenize(data.loc[0, 'text'])

['This',
 'process',
 'however',
 'afforded',
 'me',
 'no',
 'means',
 'of',
 'ascertaining',
 'the',
 'dimensions',
 'of',
 'my',
 'dungeon',
 'as',
 'I',
 'might',
 'make',
 'its',
 'circuit',
 'and',
 'return',
 'to',
 'the',
 'point',
 'whence',
 'I',
 'set',
 'out',
 'without',
 'being',
 'aware',
 'of',
 'the',
 'fact',
 'so',
 'perfectly',
 'uniform',
 'seemed',
 'the',
 'wall']

In [10]:
# Toy sentence containing symbols (@, !, #, .) to show how the tokenizer
# treats non-letter characters.
sent = 'This is an example @ regextokenizer ! #.'
sent

'This is an example @ regextokenizer ! #.'

In [11]:
# Only the alphabetic words come back; the standalone symbols are discarded.
tokenizer.tokenize(sent)

['This', 'is', 'an', 'example', 'regextokenizer']

In [12]:
# Tokenize every document; each row of the new column holds a list of tokens.
data['text_tokenized'] = data['text'].apply(tokenizer.tokenize)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[This, process, however, afforded, me, no, mea..."
1,It never once occurred to me that the fumbling...,HPL,"[It, never, once, occurred, to, me, that, the,..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[In, his, left, hand, was, a, gold, snuff, box..."
3,How lovely is spring As we looked from Windsor...,MWS,"[How, lovely, is, spring, As, we, looked, from..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[Finding, nothing, else, not, even, gold, the,..."


## Stemming 

In [13]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance reused for every token in the corpus.
stemmer = PorterStemmer()
# Echo the object to confirm it was created.
stemmer

<PorterStemmer>

In [14]:
def _stem_all(tokens):
    """Return the stem of every token in ``tokens``, preserving order."""
    return list(map(stemmer.stem, tokens))


# Replace each token list with its stemmed counterpart. The column is
# overwritten in place, so this cell is meant to run once, after tokenization
# and before the tokens are joined back into strings.
data['text_tokenized'] = data['text_tokenized'].apply(_stem_all)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[thi, process, howev, afford, me, no, mean, of..."
1,It never once occurred to me that the fumbling...,HPL,"[it, never, onc, occur, to, me, that, the, fum..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[in, hi, left, hand, wa, a, gold, snuff, box, ..."
3,How lovely is spring As we looked from Windsor...,MWS,"[how, love, is, spring, as, we, look, from, wi..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, els, not, even, gold, the, superi..."


## Convert the text_tokenized column into sentences

In [15]:
# Collapse each list of stems back into a single space-separated string,
# which is the form passed to the vectorizer further down.
data['text_tokenized'] = data['text_tokenized'].map(" ".join)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,thi process howev afford me no mean of ascerta...
1,It never once occurred to me that the fumbling...,HPL,it never onc occur to me that the fumbl might ...
2,"In his left hand was a gold snuff box, from wh...",EAP,in hi left hand wa a gold snuff box from which...
3,How lovely is spring As we looked from Windsor...,MWS,how love is spring as we look from windsor ter...
4,"Finding nothing else, not even gold, the Super...",HPL,find noth els not even gold the superintend ab...


## Vectorization - CountVectorizer()

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer created with the library defaults (no arguments).
vec = CountVectorizer()
# Echo the vectorizer so its settings are visible in the cell output.
vec

In [19]:
# Learn the vocabulary and build the document-term count matrix, then expand
# it to a dense array purely so it can be displayed.
doc_term_counts = vec.fit_transform(data['text_tokenized'])
doc_term_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Save the Vectorization result as X 

In [20]:
# Feature matrix: one row per document, one column per vocabulary term.
# fit_transform returns a SciPy sparse matrix and it is kept sparse here:
# train_test_split and MultinomialNB both accept sparse input, whereas
# .toarray() would allocate a dense documents x vocabulary array that is
# almost entirely zeros.
# NOTE(review): the vocabulary is learned from every row, including the rows
# that later land in the test split.
X = vec.fit_transform(data['text_tokenized'])
# Target labels: the author code of each document.
y = data['author']

## Split the data into train test split

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Apply Multinomial NB on X_train and y_train

In [23]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier created with the library defaults.
mnb = MultinomialNB()
# Echo the (still unfitted) estimator.
mnb

In [24]:
# Train the classifier on the training portion only.
mnb.fit(X_train, y_train)

## Perform predictions

In [25]:
# Predicted author label for every held-out document.
y_pred = mnb.predict(X_test)
y_pred

array(['HPL', 'MWS', 'MWS', ..., 'MWS', 'EAP', 'MWS'], dtype='<U3')

## Model Evaluation

In [26]:
from sklearn.metrics import accuracy_score

# Fraction of held-out documents whose author was predicted correctly.
# Arguments follow the documented (y_true, y_pred) order; accuracy gives the
# same number either way, but the order matters as soon as this call is
# swapped for a metric that is not symmetric.
accuracy_score(y_test, y_pred)

0.8345250255362615

## Observations:

- With very minimal processing we already reach about 83% accuracy
- To improve the accuracy, perform more detailed text processing and re-evaluate; the accuracy may change

## Note:

- Building a data science project is an iterative process. Hence, you have to repeat the data/text processing steps again and again