## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
data = pd.read_csv('horror-train.csv')
data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
data.shape

(19579, 3)

In [4]:
## id column no use hence drop

data = data.drop('id', axis = 1)
data.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",EAP
1,It never once occurred to me that the fumbling...,HPL
2,"In his left hand was a gold snuff box, from wh...",EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
data['text'].iloc[0]

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [6]:
data['author'].unique()

array(['EAP', 'HPL', 'MWS'], dtype=object)

## Problem Statement: 

Given that you are provided with a text you need to tell which author has the highest probability of having wirtten that text

## Text Processing

In [7]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[A-Za-z]+', gaps = False)
tokenizer

RegexpTokenizer(pattern='[A-Za-z]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [8]:
## All punctuations
## Digits
## Stop_words
## etc


##  r'[A-Za-z]+' - using this I try to retain only the words from the text deleting other stuffs like punctuations
## digits, emojis, special characters, etc

In [9]:
ex_sent = 'This is an example @ regexptokenizer ! #  2000.'
ex_sent

'This is an example @ regexptokenizer ! #  2000.'

In [10]:
tokenizer.tokenize(ex_sent)

['This', 'is', 'an', 'example', 'regexptokenizer']

## Apply the same regex tokenizer on the enitre text column

In [11]:
data['text_tokenized'] = data['text'].apply(lambda t : tokenizer.tokenize(t))
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[This, process, however, afforded, me, no, mea..."
1,It never once occurred to me that the fumbling...,HPL,"[It, never, once, occurred, to, me, that, the,..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[In, his, left, hand, was, a, gold, snuff, box..."
3,How lovely is spring As we looked from Windsor...,MWS,"[How, lovely, is, spring, As, we, looked, from..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[Finding, nothing, else, not, even, gold, the,..."


#### Stemming - Extract root forms of the words

In [12]:
from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()
stem

<PorterStemmer>

In [13]:
data['text_tokenized'] = data['text_tokenized'].apply(lambda l : [stem.stem(word) for word in l])
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[thi, process, howev, afford, me, no, mean, of..."
1,It never once occurred to me that the fumbling...,HPL,"[it, never, onc, occur, to, me, that, the, fum..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[in, hi, left, hand, wa, a, gold, snuff, box, ..."
3,How lovely is spring As we looked from Windsor...,MWS,"[how, love, is, spring, as, we, look, from, wi..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, els, not, even, gold, the, superi..."


## Instead of applying Stemming apply Lemmatization

## Coverting the text_tokenized column into sentences

In [14]:
data['text_tokenized'] = data['text_tokenized'].apply(lambda l : " ".join(l))
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,thi process howev afford me no mean of ascerta...
1,It never once occurred to me that the fumbling...,HPL,it never onc occur to me that the fumbl might ...
2,"In his left hand was a gold snuff box, from wh...",EAP,in hi left hand wa a gold snuff box from which...
3,How lovely is spring As we looked from Windsor...,MWS,how love is spring as we look from windsor ter...
4,"Finding nothing else, not even gold, the Super...",HPL,find noth els not even gold the superintend ab...


## Vectorization - countVectorizer()

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec

In [16]:
vec.fit_transform(data['text_tokenized'])

<19579x15541 sparse matrix of type '<class 'numpy.int64'>'
	with 428322 stored elements in Compressed Sparse Row format>

In [17]:
vec.fit_transform(data['text_tokenized']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Seperate the data into X and y

In [18]:
X = vec.fit_transform(data['text_tokenized']).toarray()
y = data['author']

## Split the data into train set and test set

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Apply Naive Bayes on the data

In [20]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb

In [21]:
mnb.fit(X_train, y_train)

## Perform Predictions

In [22]:
y_pred = mnb.predict(X_test)
y_pred

array(['HPL', 'MWS', 'MWS', ..., 'MWS', 'EAP', 'MWS'], dtype='<U3')

In [23]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Check accuracy

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.8345250255362615

## Observations:

1. We get 83.5% accuracy with minimal text processing
2. Can achieve better accuracy with stops_words removal plus perform lemmatization instead of Stemming(Better text processing techniques)
3. Naive Bayes works better on text data even with minimal processing

In [25]:
from sklearn.model_selection import cross_val_score
cross_val_score(mnb ,X, y, cv = 5)   ## performing 5 Fold of train and test set

array([0.83426966, 0.83452503, 0.83273749, 0.83886619, 0.82835249])

##### It seems that 83.9% is the highest accuracy we can obtain using 5 fold cross validation

## Note: 

Data Science project buidling is an iterative process. You can revisit of the above steps any number of times.
CRISP - DM methodology - Cross Industry Standard process for Data Mining

In [26]:
##Step 1 : Load the libraries and Define Problem
## Step 2 : Data collection and Data loading 
## Step 3 : Pre Processing
## Step 4 : Seperate X and y
## Step 5 : Split the data into train and test set
## Step 6 : Fit the model
## Step 7 : Perform Prediction 
## Step 8 : Perform Evaluation