In [1]:
# Core data / plotting stack. NOTE(review): np, sns and plt are not referenced by
# any cell visible in this part of the notebook; later cells also import nltk and
# sklearn modules locally instead of here.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silences every warning category for the whole session, including deprecation
# warnings from pandas / scikit-learn.
warnings.filterwarnings("ignore")

In [2]:
# Read the training CSV from the working directory and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="horror-train.csv")
data.head(n=5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Inspect the first record (all columns of row position 0) as a Series.
data.iloc[0, :]

id                                                  id26305
text      This process, however, afforded me no means of...
author                                                  EAP
Name: 0, dtype: object

In [4]:
# Number of rows per author label, most frequent first.
data.loc[:, "author"].value_counts()

author
EAP    7900
MWS    6044
HPL    5635
Name: count, dtype: int64

In [5]:
# Show the full, untruncated text of the first row.
data["text"].iat[0]

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [6]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(19579, 3)

# Problem Statement:
- Given a piece of text, predict which author is most likely to have written it (i.e. the author with the highest predicted probability).
  

In [7]:
# Drop the id column; no later cell uses it.
# errors="ignore" keeps this cell idempotent: because the result is assigned back
# to `data`, re-running the cell after "id" has already been removed would
# otherwise raise a KeyError.
data = data.drop(columns="id", errors="ignore")
data.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",EAP
1,It never once occurred to me that the fumbling...,HPL
2,"In his left hand was a gold snuff box, from wh...",EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,"Finding nothing else, not even gold, the Super...",HPL


# Text Processing

In [8]:
import nltk
from nltk.tokenize import RegexpTokenizer

# With gaps=False the pattern describes the tokens themselves (not the separators):
# every token is a maximal run of ASCII letters, so digits, punctuation and
# symbols never end up in a token.
tokenizer = RegexpTokenizer(pattern=r"[A-Za-z]+", gaps=False)
tokenizer

RegexpTokenizer(pattern='[A-Za-z]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [9]:
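## The regexp defined above keeps only runs of ASCII letters (A-Z, a-z), so it:
## - removes all punctuation
## - removes all digits
## - strips the non-letter parts of URLs, hashtags, mentions, etc. (their letters are kept as tokens)

## Only the alphabetic words of the text are retained.
## The text is converted into tokens, similar to what we would usually get from the word_tokenize function.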


In [10]:
# Toy sentence mixing words with symbols, punctuation and digits, used to
# demonstrate the tokenizer.
my_sent = (
    "This is an #example of @regular expression tokenizer ! % # 2000."
)
my_sent

'This is an #example of @regular expression tokenizer ! % # 2000.'

In [11]:
# Tokenize the demo sentence and glue the surviving word tokens back together
# with single spaces.
str.join(" ", tokenizer.tokenize(my_sent))

'This is an example of regular expression tokenizer'

In [12]:
## Run the regexp tokenizer over every document in the text column;
## each entry of the new column is a list of word tokens.
data["text_tokenized"] = data["text"].apply(tokenizer.tokenize)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,"[This, process, however, afforded, me, no, mea..."
1,It never once occurred to me that the fumbling...,HPL,"[It, never, once, occurred, to, me, that, the,..."
2,"In his left hand was a gold snuff box, from wh...",EAP,"[In, his, left, hand, was, a, gold, snuff, box..."
3,How lovely is spring As we looked from Windsor...,MWS,"[How, lovely, is, spring, As, we, looked, from..."
4,"Finding nothing else, not even gold, the Super...",HPL,"[Finding, nothing, else, not, even, gold, the,..."


In [13]:
# Store each row's tokens as a single space-separated string.
# The string is built from the raw "text" column rather than from the current
# contents of "text_tokenized", which makes the cell idempotent: joining the
# column in place a second time would treat the already-joined string as a
# sequence of characters and insert a space between every one of them.
data["text_tokenized"] = data["text"].apply(
    lambda text: " ".join(tokenizer.tokenize(text))
)
data.head()

Unnamed: 0,text,author,text_tokenized
0,"This process, however, afforded me no means of...",EAP,This process however afforded me no means of a...
1,It never once occurred to me that the fumbling...,HPL,It never once occurred to me that the fumbling...
2,"In his left hand was a gold snuff box, from wh...",EAP,In his left hand was a gold snuff box from whi...
3,How lovely is spring As we looked from Windsor...,MWS,How lovely is spring As we looked from Windsor...
4,"Finding nothing else, not even gold, the Super...",HPL,Finding nothing else not even gold the Superin...


# Vectorization: CountVectorizer()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based (bag-of-words) vectorizer created with its default settings;
# it is fitted on the tokenized text in the cells that follow.
vec = CountVectorizer()
vec

In [15]:
# Learn the vocabulary and return the document-term count matrix in sparse form.
vec.fit_transform(raw_documents=data["text_tokenized"])

<19579x25052 sparse matrix of type '<class 'numpy.int64'>'
	with 429665 stored elements in Compressed Sparse Row format>

In [16]:
# Same document-term matrix, expanded to a dense NumPy array for inspection.
# NOTE(review): this materialises every zero entry of the full matrix in memory
# just to display it; slicing a few rows before .toarray() would be much cheaper.
vec.fit_transform(raw_documents=data["text_tokenized"]).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Separate X and y

In [17]:
# Feature matrix: keep the bag-of-words counts as the SciPy sparse matrix that
# CountVectorizer returns. A dense int64 copy of this 19579 x 25052 matrix needs
# roughly 3.9 GB, while train_test_split and MultinomialNB both accept sparse
# input and produce the same predictions from it.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split,
# so words that occur only in test rows still shape the feature space; consider
# fitting the vectorizer on the training rows only.
X = vec.fit_transform(data["text_tokenized"])
# Target labels: the author code of each row.
y = data["author"]

# Split the data into train test sets

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Apply Naive Bayes on the data

In [19]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier created with default hyperparameters;
# it is trained in the next cell.
mnb = MultinomialNB()
mnb

In [20]:
# Train the classifier on the training split only.
mnb.fit(X=X_train, y=y_train)

# performing predictions

In [21]:
# Predict an author label for every held-out document.
y_pred = mnb.predict(X=X_test)
y_pred

array(['HPL', 'MWS', 'MWS', ..., 'MWS', 'EAP', 'EAP'], dtype='<U3')

# Checking Accuracy

In [23]:
from sklearn.metrics import accuracy_score

# Fraction of held-out documents whose predicted author matches the true author.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.839632277834525

# Observations:
- We get about 84% accuracy, even with minimal text processing.
- How can the accuracy be improved?  
  a. More detailed text processing.  
  b. Better data preprocessing (Step 3), e.g. applying decomposition techniques such as PCA or LDA.  
  c. Cross-validation.  
  d. Hyperparameter tuning.  
  e. Class-imbalance treatment.

- You can also apply Bernoulli Naive Bayes (`BernoulliNB`) and check whether it performs better than this.

In [26]:
# Share of each author in the data set, expressed as a percentage.
data["author"].value_counts(normalize=True).mul(100)

author
EAP    40.349354
MWS    30.869809
HPL    28.780837
Name: proportion, dtype: float64

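- Note: a model with 82% accuracy can be better than one with 93% accuracy, for example when the higher score comes from overfitting or from favouring the majority class on imbalanced data. Accuracy should therefore be read together with the class distribution shown above.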