In [1]:
import numpy as np
import pandas as pd

#for nlp
import nltk
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

#text vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

#metrics
from sklearn.metrics import classification_report, accuracy_score

#import method releated to evaluation
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#for graphs
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### load the text data "movie_data_cat"


In [2]:
#filename = 'movie_data_cat.csv'
#df = pd.read_csv(filename) 

# creating mini IMDB dataset

In [3]:

df = pd.read_csv("movie_data_cat.csv")
mn = np.random.rand(len(df)) < 0.1
df_mini_movie_data_cat = df[mn]

print(len(df_mini_movie_data_cat))

5030


In [4]:
#use index=false so pandas dont create a extra index column
df_mini_movie_data_cat.to_csv('mini_movie_data_cat.csv',index=False)

### load the text data "mini_movie_data_cat

In [5]:
filename = 'mini_movie_data_cat.csv'
df = pd.read_csv(filename) 

In [6]:
df.head(10)

Unnamed: 0,review,sentiment
0,"I recently bought the DVD, forgetting just how...",neg
1,I've been impressed with Chavez's stance again...,pos
2,"What else can you say about this movie,except ...",neg
3,The story is extremely unique.It's about these...,pos
4,Unlike some movies which you can wonder around...,pos
5,I just found the entire 3 DVD set at Wal-Mart ...,pos
6,It's not just that the movie is lame. It's mor...,neg
7,Eisenstein describes his collaboration with Pr...,pos
8,Being raised at the time this movie was releas...,pos
9,Going for something far away from the delibera...,pos


# Preprocessing

### Since the sentiment column happens to be categorical we can map the "pos" and "neg" classes to 0 and 1 integers

In [7]:
class_mapping = {label:idx for idx,label in enumerate(np.unique(df['sentiment']))}

print(class_mapping)

class_labels = [x for x in class_mapping] 



{'neg': 0, 'pos': 1}


In [8]:
#use the mapping dictionary to transform the class labels into integers

df['sentiment'] = df['sentiment'].map(class_mapping)
df.head(10)

Unnamed: 0,review,sentiment
0,"I recently bought the DVD, forgetting just how...",0
1,I've been impressed with Chavez's stance again...,1
2,"What else can you say about this movie,except ...",0
3,The story is extremely unique.It's about these...,1
4,Unlike some movies which you can wonder around...,1
5,I just found the entire 3 DVD set at Wal-Mart ...,1
6,It's not just that the movie is lame. It's mor...,0
7,Eisenstein describes his collaboration with Pr...,1
8,Being raised at the time this movie was releas...,1
9,Going for something far away from the delibera...,1


### Exploreing various cell

In [9]:
df.loc[1200, 'review']#[-50:]



### Regular expressions to clean text

In [10]:
df.head(5)

Unnamed: 0,review,sentiment
0,"I recently bought the DVD, forgetting just how...",0
1,I've been impressed with Chavez's stance again...,1
2,"What else can you say about this movie,except ...",0
3,The story is extremely unique.It's about these...,1
4,Unlike some movies which you can wonder around...,1


In [11]:
#import regular expressions to clean up the text
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove all html markup
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) # findall the emoticons
    
    # remove the non-word chars '[\W]+'
    # append the emoticons to end 
    #convert all to lowercase
    # remove nose char for consistency
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', '')) 
    return text

### Apply the clean data preprocessor to the text

In [12]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

### apply the preprocessor to the entire dataframe (i.e. column review)

In [13]:
# apply the preprocessor to the entire dataframe (i.e. column review)
df['review'] = df['review'].apply(preprocessor)

In [14]:
df.head(5)

Unnamed: 0,review,sentiment
0,i recently bought the dvd forgetting just how ...,0
1,i ve been impressed with chavez s stance again...,1
2,what else can you say about this movie except ...,0
3,the story is extremely unique it s about these...,1
4,unlike some movies which you can wonder around...,1


In [15]:
# download the stopwords if not done before (need an Internet connection)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nadim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stop = set(stopwords.words('english'))
def stop_removal(text):
       return [w for w in text if not w in stop]

In [17]:
text = "This is a sample sentence, demonstrating the removal of stop words."
stopped_text = stop_removal(text.split())
print(stopped_text) 

['This', 'sample', 'sentence,', 'demonstrating', 'removal', 'stop', 'words.']


### basic text pre-processing pipeline

In [18]:
#The basic pipeline includes stopword removal, tokenising and stemming
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def tokenizer(text):
       return text.split()

def tokenizer_stemmer(text):
    return [stemmer.stem(word) for word in tokenizer(text)]#text.split()]


def stop_removal(text):
       return [w for w in text if not w in stop]

In [19]:
df.loc[180, 'review']

'after watching a dozen episodes i decided to give up on this show since it depicts in an unrealistic manner what is mathematical modeling in the episodes that charlie would predict the future behavior of individuals using mathematical models i thought that my profession was being joked about i am not a mathematician instead a chemical engineer but i do work a lot with mathematical models so i will try to explain to the layman why what is shown is close to make believe of fairy tales first choosing the right model to predict a situation is a demanding task charlie eppes is shown as a genius but even him would have to spend considerable time researching for a suitable model specifically for trying to guess what someone will do or where he will be in the near future individuals are erratic and haphazard there is no modeling for them isaac asimov even wrote about that in the 1950 s even if there were a model for specific kind of individual it would be a probabilistic stoichastic one meani

###  Vectorisation of text data
Next lets prepare the data using the CountVectorizer to parse the text data into a bag-of-words model.
Thereafter fit a sklearn calssifier.
First we start by creating a basic train_test_split to check that the data is trasnformed correctly before setting up a comparative study.

In [20]:
from sklearn.model_selection import train_test_split

X = df.loc[:, 'review'].values
y = df.loc[:, 'sentiment'].values

text_train, text_test, y_train, y_test = train_test_split(X, y, 
                                                          random_state=42,
                                                          test_size=0.25,
                                                          stratify=y)

### CountVectorizer

In [21]:
#This method converts the text into count vector or a binary vector
vectorizer = CountVectorizer()
vectorizer.fit(X) # Learn a vocabulary dictionary of all tokens in the raw documents.

X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

In [22]:
print(X_train.shape)
print(X_test.shape)

(3772, 39482)
(1258, 39482)


In [23]:
print('Sice of the vocabulary or the number of features in the vector ', len(vectorizer.vocabulary_))

Sice of the vocabulary or the number of features in the vector  39482


In [24]:
print(vectorizer.get_feature_names()[2000:2020]) #Array mapping from feature integer indices to feature name

['apartments', 'apathetic', 'apathy', 'ape', 'apeman', 'apes', 'apex', 'aphasia', 'aphorism', 'aphorisms', 'aphrodite', 'aping', 'apke', 'aplenty', 'aplogise', 'aplomb', 'apocalypse', 'apocalyptic', 'apogee', 'apollo']


### Strip HTML and punctuation to speed up the GridSearch later

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

### smaller sample
X_train = df.loc[:2500, 'review'].values
y_train = df.loc[:2500, 'sentiment'].values

# Training  Classifiers on Text Features

# Training KNN Classifier on Text Features

In [25]:
from sklearn.neighbors import KNeighborsClassifier
clf_KNN =  KNeighborsClassifier()
clf_KNN

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [26]:
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [27]:
clf_KNN.score(X_test, y_test)

0.6049284578696343

# Training ANN MLP Classifier on Text Features

In [28]:
from sklearn.neural_network import MLPClassifier
clf_MLP =  MLPClassifier()
clf_MLP

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [29]:
clf_MLP.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [30]:
clf_MLP.score(X_test, y_test)

0.8688394276629571

# Training naive_bayes MultinomialNB Classifier on Text Features

In [31]:
from sklearn.naive_bayes import MultinomialNB
clf_NBM = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf_NBM


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
clf_NBM.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
clf_NBM.score(X_test, y_test)

0.8267090620031796

# Setting up a pipeline 
In the previous example we carried out several steps involving: count vectorising; tfidf transforming and then applying this to both the train and test before fitting a classifier for preditcion. 
This pipeline of trnasformation steps and the final prediction can be carried out by setting up a pipeline.
Instead of using the transformed vectors of X_train and Y_train ; we will use the original train and test which contained the text data i.e. text_train and text_test. These can then be sent through the transformation piepline steps. 

### KNeighborsClassifier

In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    KNeighborsClassifier())

pipeline.fit(text_train, y_train)
y_pred = pipeline.predict(text_test)
print('accuracy %s' % pipeline.score(text_test, y_test))
print(classification_report(y_test, y_pred,target_names=class_labels))

accuracy 0.6891891891891891
             precision    recall  f1-score   support

        neg       0.67      0.72      0.70       621
        pos       0.71      0.66      0.68       637

avg / total       0.69      0.69      0.69      1258



### Neural_network MLPClassifier

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    MLPClassifier())

pipeline.fit(text_train, y_train)
y_pred = pipeline.predict(text_test)
print('accuracy %s' % pipeline.score(text_test, y_test))
print(classification_report(y_test, y_pred,target_names=class_labels))

accuracy 0.8656597774244833
             precision    recall  f1-score   support

        neg       0.87      0.86      0.86       621
        pos       0.86      0.87      0.87       637

avg / total       0.87      0.87      0.87      1258



### Naive_bayes MultinomialNB

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))


pipeline.fit(text_train, y_train)
y_pred = pipeline.predict(text_test)
print('accuracy %s' % pipeline.score(text_test, y_test))
print(classification_report(y_test, y_pred,target_names=class_labels))

accuracy 0.8537360890302067
             precision    recall  f1-score   support

        neg       0.83      0.88      0.86       621
        pos       0.88      0.83      0.85       637

avg / total       0.85      0.85      0.85      1258



# Setting up a gridsearch with Cross Validation
now we will set u the grid search for there chosen classifiers.

# Grid Search for KNN 

### Search for the best params for optimal K value For KNN

In [37]:
print(KNeighborsClassifier().get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': 1, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [38]:
param_grid = [
        {
        'n_neighbors':  list(range(3,15))
        }
       ]

gs = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
gs.fit(X_train,y_train)
y_pred = gs.predict(X_test)

clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

Test accuracy: 0.616


In [39]:
print(gs.best_params_ )

{'n_neighbors': 12}


### Setting up tfidf Grid search for  KNN with optimal params value

In [45]:
param_grid = [{'tfidfvectorizer__ngram_range': [(1, 1)], #can also extract 2-grams of words in addition to the 1-grams (individual words)
               'tfidfvectorizer__stop_words': [stop, None], # use the stop dictionary of stopwords or not
               'tfidfvectorizer__max_features': [1000, 4000], # use the stop dictionary of stopwords or not
               'tfidfvectorizer__tokenizer': [tokenizer_stemmer]}, # use a tokeniser and the stemmer 
               ]

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        #max_features=4000,
                        min_df=7,
                        preprocessor=None)

pipeline = make_pipeline(TfidfVectorizer(strip_accents=None, lowercase=False, min_df=7, preprocessor=None), 
                         KNeighborsClassifier(n_neighbors=12))

Knn_tfidf = GridSearchCV(pipeline, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=1) 

In [46]:
#we split dataset into 2 parts, to form the test and training. 
#This will ensure that the cross validation takes place on the training data and final accuracy on the test.
text_train, text_test, y_train, y_test = train_test_split(X, y, 
                                                          random_state=42,
                                                          test_size=0.25,
                                                          stratify=y)
Knn_tfidf.fit(text_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  8.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=7,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_...wski',
           metric_params=None, n_jobs=1, n_neighbors=12, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'tfidfvectorizer__ngram_range': [(1, 1)], 'tfidfvectorizer__stop_words': [{'an', "shan't", 'hasn', 'who', 'by', 'its', 'y', 'am', "don't", 'only', 'them', "mightn't", 'about', 'are', 'so', 'those', 'didn', 'yourself', 'does', 'doing', 'too', 'shouldn', 'some', 'to', "hadn't", 'me', 'won...: [1000, 4000], 'tfidfvectorizer__tokenizer': [<function tokenizer_stemmer at 0x000001C50749B400>]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

### Best tfidf params for KNN

In [47]:
print (Knn_tfidf.best_params_ )

{'tfidfvectorizer__max_features': 4000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': {'an', "shan't", 'hasn', 'who', 'by', 'its', 'y', 'am', "don't", 'only', 'them', "mightn't", 'about', 'are', 'so', 'those', 'didn', 'yourself', 'does', 'doing', 'too', 'shouldn', 'some', 'to', "hadn't", 'me', 'won', 't', 'ourselves', 'whom', 'd', 'not', 'the', 'for', 'll', 'mightn', 'themselves', "you'd", "weren't", 'we', 'what', 'down', 'm', 'is', 'ma', 'you', 'shan', 'or', 'through', 'weren', 'these', "doesn't", 'both', "you'll", 'she', 'he', 'on', "you're", 'itself', 'against', 'which', 'was', 'this', 'herself', "couldn't", "didn't", "mustn't", 'at', 'having', 'no', 'did', 'should', 'most', 'hadn', 'o', 'between', "wasn't", "wouldn't", 'i', 'any', "won't", 'where', 'now', 'wasn', 'and', 'above', 'couldn', 'her', 've', 'under', "that'll", 'were', 'if', "hasn't", 'here', 'yourselves', 'such', 'further', 'other', 'wouldn', 'our', 'ours', 'him', 'out', "aren't", "haven't", "you

### Best esimator for KNN

In [48]:
print ('Best Score :',Knn_tfidf.best_score_ )

clf_KNN = Knn_tfidf.best_estimator_
clf_KNN.fit(text_test,y_test)
print('Test accuracy: %.3f' % clf_KNN.score(text_test, y_test))

print('Best Estimator : ',Knn_tfidf.best_estimator_.score(text_test, y_test))

Best Score : 0.7277306468716861
Test accuracy: 0.812
Best Estimator :  0.8116057233704292


### Classification report for KNN

In [49]:
from sklearn.metrics import classification_report
y_true=y_test 
y_pred=y_pred
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.63      0.54      0.58       621
          1       0.61      0.69      0.65       637

avg / total       0.62      0.62      0.61      1258



# Grid search for ANN MLP Classifier

### Find the optimal best params for ANN MLP Classifier

In [50]:
param_grid = [
        {
            'activation' : ['identity', 'logistic', 'relu'], 
            'solver' : ['lbfgs', 'adam'],
            'hidden_layer_sizes': [(5,),(10,),(15,)] # a single hidden layer
        }
       ]

ANN_gs = GridSearchCV(MLPClassifier(), param_grid, cv=10, scoring='accuracy')
ANN_gs.fit(X_train,y_train)
y_pred = ANN_gs.predict(X_test)

clf = ANN_gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))



Test accuracy: 0.870


In [51]:
print(ANN_gs.best_params_ )

{'activation': 'identity', 'hidden_layer_sizes': (5,), 'solver': 'adam'}


### setting up the  Grid search for ANN MLPclssifier with the best params

In [52]:
from sklearn.metrics import precision_score, recall_score, f1_score

#pipe.fit(X_train, y_train)
#y_pred = pipe.predict(X_test)

param_grid = [{'tfidfvectorizer__ngram_range': [(1, 1)], #can also extract 2-grams of words in addition to the 1-grams (individual words)
               'tfidfvectorizer__stop_words': [stop, None], # use the stop dictionary of stopwords or not
               'tfidfvectorizer__max_features': [1000, 4000], # use the stop dictionary of stopwords or not
               'tfidfvectorizer__tokenizer': [tokenizer_stemmer],}, # use a tokeniser and the stemmer 
               ]



tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        #max_features=4000,
                        min_df=7,
                        preprocessor=None)

pipeline = make_pipeline(TfidfVectorizer(strip_accents=None, lowercase=False, min_df=7, preprocessor=None), 
                         MLPClassifier(solver='adam',hidden_layer_sizes=(5,),activation='identity'))




Ann_tfidf = GridSearchCV(pipeline, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=1
                       ) 


In [53]:
#we split dataset into 2 parts, to form the test and training. 
#This will ensure that the cross validation takes place on the training data and final accuracy on the test.
text_train, text_test, y_train, y_test = train_test_split(X, y, 
                                                          random_state=42,
                                                          test_size=0.25,
                                                          stratify=y)
Ann_tfidf.fit(text_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 10.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=7,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'tfidfvectorizer__ngram_range': [(1, 1)], 'tfidfvectorizer__stop_words': [{'an', "shan't", 'hasn', 'who', 'by', 'its', 'y', 'am', "don't", 'only', 'them', "mightn't", 'about', 'are', 'so', 'those', 'didn', 'yourself', 'does', 'doing', 'too', 'shouldn', 'some', 'to', "hadn't", 'me', 'won...: [1000, 4000], 'tfidfvectorizer__tokenizer': [<function tokenizer_stemmer at 0x000001C50749B400>]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

### Best tfidf params for ANN MLP

In [54]:
print(Ann_tfidf.best_params_ )

{'tfidfvectorizer__max_features': 4000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': None, 'tfidfvectorizer__tokenizer': <function tokenizer_stemmer at 0x000001C50749B400>}


### best esimator for ANN MLP

In [55]:
print ('Best Score :',Ann_tfidf.best_score_ )

clf_Ann = Ann_tfidf.best_estimator_

clf_Ann.fit(text_test,y_test)
print('Test accuracy: %.3f' % clf_Ann.score(text_test, y_test))

print('Best Estimator : ',Ann_tfidf.best_estimator_.score(text_test, y_test))

Best Score : 0.8356309650053022




Test accuracy: 1.000
Best Estimator :  1.0


### Classification report for ANN MLP

In [56]:
from sklearn.metrics import classification_report
y_true=y_test 
y_pred=y_pred
print(classification_report(y_true, y_pred))


             precision    recall  f1-score   support

          0       0.87      0.88      0.87       621
          1       0.88      0.87      0.87       637

avg / total       0.87      0.87      0.87      1258



# Grid search for Naive Bayes MB

### Finding optimal param values for Naive Bayes MB

In [57]:
print(MultinomialNB().get_params())

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}


In [58]:
param_grid = [
        {
            'alpha': (1,2,3,4,5),
            'fit_prior': (True, False),
            'class_prior':[(1, 1), (1, 2)],
          
        }
       ]

NBM_gs = GridSearchCV(MultinomialNB(), param_grid, cv=10, scoring='accuracy')
NBM_gs.fit(X_train,y_train)
y_pred = NBM_gs.predict(X_test)

clf = NBM_gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

Test accuracy: 0.836


In [59]:
print(NBM_gs.best_params_ )

{'alpha': 5, 'class_prior': (1, 2), 'fit_prior': True}


### setting up Grid search for Naive Bayes MB with optimal values

In [60]:
param_grid = [{'tfidfvectorizer__ngram_range': [(1, 1)], #can also extract 2-grams of words in addition to the 1-grams (individual words)
               'tfidfvectorizer__stop_words': [stop, None], # use the stop dictionary of stopwords or not
               'tfidfvectorizer__max_features': [1000, 4000], # use the stop dictionary of stopwords or not
               'tfidfvectorizer__tokenizer': [tokenizer_stemmer]}, # use a tokeniser and the stemmer 
               ]

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        #max_features=4000,
                        min_df=7,
                        preprocessor=None)

pipeline = make_pipeline(TfidfVectorizer(strip_accents=None, lowercase=False, min_df=7, preprocessor=None), 
                         MultinomialNB(alpha=5.0, class_prior=(1,2), fit_prior=True))

NB_tfidf = GridSearchCV(pipeline, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=1) 

In [61]:
#we split dataset into 2 parts, to form the test and training. 
#This will ensure that the cross validation takes place on the training data and final accuracy on the test.
text_train, text_test, y_train, y_test = train_test_split(X, y, 
                                                          random_state=42,
                                                          test_size=0.25,
                                                          stratify=y)
NB_tfidf.fit(text_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  8.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=7,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_... vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=5.0, class_prior=(1, 2), fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'tfidfvectorizer__ngram_range': [(1, 1)], 'tfidfvectorizer__stop_words': [{'an', "shan't", 'hasn', 'who', 'by', 'its', 'y', 'am', "don't", 'only', 'them', "mightn't", 'about', 'are', 'so', 'those', 'didn', 'yourself', 'does', 'doing', 'too', 'shouldn', 'some', 'to', "hadn't", 'me', 'won...: [1000, 4000], 'tfidfvectorizer__tokenizer': [<function tokenizer_stemmer at 0x000001C50749B400>]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

### Best params for Naive Bayes MB

In [62]:
print (NB_tfidf.best_params_ )

{'tfidfvectorizer__max_features': 1000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': {'an', "shan't", 'hasn', 'who', 'by', 'its', 'y', 'am', "don't", 'only', 'them', "mightn't", 'about', 'are', 'so', 'those', 'didn', 'yourself', 'does', 'doing', 'too', 'shouldn', 'some', 'to', "hadn't", 'me', 'won', 't', 'ourselves', 'whom', 'd', 'not', 'the', 'for', 'll', 'mightn', 'themselves', "you'd", "weren't", 'we', 'what', 'down', 'm', 'is', 'ma', 'you', 'shan', 'or', 'through', 'weren', 'these', "doesn't", 'both', "you'll", 'she', 'he', 'on', "you're", 'itself', 'against', 'which', 'was', 'this', 'herself', "couldn't", "didn't", "mustn't", 'at', 'having', 'no', 'did', 'should', 'most', 'hadn', 'o', 'between', "wasn't", "wouldn't", 'i', 'any', "won't", 'where', 'now', 'wasn', 'and', 'above', 'couldn', 'her', 've', 'under', "that'll", 'were', 'if', "hasn't", 'here', 'yourselves', 'such', 'further', 'other', 'wouldn', 'our', 'ours', 'him', 'out', "aren't", "haven't", "you

### best esimator for Naive Bayes MB

In [63]:
print ('Best Score :',NB_tfidf.best_score_ )

clf_NB = NB_tfidf.best_estimator_
clf_NB.fit(text_test,y_test)
print('Test accuracy: %.5f' % clf_NB.score(text_test, y_test))

print('Best Estimator : ',NB_tfidf.best_estimator_.score(text_test, y_test))

Best Score : 0.6540296924708378
Test accuracy: 0.63514
Best Estimator :  0.6351351351351351


### Classification report for Naive Bayes MB

In [64]:
from sklearn.metrics import classification_report
y_true=y_test 
y_pred=y_pred
print(classification_report(y_true,y_pred))

             precision    recall  f1-score   support

          0       0.83      0.84      0.83       621
          1       0.84      0.83      0.84       637

avg / total       0.84      0.84      0.84      1258

