# Vectorizer Tuning

In [1]:
import pandas as pd

# Read the labelled movie reviews into a DataFrame.
data = pd.read_csv(filepath_or_buffer="reviews.csv")

# Peek at the first rows to confirm which columns were loaded.
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [2]:
import nltk
import string
# Labels to predict: the `target` column.
y = data['target']
# Start `clean_reviews` from the raw text; the cleaning step further down
# overwrites this column.
data['clean_reviews'] = data['reviews']

The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [3]:
def preprocess(text):
    """Strip ASCII punctuation from ``text`` and lowercase what remains.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        ``text`` with every character of ``string.punctuation`` removed and
        the remaining characters lowercased. Whitespace (including newlines)
        is left untouched, so removed punctuation can leave runs of spaces.
    """
    # A single translate() pass deletes all punctuation at once, and lower()
    # runs one time instead of once per punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(strip_punctuation).lower()

In [4]:
# Replace the working column with its punctuation-free, lowercased version.
data['clean_reviews'] = data['clean_reviews'].apply(preprocess)
# Show raw and cleaned text side by side as a sanity check.
data.head(n=15)

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party d...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastards quick movie review \ndamn t...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first fe...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing p...
5,neg,capsule : in 2176 on the planet mars police ta...,capsule in 2176 on the planet mars police tak...
6,neg,"so ask yourself what "" 8mm "" ( "" eight millime...",so ask yourself what 8mm eight millimeter ...
7,neg,that's exactly how long the movie felt to me ....,thats exactly how long the movie felt to me \...
8,neg,call it a road trip for the walking wounded . ...,call it a road trip for the walking wounded \...
9,neg,plot : a young french boy sees his parents kil...,plot a young french boy sees his parents kill...


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [6]:
# Build the pipeline: bag-of-words vectorizer followed by a Naive Bayes
# classifier. The step names ('cvect', 'nb') become the prefixes of the
# tunable parameters, e.g. 'cvect__ngram_range' and 'nb__alpha'.
vectorizer_step = ('cvect', CountVectorizer())
classifier_step = ('nb', MultinomialNB())
pipeline = Pipeline(steps=[vectorizer_step, classifier_step])

In [7]:
# List every tunable parameter, nested step parameters included, to find the
# `<step>__<param>` names used by the grid search.
pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('cvect', CountVectorizer()), ('nb', MultinomialNB())],
 'verbose': False,
 'cvect': CountVectorizer(),
 'nb': MultinomialNB(),
 'cvect__analyzer': 'word',
 'cvect__binary': False,
 'cvect__decode_error': 'strict',
 'cvect__dtype': numpy.int64,
 'cvect__encoding': 'utf-8',
 'cvect__input': 'content',
 'cvect__lowercase': True,
 'cvect__max_df': 1.0,
 'cvect__max_features': None,
 'cvect__min_df': 1,
 'cvect__ngram_range': (1, 1),
 'cvect__preprocessor': None,
 'cvect__stop_words': None,
 'cvect__strip_accents': None,
 'cvect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvect__tokenizer': None,
 'cvect__vocabulary': None,
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True}

In [8]:
# Search space: unigrams vs. bigrams for the vectorizer and two smoothing
# values for the Naive Bayes classifier (2 x 2 = 4 candidates).
parameters = dict(
    cvect__ngram_range=((1, 1), (2, 2)),
    nb__alpha=(0.1, 1),
)

# Exhaustive search with 5-fold cross-validation, scored on accuracy.
# refit=True retrains the best candidate on all the data passed to fit().
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

In [9]:
# Run the search on the cleaned text against the sentiment labels.
grid_search.fit(data['clean_reviews'], y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvect', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cvect__ngram_range': ((1, 1), (2, 2)),
                         'nb__alpha': (0.1, 1)},
             scoring='accuracy', verbose=1)

In [10]:
# Show the winning pipeline, i.e. the one refit with the best parameter combination.
grid_search.best_estimator_

Pipeline(steps=[('cvect', CountVectorizer(ngram_range=(2, 2))),
                ('nb', MultinomialNB(alpha=0.1))])

In [11]:
# NOTE(review): `grid_search` was fitted on these same rows with refit=True,
# so this value is accuracy on the training data, not an estimate of how the
# model generalises. The mean cross-validated accuracy of the winning
# candidate is available as `grid_search.best_score_`.
grid_search.best_estimator_.score(data['clean_reviews'], y)

1.0

In [12]:
# Hand-written review, cleaned with the same function as the training text.
my_review = preprocess('I hate this one, this too bad and rubissh ! This test is definitely shiit !!')

In [13]:
# predict() takes a collection of documents, so the single review is wrapped
# in a list and its only label is read back out.
predicted_labels = grid_search.best_estimator_.predict([my_review])
predicted_labels[0]

'neg'

In [14]:
# Second hand-written review, cleaned the same way before prediction.
another_review = 'I definitely love it.'
another_review_clean = preprocess(another_review)
grid_search.best_estimator_.predict([another_review_clean])[0]

'pos'

⚠️ Please push the exercise once you are done 🙃

## 🏁 