# Vectorizer + NaiveBayes Tuning

🎯 The goal of this challenge is to create a Pipeline combining a Vectorizer + a NaiveBayes algorithm and to fine-tune the pipeline.

✍️ Let's reuse the previous dataset with $2000$ reviews classified either as "positive" or "negative".

In [21]:
from sklearn import preprocessing
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV

In [1]:

# Load the movie-review dataset straight from its public URL and preview
# the first rows.
REVIEWS_CSV_URL = "https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/10-Natural-Language-Processing/movie_reviews.csv"
data = pd.read_csv(REVIEWS_CSV_URL)
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [3]:
# Import the encoder class directly: the name ``preprocessing`` is rebound
# further down in this notebook to the text-cleaning function, so looking up
# ``preprocessing.LabelEncoder`` fails as soon as this cell is re-run after
# that function has been defined.
from sklearn.preprocessing import LabelEncoder

# Turn the string labels of the "target" column into integer class ids.
le = LabelEncoder()
data["target_encoded"] = le.fit_transform(data["target"])

In [4]:
# Preview the first rows to check the new "target_encoded" column.
data.head()

Unnamed: 0,target,reviews,target_encoded
0,neg,"plot : two teen couples go to a church party ,...",0
1,neg,the happy bastard's quick movie review \ndamn ...,0
2,neg,it is movies like these that make a jaded movi...,0
3,neg,""" quest for camelot "" is warner bros . ' firs...",0
4,neg,synopsis : a mentally unstable man undergoing ...,0


## Preprocessing

❓ **Question (Cleaning)** ❓

Clean your texts

In [6]:
def make_lower_case(sentence):
    """Return *sentence* converted to lower case."""
    lowered = sentence.lower()
    return lowered


def remove_punctuation(sentence):
    """Return *sentence* with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``;
    any other character (letters, digits, whitespace, non-ASCII symbols)
    is left untouched.
    """
    # One translate() pass deletes all punctuation at once instead of
    # rescanning the whole string once per punctuation character.
    deletion_table = str.maketrans("", "", string.punctuation)
    return sentence.translate(deletion_table)


def remove_numbers(sentence):
    """Return *sentence* with every ASCII digit (0-9) removed."""
    ascii_digit = re.compile(r"[0-9]")
    return ascii_digit.sub("", sentence)


def tokenize(sentence):
    """Split *sentence* into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sentence)
    return tokens

def remove_whitespance(tokenized_sentence):
    """Return the tokens of *tokenized_sentence* that are truthy.

    In practice this drops empty strings; tokens made only of whitespace
    characters are non-empty and are therefore kept.
    """
    return list(filter(None, tokenized_sentence))


def remove_stopwords(tokenized_sentence):
    """Return the tokens that are not in NLTK's English stopword list.

    The comparison is an exact string match, and token order is preserved.
    """
    english_stopwords = frozenset(stopwords.words("english"))
    return [
        token
        for token in tokenized_sentence
        if token not in english_stopwords
    ]


def lemmatize(tokenized_sentence):
    """Lemmatize each token with WordNet under four part-of-speech tags.

    Every token is lemmatized successively as a verb ("v"), a noun ("n"),
    an adjective ("a") and an adverb ("r"); each pass works on the output
    of the previous one. Returns the resulting list of tokens.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = list(tokenized_sentence)
    for pos_tag in ("v", "n", "a", "r"):
        lemmas = [lemmatizer.lemmatize(token, pos=pos_tag) for token in lemmas]
    return lemmas


def preprocessing(sentence, remove_stopwords=True):
    """Clean one raw review and return it as a single space-joined string.

    Steps, in order: lower-casing, digit removal, punctuation removal,
    tokenization, optional English stopword removal, removal of empty
    tokens, lemmatization, and re-joining the tokens with single spaces.

    Args:
        sentence: The raw text to clean.
        remove_stopwords: When true, tokens found in NLTK's English
            stopword list are dropped before lemmatization.

    Returns:
        The cleaned text.

    NOTE: this function's name is the same as the ``preprocessing`` module
    imported from sklearn at the top of the notebook; once this definition
    runs, the bare name refers to this function.
    """
    sentence = make_lower_case(sentence)
    sentence = remove_numbers(sentence)
    sentence = remove_punctuation(sentence)
    tokenized_sentence = tokenize(sentence)
    if remove_stopwords:
        # Inside this function the name ``remove_stopwords`` is the boolean
        # parameter, not the module-level helper, so calling it would raise
        # "TypeError: 'bool' object is not callable". Filter inline instead.
        english_stopwords = set(stopwords.words("english"))
        tokenized_sentence = [
            token
            for token in tokenized_sentence
            if token not in english_stopwords
        ]
    tokenized_sentence = remove_whitespance(tokenized_sentence)
    tokenized_sentence = lemmatize(tokenized_sentence)
    return " ".join(tokenized_sentence)

In [7]:
# Clean every review. Stopwords are kept at this stage; stopword filtering is
# explored later through the vectorizer's ``stop_words`` option.
data["clean_reviews"] = data["reviews"].apply(
    lambda review: preprocessing(review, remove_stopwords=False)
)

In [8]:
# Preview the first rows to compare "reviews" with the new "clean_reviews".
data.head()

Unnamed: 0,target,reviews,target_encoded,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",0,plot two teen couple go to a church party drin...
1,neg,the happy bastard's quick movie review \ndamn ...,0,the happy bastard quick movie review damn that...
2,neg,it is movies like these that make a jaded movi...,0,it be movie like these that make a jade movie ...
3,neg,""" quest for camelot "" is warner bros . ' firs...",0,quest for camelot be warner bros first feature...
4,neg,synopsis : a mentally unstable man undergoing ...,0,synopsis a mentally unstable man undergo psych...


## Tuning

❓ **Question (Pipelining a Vectorizer and a NaiveBayes Model)** ❓

* Create a Pipeline that chains a vectorizer of your choice with a NaiveBayes model
* Optimize it
* What is your best estimator?

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import set_config

# ``display`` has to be passed by keyword: the first positional parameter of
# set_config is ``assume_finite``, so set_config("diagram") would set that
# option to a truthy string instead of selecting the diagram display.
set_config(display="diagram")

# Chain a TF-IDF vectorizer with a multinomial Naive Bayes classifier.
# The step names are the prefixes of the tunable parameter names,
# e.g. "vectorization__min_df" or "model__alpha".
pipeline = Pipeline(
    steps=[
        ("vectorization", TfidfVectorizer()),
        ("model", MultinomialNB()),
    ]
)


# Last expression of the cell: displays the pipeline.
pipeline

In [14]:
# List every parameter of the pipeline; parameters of a step are exposed as
# "<step name>__<parameter name>" (see the output below).
pipeline.get_params()

{'memory': None,
 'steps': [('vectorization', TfidfVectorizer()), ('model', MultinomialNB())],
 'verbose': False,
 'vectorization': TfidfVectorizer(),
 'model': MultinomialNB(),
 'vectorization__analyzer': 'word',
 'vectorization__binary': False,
 'vectorization__decode_error': 'strict',
 'vectorization__dtype': numpy.float64,
 'vectorization__encoding': 'utf-8',
 'vectorization__input': 'content',
 'vectorization__lowercase': True,
 'vectorization__max_df': 1.0,
 'vectorization__max_features': None,
 'vectorization__min_df': 1,
 'vectorization__ngram_range': (1, 1),
 'vectorization__norm': 'l2',
 'vectorization__preprocessor': None,
 'vectorization__smooth_idf': True,
 'vectorization__stop_words': None,
 'vectorization__strip_accents': None,
 'vectorization__sublinear_tf': False,
 'vectorization__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vectorization__tokenizer': None,
 'vectorization__use_idf': True,
 'vectorization__vocabulary': None,
 'model__alpha': 1.0,
 'model__class_prior': None,

In [49]:
# English stopword list from NLTK, offered to the vectorizer as one option.
sw_eng = stopwords.words("english")

# Hyper-parameter grid; each key is "<pipeline step>__<parameter>".
params = {
    # Vectorizer: unigrams and bigrams.
    "vectorization__ngram_range": [(1, 2)],
    # Vectorizer: without and with stopword filtering.
    "vectorization__stop_words": [None, sw_eng],
    # Vectorizer: upper bound on document frequency (as a proportion).
    "vectorization__max_df": [0.95, 0.99],
    # Vectorizer: lower bound on document frequency (as a proportion).
    "vectorization__min_df": [0.01],
    # Naive Bayes: smoothing parameter.
    "model__alpha": [0.1, 0.5, 1],
}

In [50]:
# Cross-validated grid search over every combination listed in ``params``.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available processors
    verbose=2,  # print one log line per fit
    error_score="raise",  # raise if a fit fails instead of scoring it
)


# Fit on the cleaned text against the integer-encoded labels.
grid_search.fit(data["clean_reviews"], data["target_encoded"])

# Report the best mean cross-validated accuracy and the parameters behind it.
print(f"Best Score = {grid_search.best_score_}")
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[CV] END model__alpha=0.1, vectorization__max_df=0.95, vectorization__min_df=0.01, vectorization__ngram_range=(1, 2), vectorization__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',

🏁 Congratulations! You've managed to chain a Vectorizer with an NLP model and fine-tune the resulting pipeline!

💾 Don't forget to `git add/commit/push` your notebook...

🚀 ... and move on to the next challenge!