# Movie Reviews

In [12]:
import pandas as pd

data = pd.read_csv("reviews.csv")

data.head(15)

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...
5,neg,capsule : in 2176 on the planet mars police ta...
6,neg,"so ask yourself what "" 8mm "" ( "" eight millime..."
7,neg,that's exactly how long the movie felt to me ....
8,neg,call it a road trip for the walking wounded . ...
9,neg,plot : a young french boy sees his parents kil...


In [13]:
data.shape

(2000, 2)

The dataset is made up of positive and negative movie reviews.

In [14]:
data.nunique()

target        2
reviews    2000
dtype: int64

## Preprocessing

👇 Remove punctuation and lower case the text.

In [15]:
import nltk
import string
data['clean_reviews'] = data['reviews']

In [16]:
def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def apply_lower_case(text):
    return text.lower()

In [17]:
data.clean_reviews = data.clean_reviews.apply(remove_punctuation)
data.head(15)

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party d...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastards quick movie review \ndamn t...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first fe...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing p...
5,neg,capsule : in 2176 on the planet mars police ta...,capsule in 2176 on the planet mars police tak...
6,neg,"so ask yourself what "" 8mm "" ( "" eight millime...",so ask yourself what 8mm eight millimeter ...
7,neg,that's exactly how long the movie felt to me ....,thats exactly how long the movie felt to me \...
8,neg,call it a road trip for the walking wounded . ...,call it a road trip for the walking wounded \...
9,neg,plot : a young french boy sees his parents kil...,plot a young french boy sees his parents kill...


In [18]:
data.clean_reviews = data.clean_reviews.apply(apply_lower_case)
data.head(15)

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party d...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastards quick movie review \ndamn t...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first fe...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing p...
5,neg,capsule : in 2176 on the planet mars police ta...,capsule in 2176 on the planet mars police tak...
6,neg,"so ask yourself what "" 8mm "" ( "" eight millime...",so ask yourself what 8mm eight millimeter ...
7,neg,that's exactly how long the movie felt to me ....,thats exactly how long the movie felt to me \...
8,neg,call it a road trip for the walking wounded . ...,call it a road trip for the walking wounded \...
9,neg,plot : a young french boy sees his parents kil...,plot a young french boy sees his parents kill...


## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Word representation of the texts.

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [22]:
X_bow = vectorizer.fit_transform(data.clean_reviews)

X_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
pd.DataFrame(X_bow.toarray(),columns = vectorizer.get_feature_names())

Unnamed: 0,00,000,0009f,000acre,000aweek,000foot,000paltry,007,007esque,00s,...,zuko,zukovsky,zulu,zundel,zurgs,zweibel,zwick,zwicks,zwigoffs,zycie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
y = data.target

In [26]:
nb_model.fit(X_bow, y)
nb_model.score(X_bow, y)

0.974

In [27]:
cross_val_score(nb_model, X_bow, y, scoring='accuracy', cv=5).mean()

0.8145

In [62]:
from sklearn.pipeline import Pipeline

# Create Pipeline
pipeline = Pipeline([
    ('c_vect', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [63]:
pipeline.fit(data.clean_reviews, y)

Pipeline(steps=[('c_vect', CountVectorizer()), ('nb', MultinomialNB())])

In [64]:
my_text = 'I hate this one, this too bad and rubissh ! This test is definitely shiit !!'
my_text = remove_punctuation(my_text)
my_text = apply_lower_case(my_text)
#my_text = pd.DataFrame(data=[my_text])

In [65]:
my_text

'i hate this one this too bad and rubissh  this test is definitely shiit '

In [66]:
pipeline.predict([my_text])[0]

'neg'

## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Word representation of the texts.

In [28]:
vectorizer_2g = CountVectorizer(ngram_range=(2,2))

X_bow_2g = vectorizer_2g.fit_transform(data.clean_reviews)
X_bow_2g.toarray()

pd.DataFrame(X_bow.toarray(),columns = vectorizer.get_feature_names())

Unnamed: 0,00,000,0009f,000acre,000aweek,000foot,000paltry,007,007esque,00s,...,zuko,zukovsky,zulu,zundel,zurgs,zweibel,zwick,zwicks,zwigoffs,zycie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
nb_model = MultinomialNB()
nb_model.fit(X_bow_2g, y)
nb_model.score(X_bow_2g, y)

1.0

In [68]:
cross_val_score(nb_model, X_bow_2g, y, scoring='accuracy', cv=5).mean()

0.8365

In [69]:
# Create Pipeline
pipeline_2g = Pipeline([
    ('c_vect2g', CountVectorizer(ngram_range=(2,2))),
    ('nb', MultinomialNB()),
])

In [70]:
pipeline_2g.fit(data.clean_reviews, y)
pipeline_2g.predict([my_text])[0]

'neg'

⚠️ Please push the exercise once you are done 🙃

## 🏁 