In [None]:
# Building a Moview Review Classification Model

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import random
from nltk.corpus import stopwords
nltk.download('movie_reviews')
import pandas as pd
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [None]:
sentiment_list = []
for s in movie_reviews.categories():
    sentiment_list.append(s)

In [None]:
# Prepare a dataframe
data = []

for s in sentiment_list:
    for file in movie_reviews.fileids(categories=s):
            data.append({"mov": movie_reviews.raw(file) , "lbl":s})

random.shuffle(data)

mov_data = pd.DataFrame(data)

In [None]:
mov_data.head()

Unnamed: 0,mov,lbl
0,what happens when you put martin lawrence in a...,neg
1,"take two old and dying men , a lifetime of reg...",neg
2,spoiled rich kid kelley morse ( chris klein ) ...,neg
3,jackie chan kicks his way into van damme terri...,neg
4,""" i'd rather die today and go to heaven than ...",pos


In [None]:
# Feature Extraction
nltk.download('punkt')
total_toks = []
for reviews in mov_data["mov"]:
    tokens = word_tokenize(reviews)
    total_toks.extend(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
tok_freq = FreqDist(total_toks)
# print(tok_freq.most_common(15))
word_features = list(tok_freq.keys())[:3000]
stop_words = set(stopwords.words('english'))
final_features = []
for w in word_features:
     if w not in stop_words:
          final_features.append(w)

In [None]:
word_features_df = pd.DataFrame(columns=final_features)


In [None]:
word_features_df.head()

Unnamed: 0,happens,put,martin,lawrence,fat,suit,real,life,?,get,...,displayed,gift,mix,fashion,tongue-in-cheek,futuristic,military,recruitment,ad,downhill


In [None]:
df = pd.concat([mov_data, word_features_df],axis=1)

In [None]:
df.head()

Unnamed: 0,mov,lbl,happens,put,martin,lawrence,fat,suit,real,life,...,displayed,gift,mix,fashion,tongue-in-cheek,futuristic,military,recruitment,ad,downhill
0,what happens when you put martin lawrence in a...,neg,,,,,,,,,...,,,,,,,,,,
1,"take two old and dying men , a lifetime of reg...",neg,,,,,,,,,...,,,,,,,,,,
2,spoiled rich kid kelley morse ( chris klein ) ...,neg,,,,,,,,,...,,,,,,,,,,
3,jackie chan kicks his way into van damme terri...,neg,,,,,,,,,...,,,,,,,,,,
4,""" i'd rather die today and go to heaven than ...",pos,,,,,,,,,...,,,,,,,,,,


In [None]:
df.loc[2]

mov            spoiled rich kid kelley morse ( chris klein ) ...
lbl                                                          neg
happens                                                      NaN
put                                                          NaN
martin                                                       NaN
                                     ...                        
futuristic                                                   NaN
military                                                     NaN
recruitment                                                  NaN
ad                                                           NaN
downhill                                                     NaN
Name: 2, Length: 2883, dtype: object

In [None]:
df.mov[2]

'spoiled rich kid kelley morse ( chris klein ) receives a new mercedes for a graduation present . \nhe and his buddies take it for a joyride to a small nearby town , where he proceeds to torment the locals simply because he\'s rich and they\'re not . \nhe ends up provoking jasper ( josh hartnett ) into a race and as a result , the local gas station and diner are destroyed when they crash into it . \nkelley is sentenced to rebuild the diner , and has to live with jasper in a spare room over his family\'s barn . \njasper\'s girlfriend sam ( leelee sobieski ) soon takes a liking to kelley however , despite the fact that 1 ) it was her family\'s diner that was destroyed and b ) all kelley does is sit around , sulk , and smart off to the townspeople . \nbut when she sees him sweaty and shirtless , that\'s apparently all she needs to lose her feelings for jasper ( whose only fault seems to be a perpetual case of hat hair ) . \nso sobieski soon falls in love with kelley , especially after the

In [None]:
def extract_features(df):
  for index, rows in df.iterrows():
    tokens = word_tokenize(rows.mov)
    for w in final_features:
      if w in tokens:
        rows[w] = 1
      else:
        rows[w] = 0
  return df

In [None]:
new_df = extract_features(df)

In [None]:
new_df.head()

Unnamed: 0,mov,lbl,happens,put,martin,lawrence,fat,suit,real,life,...,displayed,gift,mix,fashion,tongue-in-cheek,futuristic,military,recruitment,ad,downhill
0,what happens when you put martin lawrence in a...,neg,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,"take two old and dying men , a lifetime of reg...",neg,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,spoiled rich kid kelley morse ( chris klein ) ...,neg,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,jackie chan kicks his way into van damme terri...,neg,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,""" i'd rather die today and go to heaven than ...",pos,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y_train = new_df["lbl"]
x_train = new_df.drop(columns = ["lbl"])

In [None]:
print(type(x_train))
print(type(y_train))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [None]:
y_train.head()

0    neg
1    neg
2    neg
3    neg
4    pos
Name: lbl, dtype: object

In [None]:
print(x_train.shape)

(2000, 2882)


In [None]:
print(y_train.shape)

(2000,)


In [None]:
x_train.drop(['mov'],inplace=True,axis=1)
# this needs to be removed here because then it will be treated as features. This becomes categorical data that we havent handled.

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state= 42)

In [None]:
print(Y_test.shape)
print(X_test.shape)
print(X_train.shape)
print(Y_train.shape)

(400,)
(400, 2881)
(1600, 2881)
(1600,)


In [None]:
X_train.head()

Unnamed: 0,happens,put,martin,lawrence,fat,suit,real,life,?,get,...,displayed,gift,mix,fashion,tongue-in-cheek,futuristic,military,recruitment,ad,downhill
968,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
819,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
692,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
420,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
print(Y_train.head(15))

968     neg
240     neg
819     pos
692     neg
420     pos
1085    neg
1998    neg
365     neg
1022    pos
1240    neg
588     pos
736     pos
1761    pos
941     pos
71      pos
Name: lbl, dtype: object


In [None]:
print(Y_test.head(15))

1860    neg
353     pos
1333    neg
905     neg
1289    pos
1273    pos
938     pos
1731    neg
65      neg
1323    pos
56      neg
1292    pos
1118    pos
584     neg
374     neg
Name: lbl, dtype: object


In [None]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

model = classifier.fit(X_train, Y_train)

In [None]:
preds = classifier.predict(X_test)
print("The predictions are:\n", preds[:15])

The predictions are:
 ['neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'pos' 'neg' 'neg']


In [None]:
print("The actual labels are: \n", Y_test[:15])

The actual labels are: 
 1860    neg
353     pos
1333    neg
905     neg
1289    pos
1273    pos
938     pos
1731    neg
65      neg
1323    pos
56      neg
1292    pos
1118    pos
584     neg
374     neg
Name: lbl, dtype: object


In [None]:
print(X_test.head())

     happens put martin lawrence fat suit real life  ? get  ... displayed  \
1860       1   0      0        0   0    0    0    0  0   0  ...         0   
353        0   1      0        0   0    0    0    1  0   1  ...         0   
1333       0   1      0        0   0    0    1    0  0   0  ...         0   
905        0   0      0        0   0    0    0    1  0   0  ...         0   
1289       0   0      0        0   0    0    0    1  0   1  ...         0   

     gift mix fashion tongue-in-cheek futuristic military recruitment ad  \
1860    0   0       0               0          0        0           0  0   
353     0   0       0               0          0        0           0  0   
1333    0   0       0               0          0        0           0  0   
905     0   1       0               0          0        0           0  0   
1289    0   0       0               0          0        0           0  0   

     downhill  
1860        0  
353         0  
1333        0  
905         0  


In [None]:
# Finding accuracy of our Naive Bayes classifier:
from sklearn.metrics import accuracy_score
print("Accuracy of our Naive Bayes classifier is:", accuracy_score(Y_test, preds) *100)

Accuracy of our Naive Bayes classifier is: 59.75


In [None]:
# In scikit-learn, the Gaussian Naive Bayes classifier (GaussianNB) does not have coefficients like linear models such as logistic regression.
# Naive Bayes classifiers do not have direct coefficients that indicate feature importance.

# The Naive Bayes classifier estimates class probabilities using the training data and calculates the likelihood of each feature given each class.
# These probabilities are combined using Bayes' theorem to make predictions.

# Therefore, you cannot retrieve coefficients from the GaussianNB classifier directly.
# Instead, you can access the class priors and class conditional probabilities estimated by the model.

In [None]:
class_priors = classifier.class_prior_

In [None]:
class_conditional_probs = classifier.theta_

In [None]:
print(classifier.classes_)

['neg' 'pos']


In [None]:
# Print out the class priors
print("Class priors:", class_priors)

# Print out the class conditional probabilities
print("Class conditional probabilities:", class_conditional_probs)

Class priors: [0.49625 0.50375]
Class conditional probabilities: [[0.11712846 0.18639798 0.04156171 ... 0.00125945 0.02896725 0.0163728 ]
 [0.0955335  0.1662531  0.06699752 ... 0.00124069 0.01861042 0.00496278]]


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression()

In [None]:
logreg.fit(X_train, Y_train)

In [None]:
y_pred = logreg.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy of the Log Reg model is :", accuracy)

Accuracy of the Log Reg model is : 0.7825
