In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
dataset = pd.read_csv('IMDB Dataset.csv')
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
stopwords = stopwords.words('english')
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

In [5]:
def clean_text(text):
    text = re.sub('<[^<]+?>', '', text)
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    # text = [ps.stem(word) for word in tokens if word not in stopwords]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]

    return text

In [6]:
dataset['cleaned_review'] = dataset['review'].apply(lambda x: clean_text(x.lower()))

In [7]:
dataset.head(10)

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, family, little, boy, jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, matteis, love, time, money, visually,..."
5,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se..."
6,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ..."
7,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70, f..."
8,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comment, film, looking,..."
9,If you like original gut wrenching laughter yo...,positive,"[like, original, gut, wrenching, laughter, lik..."


In [8]:
train_features = dataset.loc[:30000, ['review', 'sentiment']]
train_features_df = pd.DataFrame(train_features)

test_features = dataset.loc[30000:, ['review', 'sentiment']]
test_features_df = pd.DataFrame(test_features)

In [9]:
train_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)
test_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)

In [10]:
train_features_df.shape, test_features_df.shape

((30001, 2), (20000, 2))

# Modular Approach

In [11]:
## All import 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

## Splitting data into train and test dataset

In [12]:

train_features = dataset.loc[:30000, ['review', 'sentiment']]
train_features_df = pd.DataFrame(train_features)

test_features = dataset.loc[30000:, ['review', 'sentiment']]
test_features_df = pd.DataFrame(test_features)

train_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)
test_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)

## Function to generate vectorized form of train_test dataset and vectorized form of whole dataset

In [13]:

def count_vectorizer_matrix(train_features_df, test_features_df):
    count_vector = CountVectorizer(analyzer=clean_text)
    c_train_features = count_vector.fit_transform(train_features_df['review'])
    c_test_features = count_vector.transform(test_features_df['review'])
    return c_train_features, c_test_features

def tfidf_vectorizer_matrix(train_features_df, test_features_df):
    tfidf_vector = TfidfVectorizer(analyzer=clean_text)
    tf_train_features = tfidf_vector.fit_transform(train_features_df['review'])
    tf_test_features = tfidf_vector.transform(test_features_df['review'])
    return tf_train_features, tf_test_features

def tfidf_vectorizer_whole_dataset(datatset):
    tf_vect = TfidfVectorizer(analyzer=clean_text)
    tf_X_featues = tf_vect.fit_transform(dataset['review'])
    return tf_X_featues
    
def count_vectorizer_whole_dataset(dataset):
    count_vect = CountVectorizer(analyzer=clean_text)
    count_X_featues = count_vect.fit_transform(dataset['review'])
    return count_X_featues

In [14]:
def calculate_preciosn_recall_fscore_accuracy(true_value, predicted_value):
    precison, recall, fscore, support = precision_recall_fscore_support([row for row in true_value['sentiment'].values.tolist()], predicted_value, average='binary')
    print(f'Precison - {np.round((precison * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((true_value['sentiment'].values == predicted_value).sum()/len(predicted_value))*100
    print(f"Accuracy - {accuracy}")

## Naive Bayes Approach

## Naive Bayes using Hold out dataset (Train-Test dataset)

In [15]:
def naive_bayes(train_features, train_dataframe, test_features):    
    model = MultinomialNB()
    model.fit(train_features, [row for row in train_dataframe['sentiment']])
    predicted_values = model.predict(test_features)
    return predicted_values

### Using Count Vectorizer

In [16]:
c_train_features, c_test_features = count_vectorizer_matrix(train_features_df, test_features_df)
nb_predicted_values = naive_bayes(c_train_features, train_features_df, c_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, nb_predicted_values)

Precison - 86.74
Recall - 83.7
Accuracy - 85.475


### Using TFIDF Vectorizer

In [17]:
tf_train_features, tf_test_features = tfidf_vectorizer_matrix(train_features_df, test_features_df)
nb_tf_predicted_values = naive_bayes(tf_train_features, train_features_df, tf_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, nb_tf_predicted_values)

Precison - 87.0
Recall - 84.71
Accuracy - 86.045


## Naive Bayes Using K_fold cross validation

In [18]:
def naive_bayes_k_fold(X_features, dataset):
    kfold_nb = MultinomialNB()
    kfold = KFold(n_splits=5)
    result = cross_val_score(kfold_nb, X_features, dataset['sentiment'], cv=kfold, n_jobs=-1, scoring='accuracy')
    print(f'Average Accuracy - {(result.sum()/len(result))*100}')

### Using Count Vectorizer

In [19]:
count_X_features = count_vectorizer_whole_dataset(dataset)
naive_bayes_k_fold(count_X_features, dataset)

Average Accuracy - 85.86600000000001


### Using TF-IDF Vectorizer

In [20]:
tfidf_X_features = tfidf_vectorizer_whole_dataset(dataset)
naive_bayes_k_fold(tfidf_X_features, dataset)

Average Accuracy - 86.53800000000001


## Hyper Paramter Tuning for Naive Bayes model using alpha as hyper parameter

<p>
    alpha : float, default=1.0
    Additive (Laplace/Lidstone) smoothing parameter
    (0 for no smoothing).
</p>

In [21]:
def grid_search(alpha, train_features, test_features, train_features_df, test_features_df):
    grid_nb = MultinomialNB(alpha=alpha)
    grid_nb_model = grid_nb.fit(train_features, [row for row in train_features_df['sentiment']])
    predicted_y = grid_nb_model.predict(test_features)
    precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_y, average='binary')
    print(f"For alpha - {alpha}")
    print(f'Precison - {np.round((precision * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((test_features_df['sentiment'].values == predicted_y).sum()/len(predicted_y))*100
    print(f"Accuracy - {accuracy}")
    print("*"*100)

### Using cout vectorizer 

In [22]:
for alpha in [1, 2, 3, 4, 5]:
    grid_search(alpha, c_train_features, c_test_features, train_features_df, test_features_df)

For alpha - 1
Precison - 86.74
Recall - 83.7
Accuracy - 85.475
****************************************************************************************************
For alpha - 2
Precison - 86.57
Recall - 83.6
Accuracy - 85.33500000000001
****************************************************************************************************
For alpha - 3
Precison - 86.45
Recall - 83.55
Accuracy - 85.245
****************************************************************************************************
For alpha - 4
Precison - 86.36
Recall - 83.55
Accuracy - 85.195
****************************************************************************************************
For alpha - 5
Precison - 86.17
Recall - 83.57
Accuracy - 85.1
****************************************************************************************************


### Using TF-IDF vectorizer

In [23]:
for alpha in [1, 2, 3, 4, 5]:
    grid_search(alpha, tf_train_features, tf_test_features, train_features_df, test_features_df)

For alpha - 1
Precison - 87.0
Recall - 84.71
Accuracy - 86.045
****************************************************************************************************
For alpha - 2
Precison - 87.47
Recall - 84.11
Accuracy - 86.05000000000001
****************************************************************************************************
For alpha - 3
Precison - 87.9
Recall - 83.47
Accuracy - 86.00999999999999
****************************************************************************************************
For alpha - 4
Precison - 88.02
Recall - 82.91
Accuracy - 85.835
****************************************************************************************************
For alpha - 5
Precison - 88.22
Recall - 82.54
Accuracy - 85.78
****************************************************************************************************


## Random Forest Approach

## Random Forest using Hold Out(Train/Test dataset) dataset

In [24]:
def random_forest(train_features, train_dataframe, test_features):    
    rf_model = RandomForestClassifier(n_jobs=-1)
    rf_model.fit(train_features, [row for row in train_dataframe['sentiment']])
    predicted_values = rf_model.predict(test_features)
    return predicted_values

### Using count vecorizer

In [25]:
count_rf_predicted_values = random_forest(c_train_features, train_features_df, c_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, count_rf_predicted_values)

Precison - 85.29
Recall - 85.22
Accuracy - 85.285


### Using TF-IDF Vectorizer

In [26]:
tf_rf_predicted_values = random_forest(tf_train_features, train_features_df, tf_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, tf_rf_predicted_values)

Precison - 85.39
Recall - 84.7
Accuracy - 85.125


## Random forest using K_Fold cross validation

In [27]:
def random_forest_k_fold(X_features, dataset):
    rf_kfold = RandomForestClassifier(n_jobs=-1)
    kfold = KFold(n_splits=5)
    result = cross_val_score(rf_kfold, X_features, dataset['sentiment'], cv=kfold, n_jobs=-1, scoring='accuracy')
    print(f'Average Accuracy - {(result.sum()/len(result))*100}')

### Using count vectorizer

In [28]:
random_forest_k_fold(count_X_features, dataset)

Average Accuracy - 85.54599999999999


### Using TF-IDF vectorizer

In [29]:
random_forest_k_fold(tfidf_X_features, dataset)

Average Accuracy - 85.13


## Hyper Parameter Tuning using grid search

In [30]:
def grid_search_random_forest(estimator, depth, train_features, test_features, train_features_df, test_features_df):
    grid_rf = RandomForestClassifier(n_estimators=estimator, max_depth=depth, n_jobs=-1)
    grid_rf_model = grid_rf.fit(train_features, [row for row in train_features_df['sentiment']])
    predicted_y = grid_rf_model.predict(test_features)
    precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_y, average='binary')
    print(f"For estimator - {estimator} and depth - {depth}")
    print(f'Precison - {np.round((precision * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((test_features_df['sentiment'].values == predicted_y).sum()/len(predicted_y))*100
    print(f"Accuracy - {accuracy}")
    print("*"*100)

### Using count vectorizer

In [31]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        grid_search_random_forest(est, dep, c_train_features, c_test_features, train_features_df, test_features_df)

For estimator - 100 and depth - 10
Precison - 80.66
Recall - 87.41
Accuracy - 83.25
****************************************************************************************************
For estimator - 100 and depth - 20
Precison - 81.4
Recall - 87.06
Accuracy - 83.60499999999999
****************************************************************************************************
For estimator - 100 and depth - 30
Precison - 82.93
Recall - 87.67
Accuracy - 84.83500000000001
****************************************************************************************************
For estimator - 100 and depth - 50
Precison - 83.82
Recall - 86.57
Accuracy - 84.95
****************************************************************************************************
For estimator - 100 and depth - None
Precison - 85.21
Recall - 84.8
Accuracy - 85.06
****************************************************************************************************
For estimator - 200 and depth - 10
Precison - 80.2


### Using TF-IDF vectorizer

In [32]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        grid_search_random_forest(est, dep, tf_train_features, tf_test_features, train_features_df, test_features_df)

For estimator - 100 and depth - 10
Precison - 79.55
Recall - 87.38
Accuracy - 82.485
****************************************************************************************************
For estimator - 100 and depth - 20
Precison - 81.94
Recall - 86.88
Accuracy - 83.89
****************************************************************************************************
For estimator - 100 and depth - 30
Precison - 82.63
Recall - 86.3
Accuracy - 84.10499999999999
****************************************************************************************************
For estimator - 100 and depth - 50
Precison - 83.79
Recall - 86.07
Accuracy - 84.735
****************************************************************************************************
For estimator - 100 and depth - None
Precison - 85.45
Recall - 84.68
Accuracy - 85.15
****************************************************************************************************
For estimator - 200 and depth - 10
Precison - 80.17
Recall -

## Hyper Parameter Tuning using grid search CV

In [33]:
def randomForest_Kfold_cv(estimator, depth, X_features, dataset):
    kf_rf = RandomForestClassifier(n_estimators=estimator, max_depth=depth, n_jobs=-1)
    kf = KFold(n_splits=5)
    cv_results = cross_val_score(kf_rf, X_features, dataset['sentiment'], n_jobs=-1, cv=kf)
    print(f'For estimator {estimator}  and depth {depth}')
    print(f'Average Accuracy - {(cv_results.sum()/len(cv_results))*100}')
    print("*"*100)

### Using count vectorizer

In [34]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        randomForest_Kfold_cv(est, dep, count_X_features, dataset)

For estimator 100  and depth 10
Average Accuracy - 82.38999999999999
****************************************************************************************************
For estimator 100  and depth 20
Average Accuracy - 84.27600000000001
****************************************************************************************************
For estimator 100  and depth 30
Average Accuracy - 84.80400000000002
****************************************************************************************************
For estimator 100  and depth 50
Average Accuracy - 85.23399999999998
****************************************************************************************************
For estimator 100  and depth None
Average Accuracy - 85.68599999999999
****************************************************************************************************
For estimator 200  and depth 10
Average Accuracy - 83.75399999999999
*******************************************************************************

### Using TF-IDF vectorizer

In [35]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        randomForest_Kfold_cv(est, dep, tfidf_X_features, dataset)

For estimator 100  and depth 10
Average Accuracy - 82.10000000000001
****************************************************************************************************
For estimator 100  and depth 20
Average Accuracy - 83.34400000000001
****************************************************************************************************
For estimator 100  and depth 30
Average Accuracy - 84.19000000000001
****************************************************************************************************
For estimator 100  and depth 50
Average Accuracy - 84.78000000000002
****************************************************************************************************
For estimator 100  and depth None
Average Accuracy - 85.14600000000002
****************************************************************************************************
For estimator 200  and depth 10
Average Accuracy - 82.97799999999998
*******************************************************************************

# Final Model Building

For final modeling, we will use Count vectorizer with Naive Bayes Algorithm

In [36]:
def model(train_features_df):
    count_vector = CountVectorizer(analyzer=clean_text)
    c_train_features = count_vector.fit_transform(train_features_df['review'])
    nb_model = MultinomialNB()
    nb_fit = nb_model.fit(c_train_features, [row for row in train_features_df['sentiment']])
    return count_vector, nb_fit

In [37]:
def review_classifier(review, vector, model_obj):
    review_df  = pd.DataFrame([review], columns=['review'])
    test_features = vector.transform(review_df['review'])
    result = model_obj.predict(test_features)
    if result == 1:
        print("Given review shows positive sentiment")
    elif result == -1:
        print('Given review shows negative sentiment')

In [38]:
tf_vector, nb_model = model(train_features_df)

In [39]:
test_review = '''This film has everything you want in a team up super hero film from awesome action sequences to great characters but still has some noticeable flaws.

The fight choreography was great not because of john wick level action sequences but just really fun and enjoyable action scemes that really embrace comic book fun.
The villain loki is perfect as although he may have a generic motavation, the way clashes with the heroes ideologies and has huge amounts of personality made him unforgettable. 

One of my favourite things about this movie is the sense of urgency and dread, it really does feel like anything could happen and some thing could go wrong and this is due to some great writing and acting. another thing i love is that the characters work so well with each oher and are handled so well, the best avengers (not in power but in character complexity) are all focused on more than the more boring characters which is awesome as it feels no potential was lost in the characters and it shows they actually put thought into there characters.

This movie isnt perfect though as the cinematography which was dissapointing and felt really bland and ugly to look at as it felt like they dont use there unique set design.

Overall this movie still holds up as one of the best and most unforgettable mcu experiences but was still dissapionting in some aspects.'''
test_review

'This film has everything you want in a team up super hero film from awesome action sequences to great characters but still has some noticeable flaws.\n\nThe fight choreography was great not because of john wick level action sequences but just really fun and enjoyable action scemes that really embrace comic book fun.\nThe villain loki is perfect as although he may have a generic motavation, the way clashes with the heroes ideologies and has huge amounts of personality made him unforgettable. \n\nOne of my favourite things about this movie is the sense of urgency and dread, it really does feel like anything could happen and some thing could go wrong and this is due to some great writing and acting. another thing i love is that the characters work so well with each oher and are handled so well, the best avengers (not in power but in character complexity) are all focused on more than the more boring characters which is awesome as it feels no potential was lost in the characters and it sho

In [40]:
review_classifier(test_review, tf_vector, nb_model)

Given review shows positive sentiment


In [41]:
bad_test_review = '''Oh god! I've never seen an overrated film like Tees maar Khan. Even imbd had given it something like 2.5 out of 10 overacting, flop comedic scenes and an useless script just ruined the film completely. And J can't understand, what has happened to our Indian audience?Some of the people are even saying that "if you don't like this movie you have a terrible taste!" I mean are you being serious? What can I say these types of movies perfectly suits Indian audience. Anyone who thinks this movie is great first watch movies like wolf of Wall Street, django unchained, inception and then watch these films, you'll understand the difference between a good film and a n average film'''
bad_test_review

'Oh god! I\'ve never seen an overrated film like Tees maar Khan. Even imbd had given it something like 2.5 out of 10 overacting, flop comedic scenes and an useless script just ruined the film completely. And J can\'t understand, what has happened to our Indian audience?Some of the people are even saying that "if you don\'t like this movie you have a terrible taste!" I mean are you being serious? What can I say these types of movies perfectly suits Indian audience. Anyone who thinks this movie is great first watch movies like wolf of Wall Street, django unchained, inception and then watch these films, you\'ll understand the difference between a good film and a n average film'

In [42]:
review_classifier(bad_test_review, tf_vector, nb_model)

Given review shows negative sentiment
