In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
dataset = pd.read_csv('IMDB Dataset.csv')
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
stopwords = stopwords.words('english')
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

In [5]:
def clean_text(text):
    text = re.sub('<[^<]+?>', '', text)
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    # text = [ps.stem(word) for word in tokens if word not in stopwords]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]

    return text

In [6]:
dataset['cleaned_review'] = dataset['review'].apply(lambda x: clean_text(x.lower()))

In [7]:
dataset.head(10)

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, family, little, boy, jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, matteis, love, time, money, visually,..."
5,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se..."
6,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ..."
7,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70, f..."
8,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comment, film, looking,..."
9,If you like original gut wrenching laughter yo...,positive,"[like, original, gut, wrenching, laughter, lik..."


In [8]:
train_features = dataset.loc[:30000, ['review', 'sentiment']]
train_features_df = pd.DataFrame(train_features)

test_features = dataset.loc[30000:, ['review', 'sentiment']]
test_features_df = pd.DataFrame(test_features)

In [9]:
train_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)
test_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)

In [10]:
train_features_df.shape, test_features_df.shape

((30001, 2), (20000, 2))

# Implementing Naive Bayes Algorithm

## Count Vectorizer and Naive Bayes using Hold Out(Train/Test dataset) dataset

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer = CountVectorizer(tokenizer=clean_text)
train_features = countVectorizer.fit_transform([row for row in train_features_df['review']])
train_features

<30001x150616 sparse matrix of type '<class 'numpy.int64'>'
	with 2936353 stored elements in Compressed Sparse Row format>

In [12]:
test_features = countVectorizer.transform([row for row in test_features_df['review']])

In [13]:
test_features_df['sentiment'].shape

(20000,)

In [14]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(train_features, [row for row in train_features_df['sentiment']])

MultinomialNB()

In [15]:
predicted_values = model.predict(test_features)

In [16]:
predicted_values.shape

(20000,)

In [17]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
cm = confusion_matrix([row for row in test_features_df['sentiment'].values.tolist()], predicted_values)
cm

array([[8738, 1277],
       [1628, 8357]], dtype=int64)

In [18]:
precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_values, average='binary')

In [19]:
print(f'Precison - {np.round((precision*100), 2)} \nRecall - {np.round((recall *100), 2)}')

Precison - 86.74 
Recall - 83.7


#### Accuracy 

In [20]:
np.round(((predicted_values == test_features_df['sentiment'].values).sum()/ len(test_features_df))*100, 2)

85.48

## Count Vectorizer and Naive Bayes using K-fold test dataset(Train/Test dataset) dataset

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer_kfold = CountVectorizer(analyzer=clean_text)
X_features = countVectorizer_kfold.fit_transform(dataset['review'])
X_features

<50000x210170 sparse matrix of type '<class 'numpy.int64'>'
	with 4894295 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
kfold_nb = MultinomialNB()
kfold = KFold(n_splits=5)
cross_val_score(kfold_nb, X_features, dataset['sentiment'], cv=kfold, n_jobs=-1, scoring='accuracy')

array([0.8587, 0.8631, 0.8561, 0.8564, 0.859 ])

## TFIDF Vectorizer and Naive Bayes using Hold Out(Train/Test dataset) dataset

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vector = TfidfVectorizer(analyzer=clean_text)
tf_train_features = tf_idf_vector.fit_transform(train_features_df['review'])
tf_train_features

<30001x150616 sparse matrix of type '<class 'numpy.float64'>'
	with 2936353 stored elements in Compressed Sparse Row format>

In [25]:
tf_test_features = tf_idf_vector.transform(test_features_df['review'])

In [26]:
from sklearn.naive_bayes import MultinomialNB
tf_nb = MultinomialNB()
tf_nb.fit(tf_train_features, [row for row in train_features_df['sentiment']])

MultinomialNB()

In [27]:
pred_y = tf_nb.predict(test_features)
pred_y

array([-1, -1, -1, ..., -1, -1, -1])

In [28]:
tf_cm = confusion_matrix([row for row in test_features_df['sentiment'].values.tolist()], pred_y)
tf_cm

array([[9033,  982],
       [1883, 8102]], dtype=int64)

In [29]:
tf_precision, tf_recall, tf_fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], pred_y, average='binary')

In [30]:
print(f'Precison - {np.round((tf_precision*100), 2)} \nRecall - {np.round((tf_recall *100), 2)}')

Precison - 89.19 
Recall - 81.14


#### Accuracy using TF_IDF

In [31]:
np.round(((pred_y == test_features_df['sentiment'].values).sum()/ len(test_features_df))*100, 2)

85.68

## TFIDF Vectorizer and Naive Bayes using K-Fold, cross_validation dataset

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
k_tf_vectorizer = TfidfVectorizer(analyzer=clean_text)
tf_kf_X_features = k_tf_vectorizer.fit_transform(dataset['review'])
tf_kf_X_features

<50000x210170 sparse matrix of type '<class 'numpy.float64'>'
	with 4894295 stored elements in Compressed Sparse Row format>

In [33]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
tf_kf_nb = MultinomialNB()
tf_kf_kFold = KFold(n_splits=5)
cross_val_score(tf_kf_nb, tf_kf_X_features, dataset['sentiment'], cv=tf_kf_kFold, scoring='accuracy', n_jobs=-1)

array([0.8664, 0.8724, 0.8628, 0.8594, 0.8659])

## Hyperparamter Tuning using Grid Search

In [34]:
from sklearn.naive_bayes import MultinomialNB
def grid_search(alpha, train_features, test_features, train_features_df, test_features_df):
    grid_nb = MultinomialNB(alpha=alpha)
    grid_nb_model = grid_nb.fit(train_features, [row for row in train_features_df['sentiment']])
    predicted_y = grid_nb_model.predict(test_features)
    precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_values, average='binary')
    print(f"For alpha - {alpha}")
    print(f'Precison - {np.round((precision * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((test_features_df['sentiment'].values == predicted_y).sum()/len(predicted_y))*100
    print(f"Accuracy - {accuracy}")
    print("*"*100)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def count_vectorizer_matrix(train_features_df, test_features_df):
    count_vector = CountVectorizer(analyzer=clean_text)
    c_train_features = count_vector.fit_transform(train_features_df['review'])
    c_test_features = count_vector.transform(test_features_df['review'])
    return c_train_features, c_test_features

def tfidf_vectorizer_matrix(train_features_df, test_features_df):
    tfidf_vector = TfidfVectorizer(analyzer=clean_text)
    tf_train_features = tfidf_vector.fit_transform(train_features_df['review'])
    tf_test_features = tfidf_vector.transform(test_features_df['review'])
    return tf_train_features, tf_test_features    

In [36]:
c_train_features, c_test_features = count_vectorizer_matrix(train_features_df, test_features_df)
for alpha in [1, 2, 3, 4, 5]:
    grid_search(alpha,c_train_features, c_test_features, train_features_df, test_features_df)

For alpha - 1
Precison - 86.74
Recall - 83.7
Accuracy - 85.475
****************************************************************************************************
For alpha - 2
Precison - 86.74
Recall - 83.7
Accuracy - 85.33500000000001
****************************************************************************************************
For alpha - 3
Precison - 86.74
Recall - 83.7
Accuracy - 85.245
****************************************************************************************************
For alpha - 4
Precison - 86.74
Recall - 83.7
Accuracy - 85.195
****************************************************************************************************
For alpha - 5
Precison - 86.74
Recall - 83.7
Accuracy - 85.1
****************************************************************************************************


In [37]:
tf_train_features,  tf_test_features = tfidf_vectorizer_matrix(train_features_df, test_features_df)
for alpha in [1, 2, 3, 4, 5]:
    grid_search(alpha,tf_train_features, tf_test_features, train_features_df, test_features_df)

For alpha - 1
Precison - 86.74
Recall - 83.7
Accuracy - 86.045
****************************************************************************************************
For alpha - 2
Precison - 86.74
Recall - 83.7
Accuracy - 86.05000000000001
****************************************************************************************************
For alpha - 3
Precison - 86.74
Recall - 83.7
Accuracy - 86.00999999999999
****************************************************************************************************
For alpha - 4
Precison - 86.74
Recall - 83.7
Accuracy - 85.835
****************************************************************************************************
For alpha - 5
Precison - 86.74
Recall - 83.7
Accuracy - 85.78
****************************************************************************************************


# Implementing Random Forest for movie review classification

## Count Vectorizer and Random Forest using Hold Out(Train/Test dataset) dataset

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

rf_countVect = CountVectorizer(analyzer=clean_text)
rf_c_train_features = rf_countVect.fit_transform(train_features_df['review'])
rf_c_test_features = rf_countVect.transform(test_features_df['review'])

In [40]:
c_rf = RandomForestClassifier(max_depth=20, n_jobs=-1)
c_rf_fit = c_rf.fit(rf_c_train_features, train_features_df['sentiment'])

In [41]:
c_rf_pred_y = c_rf_fit.predict(rf_c_test_features)

In [42]:
c_rf_pred_y

array([ 1, -1, -1, ...,  1,  1, -1], dtype=int64)

In [43]:
c_rf_confusion_matrix = confusion_matrix([row for row in test_features_df['sentiment'].values.tolist()], c_rf_pred_y)
c_rf_confusion_matrix

array([[8076, 1939],
       [1267, 8718]], dtype=int64)

In [44]:
c_rf_precison, c_rf_recall, c_rf_fscore, c_rf_support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], c_rf_pred_y, average='binary')
print(f'Precison - {np.round((c_rf_precison * 100), 2)}')
print(f'Recall - {np.round((c_rf_recall * 100), 2)}')
c_rf_accuracy = ((test_features_df['sentiment'].values == c_rf_pred_y).sum()/len(c_rf_pred_y))*100
print(f"Accuracy - {c_rf_accuracy}")

Precison - 81.81
Recall - 87.31
Accuracy - 83.97


## Count Vectorizer and Random Forest using K-Fold and Cross Validation

In [45]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [46]:
c_rf_kf_countVect = CountVectorizer(analyzer=clean_text)
c_rf_X_features = c_rf_kf_countVect.fit_transform(dataset['review'])
c_kf_rf = RandomForestClassifier(max_depth=20, n_jobs=-1)
c_kf_kfold = KFold(n_splits=5)
cross_val_score(c_kf_rf, c_rf_X_features, dataset['sentiment'], cv=c_kf_kfold, n_jobs=-1)

array([0.8477, 0.8381, 0.8396, 0.8416, 0.8412])

## TFIDF Vectorizer and Random Forest using Hold Out(Train/Test dataset) dataset

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_rf_tfVector = TfidfVectorizer(analyzer=clean_text)
tf_rf_train_features = tf_rf_tfVector.fit_transform(train_features_df['review'])
tf_rf_test_features = tf_rf_tfVector.transform(test_features_df['review'])

In [48]:
from sklearn.ensemble import RandomForestClassifier
tf_rf = RandomForestClassifier(max_depth=20, n_jobs=-1)
tf_rf_fit = tf_rf.fit(tf_rf_train_features, train_features_df['sentiment'])

In [49]:
tf_rf_predicted_y = tf_rf_fit.predict(tf_rf_test_features)
tf_rf_predicted_y

array([ 1, -1, -1, ..., -1, -1, -1], dtype=int64)

In [50]:
tf_rf_confusion_matrix = confusion_matrix([row for row in test_features_df['sentiment'].values.tolist()], tf_rf_predicted_y)
tf_rf_confusion_matrix

array([[8085, 1930],
       [1362, 8623]], dtype=int64)

In [51]:
tf_rf_precison, tf_rf_recall, tf_rf_fscore, tf_rf_support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], tf_rf_predicted_y, average='binary')
print(f'Precison - {np.round((tf_rf_precison * 100), 2)}')
print(f'Recall - {np.round((tf_rf_recall * 100), 2)}')
tf_rf_accuracy = ((test_features_df['sentiment'].values == tf_rf_predicted_y).sum()/len(tf_rf_predicted_y))*100
print(f"Accuracy - {tf_rf_accuracy}")

Precison - 81.71
Recall - 86.36
Accuracy - 83.54


## TFIDF Vectorizer and Random Forest using K-Fold and Cross Validation

In [52]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [53]:
tf_rf_kf_tfVect = TfidfVectorizer(analyzer=clean_text)
tf_rf_kf_X_features = tf_rf_kf_tfVect.fit_transform(dataset['review'])
tf_kf_rf = RandomForestClassifier(max_depth=20, n_jobs=-1)
tf_kf_kfold = KFold(n_splits=5)
cross_val_score(tf_kf_rf, tf_rf_kf_X_features, dataset['sentiment'], cv=tf_kf_kfold, n_jobs=-1)

array([0.8411, 0.8271, 0.8357, 0.8333, 0.8354])

## Hyper Parameter Tuning using grid search

In [54]:
from sklearn.ensemble import RandomForestClassifier
def grid_search_random_forest(estimator, depth, train_features, test_features, train_features_df, test_features_df):
    grid_rf = RandomForestClassifier(n_estimators=estimator, max_depth=depth, n_jobs=-1)
    grid_rf_model = grid_rf.fit(train_features, [row for row in train_features_df['sentiment']])
    predicted_y = grid_rf_model.predict(test_features)
    precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_values, average='binary')
    print(f"For estimator - {estimator} and depth - {depth}")
    print(f'Precison - {np.round((precision * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((test_features_df['sentiment'].values == predicted_y).sum()/len(predicted_y))*100
    print(f"Accuracy - {accuracy}")
    print("*"*100)

In [55]:
c_rf_train_features, c_rf_test_features = count_vectorizer_matrix(train_features_df, test_features_df)
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        grid_search_random_forest(est, dep, c_rf_train_features, c_rf_test_features, train_features_df, test_features_df)

For estimator - 100 and depth - 10
Precison - 86.74
Recall - 83.7
Accuracy - 82.915
****************************************************************************************************
For estimator - 100 and depth - 20
Precison - 86.74
Recall - 83.7
Accuracy - 83.67999999999999
****************************************************************************************************
For estimator - 100 and depth - 30
Precison - 86.74
Recall - 83.7
Accuracy - 84.52
****************************************************************************************************
For estimator - 100 and depth - 50
Precison - 86.74
Recall - 83.7
Accuracy - 84.77
****************************************************************************************************
For estimator - 100 and depth - None
Precison - 86.74
Recall - 83.7
Accuracy - 85.1
****************************************************************************************************
For estimator - 200 and depth - 10
Precison - 86.74
Recall - 83.7


## Hyper Parameter Tuning using gridsearchcv

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import KFold, cross_val_score

In [60]:
def randomForest_Kfold(estimator, depth, X_features, dataset):
    kf_rf = RandomForestClassifier(n_estimators=estimator, max_depth=depth)
    kf = KFold(n_splits=5)
    cv_results = cross_val_score(kf_rf, X_features, dataset['sentiment'], n_jobs=-1, cv=kf)
    print(f'For estimator {estimator}  and depth {depth}')
    print(cv_results)
    print("*"*100)

### CounterVecorizer

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
gcv_countVector = CountVectorizer(analyzer=clean_text)
gcv_c_X_features = gcv_countVector.fit_transform(dataset['review'])

In [62]:
for est in [100, 200, 300]:
    for depth in [10, 30, 50, 90, None]:
        randomForest_Kfold(est, depth, gcv_c_X_features, dataset)

For estimator 100  and depth 10
[0.8346 0.8087 0.8353 0.8339 0.8286]
****************************************************************************************************
For estimator 100  and depth 30
[0.8551 0.8472 0.8432 0.8429 0.8449]
****************************************************************************************************
For estimator 100  and depth 50
[0.8562 0.8523 0.8503 0.859  0.8532]
****************************************************************************************************
For estimator 100  and depth 90
[0.86   0.8534 0.8515 0.8538 0.853 ]
****************************************************************************************************
For estimator 100  and depth None
[0.8594 0.8557 0.8506 0.8544 0.854 ]
****************************************************************************************************
For estimator 200  and depth 10
[0.8428 0.8285 0.8379 0.8372 0.846 ]
*******************************************************************************

### TF-IDF Vectorizer

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
gcv_tfVector = TfidfVectorizer(analyzer=clean_text)
gcv_tf_X_features = gcv_tfVector.fit_transform(dataset['review'])

In [66]:
for est in [100, 200, 300]:
    for depth in [10, 30, 50, 90, None]:
        randomForest_Kfold(est, depth, gcv_tf_X_features, dataset)

For estimator 100  and depth 10
[0.8236 0.8117 0.819  0.8174 0.8277]
****************************************************************************************************
For estimator 100  and depth 30
[0.8507 0.8398 0.8413 0.8392 0.8451]
****************************************************************************************************
For estimator 100  and depth 50
[0.8527 0.8525 0.8449 0.8444 0.8455]
****************************************************************************************************
For estimator 100  and depth 90
[0.8558 0.8538 0.8473 0.8493 0.8519]
****************************************************************************************************
For estimator 100  and depth None
[0.8507 0.8562 0.8456 0.8552 0.8541]
****************************************************************************************************
For estimator 200  and depth 10
[0.8339 0.8281 0.8361 0.8364 0.843 ]
*******************************************************************************

# Modular Approach

In [102]:
## All import 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

## Splitting data into train and test dataset

In [69]:

train_features = dataset.loc[:30000, ['review', 'sentiment']]
train_features_df = pd.DataFrame(train_features)

test_features = dataset.loc[30000:, ['review', 'sentiment']]
test_features_df = pd.DataFrame(test_features)

train_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)
test_features_df['sentiment'].replace({'positive': 1, 'negative': -1}, inplace=True)

## Function to generate vectorized form of train_test dataset and vectorized form of whole dataset

In [87]:

def count_vectorizer_matrix(train_features_df, test_features_df):
    count_vector = CountVectorizer(analyzer=clean_text)
    c_train_features = count_vector.fit_transform(train_features_df['review'])
    c_test_features = count_vector.transform(test_features_df['review'])
    return c_train_features, c_test_features

def tfidf_vectorizer_matrix(train_features_df, test_features_df):
    tfidf_vector = TfidfVectorizer(analyzer=clean_text)
    tf_train_features = tfidf_vector.fit_transform(train_features_df['review'])
    tf_test_features = tfidf_vector.transform(test_features_df['review'])
    return tf_train_features, tf_test_features

def tfidf_vectorizer_whole_dataset(datatset):
    tf_vect = TfidfVectorizer(analyzer=clean_text)
    tf_X_featues = tf_vect.fit_transform(dataset['review'])
    return tf_X_featues
    
def count_vectorizer_whole_dataset(dataset):
    count_vect = CountVectorizer(analyzer=clean_text)
    count_X_featues = count_vect.fit_transform(dataset['review'])
    return count_X_featues

In [167]:
def calculate_preciosn_recall_fscore_accuracy(true_value, predicted_value):
    precison, recall, fscore, support = precision_recall_fscore_support([row for row in true_value['sentiment'].values.tolist()], predicted_value, average='binary')
    print(f'Precison - {np.round((precison * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((true_value['sentiment'].values == predicted_value).sum()/len(predicted_value))*100
    print(f"Accuracy - {accuracy}")

## Naive Bayes Approach

## Naive Bayes using Hold out dataset (Train-Test dataset)

In [168]:
def naive_bayes(train_features, train_dataframe, test_features):    
    model = MultinomialNB()
    model.fit(train_features, [row for row in train_dataframe['sentiment']])
    predicted_values = model.predict(test_features)
    return predicted_values

### Using Count Vectorizer

In [None]:
c_train_features, c_test_features = count_vectorizer_matrix(train_features_df, test_features_df)
nb_predicted_values = naive_bayes(c_train_features, train_features_df, c_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, nb_predicted_values)

### Using TFIDF Vectorizer

In [79]:
tf_train_features, tf_test_features = tfidf_vectorizer_matrix(train_features_df, test_features_df)
nb_tf_predicted_values = naive_bayes(tf_train_features, train_features_df, tf_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, nb_tf_predicted_values)

Precison - 87.0
Recall - 84.71
Accuracy - 83.54


## Naive Bayes Using K_fold cross validation

In [93]:
def naive_bayes_k_fold(X_features, dataset):
    kfold_nb = MultinomialNB()
    kfold = KFold(n_splits=5)
    result = cross_val_score(kfold_nb, X_features, dataset['sentiment'], cv=kfold, n_jobs=-1, scoring='accuracy')
    print(f'Average Accuracy - {(result.sum()/len(result))*100}')

### Using Count Vectorizer

In [94]:
count_X_features = count_vectorizer_whole_dataset(dataset)
naive_bayes_k_fold(count_X_features, dataset)

Average Accuracy - 85.86600000000001


### Using TF-IDF Vectorizer

In [95]:
tfidf_X_features = tfidf_vectorizer_whole_dataset(dataset)
naive_bayes_k_fold(tfidf_X_features, dataset)

Average Accuracy - 86.53800000000001


## Hyper Paramter Tuning for Naive Bayes model using alpha as hyper parameter

<p>
    alpha : float, default=1.0
    Additive (Laplace/Lidstone) smoothing parameter
    (0 for no smoothing).
</p>

In [96]:
def grid_search(alpha, train_features, test_features, train_features_df, test_features_df):
    grid_nb = MultinomialNB(alpha=alpha)
    grid_nb_model = grid_nb.fit(train_features, [row for row in train_features_df['sentiment']])
    predicted_y = grid_nb_model.predict(test_features)
    precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_values, average='binary')
    print(f"For alpha - {alpha}")
    print(f'Precison - {np.round((precision * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((test_features_df['sentiment'].values == predicted_y).sum()/len(predicted_y))*100
    print(f"Accuracy - {accuracy}")
    print("*"*100)

### Using cout vectorizer 

In [99]:
for alpha in [1, 2, 3, 4, 5]:
    grid_search(alpha, c_train_features, c_test_features, train_features_df, test_features_df)

For alpha - 1
Precison - 86.74
Recall - 83.7
Accuracy - 85.475
****************************************************************************************************
For alpha - 2
Precison - 86.74
Recall - 83.7
Accuracy - 85.33500000000001
****************************************************************************************************
For alpha - 3
Precison - 86.74
Recall - 83.7
Accuracy - 85.245
****************************************************************************************************
For alpha - 4
Precison - 86.74
Recall - 83.7
Accuracy - 85.195
****************************************************************************************************
For alpha - 5
Precison - 86.74
Recall - 83.7
Accuracy - 85.1
****************************************************************************************************


### Using TF-IDF vectorizer

In [100]:
for alpha in [1, 2, 3, 4, 5]:
    grid_search(alpha, tf_train_features, tf_test_features, train_features_df, test_features_df)

For alpha - 1
Precison - 86.74
Recall - 83.7
Accuracy - 86.045
****************************************************************************************************
For alpha - 2
Precison - 86.74
Recall - 83.7
Accuracy - 86.05000000000001
****************************************************************************************************
For alpha - 3
Precison - 86.74
Recall - 83.7
Accuracy - 86.00999999999999
****************************************************************************************************
For alpha - 4
Precison - 86.74
Recall - 83.7
Accuracy - 85.835
****************************************************************************************************
For alpha - 5
Precison - 86.74
Recall - 83.7
Accuracy - 85.78
****************************************************************************************************


<p> As we are increasing the laplace smoothning parameter or alpha, we are getting decrement in accuracy </p>

## Random Forest Approach

## Random Forest using Hold Out(Train/Test dataset) dataset

In [103]:
def random_forest(train_features, train_dataframe, test_features):    
    rf_model = RandomForestClassifier(n_jobs=-1)
    rf_model.fit(train_features, [row for row in train_dataframe['sentiment']])
    predicted_values = model.predict(test_features)
    return predicted_values

### Using count vecorizer

In [104]:
count_rf_predicted_values = random_forest(c_train_features, train_features_df, c_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, count_rf_predicted_values)

Precison - 86.74
Recall - 83.7
Accuracy - 83.54


### Using TF-IDF Vectorizer

In [105]:
tf_rf_predicted_values = random_forest(tf_train_features, train_features_df, tf_test_features)
calculate_preciosn_recall_fscore_accuracy(test_features_df, tf_rf_predicted_values)

Precison - 84.04
Recall - 86.22
Accuracy - 83.54


## Random forest using K_Fold cross validation

In [108]:
def random_forest_k_fold(X_features, dataset):
    rf_kfold = RandomForestClassifier(n_jobs=-1)
    kfold = KFold(n_splits=5)
    result = cross_val_score(rf_kfold, X_features, dataset['sentiment'], cv=kfold, n_jobs=-1, scoring='accuracy')
    print(f'Average Accuracy - {(result.sum()/len(result))*100}')

### Using count vectorizer

In [109]:
random_forest_k_fold(count_X_features, dataset)

Average Accuracy - 85.448


### Using TF-IDF vectorizer

In [111]:
random_forest_k_fold(tfidf_X_features, dataset)

Average Accuracy - 84.906


## Hyper Parameter Tuning using grid search

In [112]:
def grid_search_random_forest(estimator, depth, train_features, test_features, train_features_df, test_features_df):
    grid_rf = RandomForestClassifier(n_estimators=estimator, max_depth=depth, n_jobs=-1)
    grid_rf_model = grid_rf.fit(train_features, [row for row in train_features_df['sentiment']])
    predicted_y = grid_rf_model.predict(test_features)
    precision, recall, fscore, support = precision_recall_fscore_support([row for row in test_features_df['sentiment'].values.tolist()], predicted_values, average='binary')
    print(f"For estimator - {estimator} and depth - {depth}")
    print(f'Precison - {np.round((precision * 100), 2)}')
    print(f'Recall - {np.round((recall * 100), 2)}')
    accuracy = ((test_features_df['sentiment'].values == predicted_y).sum()/len(predicted_y))*100
    print(f"Accuracy - {accuracy}")
    print("*"*100)

### Using count vectorizer

In [None]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        grid_search_random_forest(est, dep, c_train_features, c_test_features, train_features_df, test_features_df)

### Using TF-IDF vectorizer

In [None]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        grid_search_random_forest(est, dep, tf_train_features, tf_test_features, train_features_df, test_features_df)

## Hyper Parameter Tuning using grid search CV

In [None]:
def randomForest_Kfold_cv(estimator, depth, X_features, dataset):
    kf_rf = RandomForestClassifier(n_estimators=estimator, max_depth=depth, n_jobs=-1)
    kf = KFold(n_splits=5)
    cv_results = cross_val_score(kf_rf, X_features, dataset['sentiment'], n_jobs=-1, cv=kf)
    print(f'For estimator {estimator}  and depth {depth}')
    print(f'Average Accuracy - {(cv_results.sum()/len(cv_results))*100}')
    print("*"*100)

### Using count vectorizer

In [None]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        randomForest_Kfold_cv(est, dep, count_X_features, dataset)

### Using TF-IDF vectorizer

In [None]:
for est in [100, 200, 300]:
    for dep in [10, 20, 30, 50, None]:
        randomForest_Kfold_cv(est, dep, tfidf_X_features, dataset)

# Final Model Building

For final modeling, we will use Count vectorizer with Naive Bayes Algorithm

In [159]:
def model(train_features_df):
    count_vector = CountVectorizer(analyzer=clean_text)
    c_train_features = count_vector.fit_transform(train_features_df['review'])
    nb_model = MultinomialNB()
    nb_fit = nb_model.fit(c_train_features, [row for row in train_features_df['sentiment']])
    return count_vector, nb_fit

In [160]:
def review_classifier(review, vector, model_obj):
    review_df  = pd.DataFrame([review], columns=['review'])
    test_features = vector.transform(review_df['review'])
    result = model_obj.predict(test_features)
    if result == 1:
        print("Given review shows positive sentiment")
    elif result == -1:
        print('Given review shows negative sentiment')

In [161]:
tf_vector, nb_model = model(train_features_df)

In [163]:
test_review = '''This film has everything you want in a team up super hero film from awesome action sequences to great characters but still has some noticeable flaws.

The fight choreography was great not because of john wick level action sequences but just really fun and enjoyable action scemes that really embrace comic book fun.
The villain loki is perfect as although he may have a generic motavation, the way clashes with the heroes ideologies and has huge amounts of personality made him unforgettable. 

One of my favourite things about this movie is the sense of urgency and dread, it really does feel like anything could happen and some thing could go wrong and this is due to some great writing and acting. another thing i love is that the characters work so well with each oher and are handled so well, the best avengers (not in power but in character complexity) are all focused on more than the more boring characters which is awesome as it feels no potential was lost in the characters and it shows they actually put thought into there characters.

This movie isnt perfect though as the cinematography which was dissapointing and felt really bland and ugly to look at as it felt like they dont use there unique set design.

Overall this movie still holds up as one of the best and most unforgettable mcu experiences but was still dissapionting in some aspects.'''
test_review

'This film has everything you want in a team up super hero film from awesome action sequences to great characters but still has some noticeable flaws.\n\nThe fight choreography was great not because of john wick level action sequences but just really fun and enjoyable action scemes that really embrace comic book fun.\nThe villain loki is perfect as although he may have a generic motavation, the way clashes with the heroes ideologies and has huge amounts of personality made him unforgettable. \n\nOne of my favourite things about this movie is the sense of urgency and dread, it really does feel like anything could happen and some thing could go wrong and this is due to some great writing and acting. another thing i love is that the characters work so well with each oher and are handled so well, the best avengers (not in power but in character complexity) are all focused on more than the more boring characters which is awesome as it feels no potential was lost in the characters and it sho

In [164]:
review_classifier(test_review, tf_vector, nb_model)

Given review shows positive sentiment


In [165]:
bad_test_review = '''Oh god! I've never seen an overrated film like Tees maar Khan. Even imbd had given it something like 2.5 out of 10 overacting, flop comedic scenes and an useless script just ruined the film completely. And J can't understand, what has happened to our Indian audience?Some of the people are even saying that "if you don't like this movie you have a terrible taste!" I mean are you being serious? What can I say these types of movies perfectly suits Indian audience. Anyone who thinks this movie is great first watch movies like wolf of Wall Street, django unchained, inception and then watch these films, you'll understand the difference between a good film and a n average film'''
bad_test_review

'Oh god! I\'ve never seen an overrated film like Tees maar Khan. Even imbd had given it something like 2.5 out of 10 overacting, flop comedic scenes and an useless script just ruined the film completely. And J can\'t understand, what has happened to our Indian audience?Some of the people are even saying that "if you don\'t like this movie you have a terrible taste!" I mean are you being serious? What can I say these types of movies perfectly suits Indian audience. Anyone who thinks this movie is great first watch movies like wolf of Wall Street, django unchained, inception and then watch these films, you\'ll understand the difference between a good film and a n average film'

In [166]:
review_classifier(bad_test_review, tf_vector, nb_model)

Given review shows negative sentiment
