In [17]:
import numpy as np 
import pandas as pd 
df = pd.read_csv('IMDB.csv')
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [19]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2))
text_count_matrix = tfidf.fit_transform(df.review)

In [21]:
#splitting the complete dataset in test and training dataset:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(text_count_matrix, df.sentiment, test_size=0.30, random_state=2)

In [22]:
#converting the sentiments (positive and negatives) to 1 and 0. 
y_train = (y_train.replace({'positive': 1, 'negative': 0})).values
y_test = (y_test.replace({'positive': 1, 'negative': 0})).values

In [23]:
# let's use Naive Bayes classifier and fit our model:
from sklearn.naive_bayes import MultinomialNB 
MNB = MultinomialNB()
MNB.fit(x_train, y_train)
#4. Evaluating the model
from sklearn import metrics
accuracy_score = metrics.accuracy_score(MNB.predict(x_test), y_test)
print("accuracy_score without data pre-processing = " + str('{:04.2f}'.format(accuracy_score*100))+" %")

 #lemmatized_sentence = " ".join(lemmatizer.lemmatize(token) for token in word_tokens if token not in stop_words)

accuracy_score without data pre-processing = 89.17 %


In [24]:
#let's investigate what kind of special characters and language is used by the reviewers to review the content. 
#we can observe some html tags
#use of parenthesis
#punctuation (apostrophy, '' e.t.c)
import re
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()
processed_review = []
single_review = "string to iniialize <br /> my email id is charilie@waoow.com. You can also reach to me at charlie's "
reviews = df.review
for review in range(0,50000):
    single_review = df.loc[review,'review']
    #start processing the single_review #removing html tags:
    single_review = re.sub('<.*?>',' ',single_review)
    #removing special characters (punctuation) '@,!' e.t.c.
    single_review = re.sub('\W',' ',single_review)
    #removing single characters
    single_review = re.sub('\s+[a-zA-Z]\s+',' ', single_review)
    #substituting multiple spaces with single space
    single_review = re.sub('\s+',' ', single_review)
    #removing stop words#word_tokens = []
    word_tokens = word_tokenize(single_review)
    #lemmatization
    filtered_sentence = []
    #filtered_sentence.append([w for w in word_tokens if w not in stop_words])
    filtered_sentence2 = " ".join([w for w in word_tokens if w not in stop_words])
    #compile all the sentences to make a complete dictionary of processed reviews
    processed_review.append(filtered_sentence2)
print(processed_review[10])

Phil Alien one quirky films humour based around oddness everything rather actual punchlines At first odd pretty funny movie progressed find jokes oddness funny anymore Its low budget film thats never problem pretty interesting characters eventually lost interest imagine film would appeal stoner currently partaking For something similar better try Brother another planet


In [25]:
text_count_matrix2 = tfidf.fit_transform(processed_review)
X_train, X_test, Y_train, Y_test = train_test_split(text_count_matrix2, df.sentiment, test_size=0.30, random_state=2)
Y_train = (Y_train.replace({'positive': 1, 'negative': 0})).values
Y_test = (Y_test.replace({'positive': 1, 'negative': 0})).values
MNB.fit(X_train, Y_train)
#4. Evaluating the model
accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print('Accuracy after preprocessing using Naivebayes',str('{:04.2f}'.format(accuracy_score*100))+" %")
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report: \n", classification_report(Y_test, MNB.predict(X_test),target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, MNB.predict(X_test)))

Accuracy after preprocessing using Naivebayes 89.28 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.89      0.89      0.89      7499
    Positive       0.89      0.89      0.89      7501

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

Confusion Matrix: 
 [[6689  810]
 [ 798 6703]]


In [26]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
SGDC = SGDClassifier()
SGDC.fit(X_train, Y_train)
predict = SGDC.predict(X_test)
accuracy_score = metrics.accuracy_score(predict, Y_test)
print("Stocastic Gradient Classifier accuracy = " + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, predict))

Stocastic Gradient Classifier accuracy = 90.08 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.92      0.88      0.90      7499
    Positive       0.89      0.92      0.90      7501

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

Confusion Matrix: 
 [[6609  890]
 [ 598 6903]]


In [27]:
from sklearn.svm import LinearSVC
LSVC = LinearSVC()
LSVC.fit(X_train, Y_train)
accuracy_score = metrics.accuracy_score(LSVC.predict(X_test), Y_test)
print("Linear SVC accuracy = " + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, LSVC.predict(X_test),target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, LSVC.predict(X_test)))

Linear SVC accuracy = 91.13 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.92      0.90      0.91      7499
    Positive       0.90      0.92      0.91      7501

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000

Confusion Matrix: 
 [[6746  753]
 [ 578 6923]]


In [28]:
LR = LogisticRegression()
LR.fit(X_train, Y_train)
predict = LR.predict(X_test)
accuracy_score = metrics.accuracy_score(predict, Y_test)
print("Logistic Regression Accuracy = " + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, predict))

Logistic Regression Accuracy = 89.78 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.88      0.90      7499
    Positive       0.89      0.91      0.90      7501

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

Confusion Matrix: 
 [[6611  888]
 [ 645 6856]]


In [29]:
from sklearn.tree import DecisionTreeClassifier
clf_gini = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
predict = clf_gini.predict(X_test)
accuracy_score = metrics.accuracy_score(predict, Y_test)
print("Decision Tree Classifier Accuracy = " + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, predict))

Decision Tree Classifier Accuracy = 67.89 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.80      0.48      0.60      7499
    Positive       0.63      0.88      0.73      7501

    accuracy                           0.68     15000
   macro avg       0.71      0.68      0.67     15000
weighted avg       0.71      0.68      0.67     15000

Confusion Matrix: 
 [[3597 3902]
 [ 914 6587]]


In [30]:
from sklearn.ensemble import VotingClassifier
models = [('lr',LogisticRegression()),('MNB',MultinomialNB()),('SGDC',SGDClassifier()),('LSVC',LinearSVC())]
ensemble = VotingClassifier(estimators=models)
ensemble.fit(X_train, Y_train)
predict =ensemble.predict(X_test)
accuracy_score = metrics.accuracy_score(predict, Y_test)
print("Voting Classifier Accuracy=" + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, predict))

Voting Classifier Accuracy=90.55 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.90      0.90      7499
    Positive       0.90      0.91      0.91      7501

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000

Confusion Matrix: 
 [[6751  748]
 [ 670 6831]]


In [31]:
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 5, criterion="entropy")  
classifier.fit(X_train, Y_train)  
Y_pred= classifier.predict(X_test)  
accuracy_score = metrics.accuracy_score(predict, Y_test)
print("Random Forest Classifier Accuracy = " + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, predict))

Random Forest Classifier Accuracy = 90.55 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.90      0.90      7499
    Positive       0.90      0.91      0.91      7501

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000

Confusion Matrix: 
 [[6751  748]
 [ 670 6831]]


In [32]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train, Y_train)
Y_pred= model.predict(X_test)
accuracy_score = metrics.accuracy_score(predict, Y_test)
print("Gradient Boost Regressor Accuracy " + str('{:04.2f}'.format(accuracy_score*100))+" %")
print("Classification Report: \n", classification_report(Y_test, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(Y_test, predict))

Gradient Boost Regressor Accuracy 90.55 %
Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.90      0.90      7499
    Positive       0.90      0.91      0.91      7501

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000

Confusion Matrix: 
 [[6751  748]
 [ 670 6831]]
