In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_temp = pd.read_csv('IMDB Dataset.csv')  # Reading file

In [3]:
df = df_temp[0:15000]  # with 50000 records we we having resource limitations hence taken only 15000 rows

In [4]:
df.shape    # data set has 50,000 review

(15000, 2)

In [5]:
df.head()  # top 5 records

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Text Preprocessing

In [6]:
df.review[1]  # here we can see our data is not processed ex. html tags are there, special character, case of words not uniform

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [7]:
df['sentiment'].value_counts() # Which shows data is approximately balanced

sentiment
negative    7609
positive    7391
Name: count, dtype: int64

In [8]:
df.isnull().sum()  # Here we can see data does not have null values

review       0
sentiment    0
dtype: int64

In [9]:
df.duplicated().sum()  # there are some duplicate records which we can drop

39

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.duplicated().sum()  # check to see whether duplicate data is dropped or not

0

# Basic Text Preprocessing  

In [12]:
 # Remove tags : will remove html tags from each record using python regex
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [13]:
df['review'] = df['review'].apply(remove_tags)  # applying  remove tags functions to dataset /  each X variable record

In [14]:
df.review[1]  # we can see html tags has been removed

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [15]:
# lowercase : function will lowercase all the words
df['review'] = df['review'].apply(lambda x:x.lower())   

In [16]:
import nltk
nltk.download('stopwords')  #importing the stopwords for a specific language i.e english in our case

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# remove stopwords
from nltk.corpus import stopwords
stop_list = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in stop_list]).apply(lambda x:" ".join(x))

In [18]:
# remove all special charcter 
import re
def remove_special_characters(text):
    # Defining a regular expression pattern to match special characters
    pattern = r'[^a-zA-Z0-9\s]'  # Removes anything that is not a letter, digit, or whitespace

    # Use sub() function will replace matches with an empty string
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text

In [19]:
df.review = df.review.apply(remove_special_characters) # apply function on dataframe

In [20]:
df.review[1]  # hence now we are good to go ahead from text preprocessing 

'wonderful little production filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen michael sheen has got polari voices pat too truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life realism really comes home little things fantasy guard which rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwells murals decorating every surface terribly well done'

In [21]:
x = df.iloc[:,0:1]   # selecting independent variable 
y = df['sentiment']  # selecting dependent variable i.e, target column

In [22]:
from sklearn.preprocessing import LabelEncoder  # as our taget column is nominal categorical we can use Label encoder
le = LabelEncoder()
y = le.fit_transform(y)

In [23]:
le.classes_   # Negative = 0 , Positive = 1

array(['negative', 'positive'], dtype=object)

# Random sampling 

In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=7) 

In [30]:
x_train.shape

(11968, 1)

In [31]:
# Applying Bag of words using Countvectorizer function of sklearn
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()   # object of countvectorizer

In [32]:
x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [33]:
x_train_bow.shape  # for each record we have 92130 columns or feature using BOW technique

(11968, 92130)

# Modeling 

In [34]:
# Model 1
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train_bow,y_train)

In [35]:
# Prediction on testing data
y_pred = gnb.predict(x_test_bow)

In [36]:
#Model evaluation
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.6528566655529568

In [37]:
confusion_matrix(y_test,y_pred)

array([[1051,  425],
       [ 614,  903]], dtype=int64)

In [38]:
# Model 2 
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier()
rf1.fit(x_train_bow,y_train)

In [39]:
y_pred = rf1.predict(x_test_bow)
accuracy_score(y_test,y_pred)  # rf1 accuracy

0.8426328098897428

In [42]:
# rebuild count vectorizer object with bigram and reducing features as features will increase drastically hence computation limit
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)  # outof 92130 earlier we are only considering 5000 features and rebuid model
x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [43]:
#model train 
rf2 = RandomForestClassifier()
rf2.fit(x_train_bow,y_train)

In [44]:
y_pred = rf2.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.8329435349148012

In [45]:
# Here we can see random forest RF1 model giving highest accuracy hence we can proceed with this model
import pickle
with open('bow_text_classifier.pkl','wb') as file:
    pickle.dump(rf1,file)