In [1]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#reading data from csv file
import pandas as pd
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,One of the other reviewers has mentioned that ...,positive
1,1,A wonderful little production. <br /><br />The...,positive
2,2,I thought this was a wonderful way to spend ti...,positive
3,3,Basically there's a family where a little boy ...,negative
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
#selecting first 5000 reviews only
df2 = df.copy()

In [4]:
#importing required libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
#initializing word net lemmatizer
lemmatizer = WordNetLemmatizer()

In [6]:
#data preprocessing
corpus = []
for i in range(len(df2['review'])):
    review = re.sub('[^a-zA-Z]',' ', df2['review'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words("english")]
    review = ' '.join(review)
    corpus.append(review)

In [7]:
#checking length of corpus
len(corpus)

5000

In [8]:
#importing and initializing count vectorizer for bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features = 2500, ngram_range = (1,2), binary = True)

In [9]:
#separating input and output data
X = corpus
y = df2['sentiment'].map({'positive':1, 'negative':0})

In [10]:
#splitting data into train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4)

In [11]:
X_train

['shall begin disclaimer movie recommended anyone lack interest never played ff game watching movie relies audience knowledge character game convey story plot element subtly homework watching wonderful piece cg film promise much better br br mind film spectacular cg sequence ever witnessed whole experience felt like extra long fmv sequence game steroid yeah attention detail scene especially heavy action oriented one impeccable left sense awe br br believe soundtrack simplified help audience focus animation quality music familiar ff story background music surprise anyone although timing placement soundtrack original accompany scene mood point music simply enhances animation br br played ff thoroughly enjoyed piece art square enix feel scene choreographed organized like dance br br short enjoyed music game final fantasy film blow water unfortunate majority experienced goodness known ff playstation pc watching movie allow exponentially greater experience br br finally want make note quali

In [12]:
#transforming X_train and X_test into bag of words
X_train_c = cv.fit_transform(X_train)
X_test_c = cv.transform(X_test)

In [13]:
X_train_c.shape
X_test_c.shape

(1000, 2500)

In [14]:
#importing initializing and fitting MultinomialNB to training data
from sklearn.naive_bayes import MultinomialNB
mod = MultinomialNB().fit(X_train_c.toarray(), y_train)

In [15]:
#making predictions and checking accuracy
from sklearn.metrics import accuracy_score, classification_report
y_preds = mod.predict(X_test_c)
accuracy = round(accuracy_score(y_test, y_preds),2)
accuracy

0.86

In [16]:
#printing classification report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       502
           1       0.86      0.87      0.86       498

    accuracy                           0.86      1000
   macro avg       0.86      0.86      0.86      1000
weighted avg       0.86      0.86      0.86      1000



In [17]:
#implementing tfidf vectorizer
tf = TfidfVectorizer(max_features = 2500, ngram_range = (1,2), binary = True)
X_train_t = tf.fit_transform(X_train)
X_test_t = tf.transform(X_test)

In [18]:
#fitting MultinomialNB on training data
mod2 = MultinomialNB().fit(X_train_t, y_train)

In [19]:
#making predictions on test data
y_preds2 = mod2.predict(X_test_t)

In [20]:
#checking accuracy 
accuracy = round(accuracy_score(y_test, y_preds2),2)
accuracy

0.86

In [21]:
#classification report
print(classification_report(y_test, y_preds2))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       502
           1       0.86      0.86      0.86       498

    accuracy                           0.86      1000
   macro avg       0.86      0.86      0.86      1000
weighted avg       0.86      0.86      0.86      1000



### Word2Vec

In [22]:
#importing required libraries
import gensim 
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [23]:
#creating list of words
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for w in sent_token:
        words.append(simple_preprocess(w))
    

In [24]:
#fitting word2vec on list of words
from gensim.models import Word2Vec
model = Word2Vec(words, window = 10, min_count = 2)

In [25]:
#all words in out model
model.wv.index_to_key

['br',
 'movie',
 'film',
 'one',
 'like',
 'time',
 'good',
 'character',
 'get',
 'story',
 'would',
 'even',
 'make',
 'see',
 'really',
 'well',
 'scene',
 'much',
 'bad',
 'first',
 'people',
 'also',
 'way',
 'great',
 'show',
 'made',
 'thing',
 'life',
 'go',
 'could',
 'think',
 'watch',
 'know',
 'plot',
 'never',
 'actor',
 'look',
 'little',
 'seen',
 'love',
 'many',
 'two',
 'year',
 'say',
 'acting',
 'best',
 'end',
 'ever',
 'come',
 'take',
 'man',
 'work',
 'better',
 'still',
 'something',
 'want',
 'part',
 'lot',
 'real',
 'director',
 'back',
 'watching',
 'performance',
 'give',
 'find',
 'woman',
 'though',
 'going',
 'old',
 'new',
 'guy',
 'play',
 'funny',
 'nothing',
 'girl',
 'day',
 'actually',
 'role',
 'another',
 'feel',
 'horror',
 'every',
 'point',
 'minute',
 'pretty',
 'star',
 'world',
 'around',
 'quite',
 'comedy',
 'fact',
 'thought',
 'young',
 'enough',
 'must',
 'big',
 'cast',
 'got',
 'however',
 'long',
 'line',
 'family',
 'action',
 's

In [26]:
#count of all words
model.corpus_count

5000

In [27]:
#most similar words
model.wv.similar_by_word('scene')

[('sequence', 0.84124356508255),
 ('action', 0.8256639838218689),
 ('rather', 0.8129609823226929),
 ('twist', 0.8051806092262268),
 ('shot', 0.8049049377441406),
 ('cadaver', 0.8005093336105347),
 ('injecting', 0.7940499782562256),
 ('kahn', 0.7916387319564819),
 ('plot', 0.7885647416114807),
 ('dialogue', 0.7862266302108765)]

In [28]:
#function for average word2vec
import numpy as np
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis = 0)

In [29]:
#applying average word2 vec on data
from tqdm import tqdm
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:34<00:00, 146.58it/s]


In [30]:
#creating numpy array of input data
X_new = np.array(X)

In [31]:
#checking shape
X_new.shape

(5000, 100)

In [32]:
#splitting data into train and test data
X_trainw, X_testw, y_trainw, y_testw = train_test_split(X_new, y, test_size = 0.2, random_state = 3)

In [33]:
#fitting random forest classifier on training data
from sklearn.ensemble import RandomForestClassifier
mod3 = RandomForestClassifier().fit(X_trainw, y_trainw)

In [34]:
#making predictions on test data
preds = mod3.predict(X_testw)

In [35]:
#accuracy on test data
round(accuracy_score(y_testw, preds),2)

0.77

In [36]:
#classification report
print(classification_report(y_testw, preds))

              precision    recall  f1-score   support

           0       0.75      0.78      0.76       477
           1       0.79      0.76      0.78       523

    accuracy                           0.77      1000
   macro avg       0.77      0.77      0.77      1000
weighted avg       0.77      0.77      0.77      1000



In [37]:
#importing and initializing knn classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [38]:
#fitting knn model on training data
knn.fit(X_trainw, y_trainw)

KNeighborsClassifier()

In [39]:
#making predictions on test data
knn_preds = knn.predict(X_testw)

In [40]:
#accuracy
round(accuracy_score(y_testw, knn_preds),2)

0.68

In [41]:
#classification report
print(classification_report(y_testw, knn_preds))

              precision    recall  f1-score   support

           0       0.65      0.72      0.69       477
           1       0.72      0.65      0.68       523

    accuracy                           0.68      1000
   macro avg       0.69      0.69      0.68      1000
weighted avg       0.69      0.68      0.68      1000

