In [3]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [4]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [5]:
df.shape

(50000, 2)

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
df.duplicated().sum()

418

In [10]:
df.drop_duplicates(inplace=True)  #removing duplicates

In [11]:
df.shape

(49582, 2)

### Basic Preprocessing


In [12]:
# Remove Html tags

import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [13]:
df['review'] = df['review'].apply(remove_tags)

In [14]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [15]:
# Lowercasing

df['review'] = df['review'].apply(lambda x:x.lower())

In [16]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [17]:
# Remove stopwords

from nltk.corpus import stopwords

sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [18]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
49995,thought movie right good job. creative origina...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,catholic taught parochial elementary schools n...,negative
49998,going disagree previous comment side maltin on...,negative


In [19]:
X = df.iloc[:,0:1]
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
49995,thought movie right good job. creative origina...
49996,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,catholic taught parochial elementary schools n...
49998,going disagree previous comment side maltin on...


In [20]:
y = df['sentiment']
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [21]:
# encoding the column 'y' using LabelEncoder()

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [22]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [24]:
X_train.shape

(39665, 1)

### 1) Using Bag of Words

In [25]:
# Applying BoW (Bag of Words)  : countVectorizer means BoW


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000) # # Only top 5000 words

In [26]:
# using CountVectorizer (BoW) to convert text into numbers, because ML models cannot read text

X_train_bow = cv.fit_transform(X_train['review']).toarray()  

# cv.fit() reads only the training data and does two things: Finds all unique words (vocabulary) and Assigns each word an integer index

# transform() converts text → numbers
# toarray() -> Convert sparse matrix to normal array


X_test_bow = cv.transform(X_test['review']).toarray()

In [27]:
X_train_bow.shape

(39665, 5000)

In [28]:
# Using Gaussian Naive Bayes model 

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [29]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.7591005344358173

In [30]:
## Using Random forest 

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8485429061208026

### 2) Using Tfidf

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidf = TfidfVectorizer(max_features=5000)

In [39]:
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])

In [40]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.8470303519209438

### 3) Using average word2vec

In [41]:
import gensim

In [42]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [43]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))



# simple_preprocess(sent) is a function from Gensim that:

# Lowercases text

# Removes punctuation

# Tokenizes into words

# Removes very short/long words
    

In [46]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [47]:
model.build_vocab(story)

In [48]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(29412869, 30767745)

In [54]:
# computing the average Word2Vec vector for a given document

# def document_vector(doc):
#     doc = [word for word in doc.split() if word in model.wv.index_to_key]  # keeps only words present in the Word2Vec vocabulary
#     return np.mean(model.wv[doc], axis=0) # Fetch Word2Vec vectors for each word and Compute average vector for the document


def document_vector(doc, embedding_size=100):   # embedding_size means vector size representing each words
    """
    Convert a document (string) into an average Word2Vec vector.
    If no words in the doc exist in the Word2Vec vocabulary,
    return a zero vector.
    """
    # Keep only words in Word2Vec vocabulary
    words = [word for word in doc.split() if word in model.wv.index_to_key]
    
    if len(words) == 0:
        # Return zero vector if no words found
        return np.zeros(embedding_size)
    
    # Average word vectors
    return np.mean(model.wv[words], axis=0)


In [55]:
document_vector(df['review'].values[0])  # average word2vec of first review.

array([ 0.10569246, -0.29917637,  0.22497748, -0.05106837,  0.5631405 ,
       -0.56254596,  0.05541799,  0.3742676 ,  0.5830009 , -0.19659692,
       -0.33777073, -0.16442202, -0.06648852,  0.30912852,  0.09876566,
        0.32028487, -0.02606952, -0.2680193 , -0.28195903, -0.39593816,
        0.16891989,  0.29039788,  0.09844672,  0.2678447 , -0.40196356,
        0.07350063, -0.34862605, -0.04920228, -0.37749872, -0.13915886,
        0.3684938 , -0.34655502, -0.07838881, -0.39667088,  0.21634685,
        0.472916  , -0.4461517 , -0.04992766,  0.24180292, -0.38746423,
       -0.2453066 , -0.07371633, -0.27450222, -0.4045297 ,  0.27622136,
       -0.15566418,  0.00200464, -0.1926327 , -0.17660911,  0.29355547,
        0.4278573 ,  0.09131151,  0.0489454 , -0.06064026, -0.28838915,
        0.4257367 , -0.06952546,  0.3317662 , -0.22169042,  0.04193464,
        0.02641035, -0.1549037 ,  0.06658787,  0.2775638 , -0.23560472,
        0.0969287 , -0.0178022 , -0.27895388,  0.06383897, -0.13

In [56]:
from tqdm import tqdm

# tqdm is just a progress bar for loops in Python. It does not affect the calculations, only shows how much of the loop has finished.

In [57]:
X = [] # here 'X' is the python list
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████| 49582/49582 [42:03<00:00, 19.65it/s] 


In [58]:
X = np.array(X)  # converting the python list i.e 'X' into array

In [64]:
X

array([[ 0.10569246, -0.29917637,  0.22497748, ..., -0.2465914 ,
        -0.33341751,  0.41430464],
       [ 0.24351856,  0.41437468, -0.2817516 , ...,  0.13972853,
         0.11136544,  0.77562273],
       [ 0.11079583, -0.24115168, -0.04286304, ..., -0.43066773,
        -0.14971153,  0.41568685],
       ...,
       [-0.03292271, -0.255492  ,  0.0800926 , ..., -0.61150509,
        -0.34861091,  0.26291674],
       [ 0.11890772,  0.18190975, -0.05160801, ..., -0.14937472,
        -0.14803517,  0.45671141],
       [ 0.03877564,  0.00887587,  0.08535957, ..., -0.2071242 ,
        -0.62956154,  0.18513174]])

In [59]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [60]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [61]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [63]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8191993546435414