In [1]:
# Import libraries

import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import csv
import os

In [2]:
#set base directory for dataset
base_dir = 'imdb_dataset/'

# Training dataset directories
train_path = os.path.join(base_dir,'train')
train_neg_path = os.path.join(train_path,'neg')
train_pos_path = os.path.join(train_path,'pos')

# Testing dataset directories
test_path = os.path.join(base_dir,'test')
test_neg_path = os.path.join(test_path,'neg')
test_pos_path = os.path.join(test_path, 'pos')

In [3]:
"""# Get size of negative & positive training data
# Negative reviews
path, dirs, files = next(os.walk(train_neg_path))
neg_file_count = len(files)
# Positive reviews
path, dirs, files = next(os.walk(train_pos_path))
pos_file_count = len(files)

neg_file_count, pos_file_count"""

'# Get size of negative & positive training data\n# Negative reviews\npath, dirs, files = next(os.walk(train_neg_path))\nneg_file_count = len(files)\n# Positive reviews\npath, dirs, files = next(os.walk(train_pos_path))\npos_file_count = len(files)\n\nneg_file_count, pos_file_count'

### IMPORT DATA

In [4]:
"""
imdb_data_preprocessing : Method to club negative and positive movie reviews and store them in the required format for further processing
    _dir : directory of the dataset to be processed
"""

def imdb_data_preprocessing(_dir, new_filename):
    indices = []
    text = []
    rating = [] # 0 - negative, 1 - positive
    
    i = 0
    
    # Positive movie reviews
    for filename in os.listdir(_dir+'/pos'):
        data = open(_dir+'/pos/'+filename, 'r', encoding="ISO-8859-1").read()
        indices.append(i)  
        text.append(data)
        rating.append(1)
        i = i+1
        
    # Negative movie reviews
    for filename in os.listdir(_dir+'/neg'):
        data = open(_dir+'/neg/'+filename, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(0)
        i = i+1
        
    # Creating a dataset and storing it in the provided directory
    Dataset = list(zip(indices, text,rating))
    np.random.shuffle(Dataset)     # Shuffling dataset for the the better accuracy
    df = pd.DataFrame(data=Dataset, columns = ['id', 'text', 'remark'])
    df.to_csv(_dir+'/'+new_filename, index=False, header = True)
    
    pass
    

#### Train Dataset

In [5]:
# Create training dataset by combining negative and positive reviews
train_dataset_filename = 'train_dataset.csv'
imdb_data_preprocessing(_dir=train_path, new_filename =train_dataset_filename )

In [6]:
# Import training dataset
training_data = pd.read_csv(os.path.join(train_path,train_dataset_filename))
training_data.head()

Unnamed: 0,id,text,remark
0,17076,What an insult to Olivia D'Abo who plays the f...,0
1,1507,"""Citizen X"" is the superbly told true story of...",1
2,6105,If the themes of The Girl From Missouri sound ...,1
3,20805,I doubt this will ever even be a cult film. I ...,0
4,8628,hello. i just watched this movie earlier today...,1


In [7]:
# Get the shape of the sample size available for training
training_data.shape

(25000, 3)

#### Test Dataset

In [8]:
# Create testing dataset by combining negative and positive reviews
test_dataset_filename = 'test_dataset.csv'
imdb_data_preprocessing(_dir=test_path,new_filename=test_dataset_filename)

In [9]:
# Import testing dataset
testing_data = pd.read_csv(os.path.join(test_path, test_dataset_filename))
testing_data.head()

Unnamed: 0,id,text,remark
0,10332,Just after having moved into his new cottage i...,1
1,4345,"Mankind's Self awakening is the theme of ""2001...",1
2,19717,Christopher Lambert attracted me to this movie...,0
3,8473,"One of the best romantic classic,teen deviyaan...",1
4,1372,Don't mind what this socially retarded person ...,1


In [10]:
# Get the shape of the testing data sample size
testing_data.shape

(25000, 3)

In [11]:
# Lets divide the traing and testing datasets into features and target (review) points
X_train, y_train = training_data['text'].values, training_data['remark'].values
X_test, y_test = testing_data['text'].values, testing_data['remark'].values

In [12]:
X_train[:1]

array(["What an insult to Olivia D'Abo who plays the film's heroine, Robin, to have Keanu Reeves appear so large on the box art of the film (and at least on recent reissues, to have only Reeves appear on the box), considering that she was the star. I realize that it is his name that will ultimately sell this long-forgotten After School Special, but at least give the woman some credit. <br /><br />Despite that, this has to be one of the worst teen sports-themed films that I have ever seen, and it strives very hard to add not only every teen and sports movie clichÃ© from the class warfare between the feuding gymnasts to the teen romance. And, in striving to somehow deliver itself as an amateur alternative of Flashdance (with the music in one of the warehouse dance scenes is even quite close to Michael Sembello's notable 'Maniac' which was made famous by Flashdance, or was it the other way around?). It includes similar dance sequences and worse yet, even the 80s dance and sports tradition

### TEXT PREPROCESSING

In [13]:
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# REGEXs to remove unwanted patterns from the text
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [15]:
def text_preprocessing(text):
    
    # convert text string to lower case string
    text = text.lower()
    # replace symbols with a space in text
    text = re.sub(REPLACE_BY_SPACE_RE," ", text)
    # replace unwanted symbols from text
    text = re.sub(BAD_SYMBOLS_RE,'', text)
    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    
    return text

In [16]:
# Process Training data text
X_train = [text_preprocessing(x) for x in X_train]

# Process Testing data text
X_test = [text_preprocessing(x) for x in X_test]

In [17]:
X_train[:1]

['insult olivia dabo plays films heroine robin keanu reeves appear large box art film least recent reissues reeves appear box considering star realize name ultimately sell longforgotten school special least give woman credit br br despite one worst teen sportsthemed films ever seen strives hard add every teen sports movie clich class warfare feuding gymnasts teen romance striving somehow deliver amateur alternative flashdance music one warehouse dance scenes even quite close michael sembellos notable maniac made famous flashdance way around includes similar dance sequences worse yet even 80s dance sports traditions corny danceoffs heroine antagonist one doubts successes abilities team saw trashin vert ramp joust rad bmx dancing prom although wasnt much competition rather fun example fact movie chock full unrealistic corniness somewhat homoerotic rolling clothes salvation army robin friend teambr br nonetheless film young girl comes rather poor background top massive need squeeze audien

##### Calculate frequency of words in text

In [18]:
def words_freq_counts(text):
    words_freq = {}
    for line in text:
        for word in line.split():
            if word not in words_freq:
                words_freq[word] = 1
            else:
                words_freq[word] = words_freq[word] + 1
    return words_freq

In [19]:
# training data words (term) frequency
words_counts = words_freq_counts(X_train)

In [20]:
# size of unique words in the training dataset
len(words_counts)

113007

In [21]:
# most common terms in the copora
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
most_common_words

[('br', 59067),
 ('movie', 41941),
 ('film', 37559),
 ('one', 25561),
 ('like', 19690),
 ('good', 14615),
 ('even', 12537),
 ('would', 12141),
 ('time', 11840),
 ('really', 11673)]

### BAG of WORDS Approach

In [22]:
DICT_SIZE = 200000
WORDS_TO_INDEX = {b[0]:a for a,b in enumerate(sorted(words_counts.items(),key = lambda x: x[1], reverse= True))}
INDEX_TO_WORDS = {b:a for a,b in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [23]:
"""
BAG_OF_WORDS creates a vector of tokens of the string
    Inputs:
        >> text: text to be vectorized
        >> words_to_index: list of indices of words
        dict_size: length of the words count
"""
def bag_of_words(text, words_to_index, dict_size):
    # create a zero vector equaling the size of words list
    result_vector = np.zeros(dict_size)
    for word in set(text.split()):
        if word in words_to_index:
            result_vector[words_to_index[word]] = 1
            
    return result_vector

In [24]:
# Apply the BOW approach to train and test datasets

In [25]:
from scipy import sparse as sp_sparse

In [26]:
X_train_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])

print('X_train shape ', X_train_bow.shape)
print('X_test shape ', X_test_bow.shape)

X_train shape  (25000, 200000)
X_test shape  (25000, 200000)


In [27]:
# lets calculate number of non-zero elements in a given row

row_ = X_train_bow[9].toarray()[0]

non_zero_elements = np.nonzero(row_)

non_zero_elements_count = len(non_zero_elements[0])

non_zero_elements_count

54

### TF-IDF Approach

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
def tf_idf_features(X_train,X_test):
    
    tf_idf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,ngram_range=(1,2), token_pattern='(\S+)')
    features = tf_idf_vectorizer.fit(X_train)
    
    X_train = features.transform(X_train)
    X_test = features.transform(X_test)
    
    return X_train, X_test, tf_idf_vectorizer.vocabulary_
    

In [30]:
X_train_tfidf, X_test_tfidf, tfidf_vocab = tf_idf_features(X_train,  X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

### Prediction 

In [39]:
# We will be using Stochaistic stochastic gradient descent classifier
# We don't need to process out target data (y_train and y_test) as it is already in binary form

In [32]:
from sklearn.linear_model import SGDClassifier 

#### BOW apporach

In [33]:
classifier_bow = SGDClassifier(loss="hinge", penalty="l1", n_iter=20)
classifier_bow.fit(X_train_bow, y_train)
y_pred_bow = classifier_bow.predict(X_test_bow)



#### TF_IDF approach

In [34]:
classifier_tfidf = SGDClassifier(loss="hinge", penalty="l1", n_iter=20)
classifier_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf)



### Evaluation

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
def accuracy_scores(y_, predicted):
    print('Accuracy Score: {}%'.format((accuracy_score(y_, predicted, normalize = False)*100)/len(y_)))

In [37]:
print('Bag-of-words')
accuracy_scores(y_test, y_pred_bow.round())

Bag-of-words
Accuracy Score: 84.928%


In [38]:
print('TF-IDF')
accuracy_scores(y_test, y_pred_tfidf.round())

TF-IDF
Accuracy Score: 87.58%
