In [1]:
#importing packages and libraries.
import nltk
import random
import collections
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import classify
from nltk.metrics import *
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


In [2]:
#importing and reading the data into a virtual spreadsheet.
#This code prints out a virtual spreadsheet showing the first 20 
#elements in the IMDB dataset.

import pandas as pd
imdb_data = pd.read_csv('IMDBDataset.csv')
print(imdb_data.shape)
imdb_data.head(20)



(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [4]:
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
#Incorporating instances into training and testing set.

#training for both review and sentiment
training_review = imdb_data.review[:40000]    #x_train
training_sents = imdb_data.sentiment[:40000]  #y_train

testing_review = imdb_data.review[40000:]     #x_test
testing_sents = imdb_data.sentiment[40000:]   #y_test

print(training_review.shape)
print(testing_sents.shape)

(40000,)
(10000,)


In [7]:
#Downloading 'movie_reviews' from the nltk collection
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/amad/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [8]:
#Downloading 'stopwords' from the nltk collection
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/amad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
#In the code below, data from 'review' is assigned to 'review_str'
#which splits the data in 'review' into a string.
review_str = str(imdb_data.review)
review_strSplit = review_str.split()
print(review_strSplit)

['0', 'One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', '...', '1', 'A', 'wonderful', 'little', 'production.', '<br', '/><br', '/>The...', '2', 'I', 'thought', 'this', 'was', 'a', 'wonderful', 'way', 'to', 'spend', 'ti...', '3', 'Basically', "there's", 'a', 'family', 'where', 'a', 'little', 'boy', '...', '4', 'Petter', "Mattei's", '"Love', 'in', 'the', 'Time', 'of', 'Money"', 'is...', '...', '49995', 'I', 'thought', 'this', 'movie', 'did', 'a', 'down', 'right', 'good', 'job...', '49996', 'Bad', 'plot,', 'bad', 'dialogue,', 'bad', 'acting,', 'idiotic', 'di...', '49997', 'I', 'am', 'a', 'Catholic', 'taught', 'in', 'parochial', 'elementary...', '49998', "I'm", 'going', 'to', 'have', 'to', 'disagree', 'with', 'the', 'previou...', '49999', 'No', 'one', 'expects', 'the', 'Star', 'Trek', 'movies', 'to', 'be', 'high...', 'Name:', 'review,', 'Length:', '50000,', 'dtype:', 'object']


In [10]:
#Printing Stopwords
stopwords_english = stopwords.words('english')
print(stopwords_english)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In the code below, text pre-processing is performed on the IMDB dataset. Where i started by creating lists in an array "for_duplicate(for duplicate words in the dataset), delete_list(for alphanumeric values and html tags), movie_review(a list which prints after text preprocessing has been implied on the imdb dataset, this prints in a tokenized form.)".

In [11]:
for_duplicate = []
delete_list = []
movie_review = []



for x in review_strSplit:
    if x.isnumeric() is True:
        delete_list.append(x)
    elif x.isalpha() is False:
        delete_list.append(x)
    elif x in stopwords_english:
        delete_list.append(x)
    elif x not in stopwords_english:
        for_duplicate.append(x.lower())
        
        
for y in for_dups:
    if y not in movie_review:
        movie_review.append(y)

print(movie_review)

['one', 'reviewers', 'mentioned', 'a', 'wonderful', 'little', 'i', 'thought', 'way', 'spend', 'basically', 'family', 'boy', 'petter', 'time', 'movie', 'right', 'good', 'bad', 'idiotic', 'catholic', 'taught', 'parochial', 'going', 'disagree', 'no', 'expects', 'star', 'trek', 'movies', 'object']


In [12]:
#preparing the training set where data is divided into 70%, 30%.
from sklearn.model_selection import train_test_split
training_review,testing_review,training_sents,testing_sents = train_test_split(imdb_data['review'], imdb_data['sentiment'],test_size =0.3)


In [13]:
#encoding to transform data type of string into a numerical value
from sklearn.preprocessing import LabelEncoder
imdb_encode = LabelEncoder()
training_sents = imdb_encode.fit_transform(training_sents)
testing_sents = imdb_encode.fit_transform(testing_sents)

In [14]:
#converting a collection of text documents into numerical vectors
from sklearn.feature_extraction.text import TfidfVectorizer
F_vector = TfidfVectorizer(max_features = 50)
F_vector.fit(imdb_data['review'])

training_review_vector = F_vector.transform(training_review)
testing_review_vector = F_vector.transform(testing_review)

print(F_vector.vocabulary_)


{'one': 32, 'of': 30, 'the': 38, 'has': 16, 'that': 37, 'just': 25, 'you': 49, 'be': 7, 'they': 40, 'are': 4, 'as': 5, 'this': 41, 'is': 23, 'what': 45, 'with': 48, 'br': 8, 'about': 0, 'was': 44, 'and': 3, 'in': 22, 'from': 14, 'not': 29, 'for': 13, 'or': 33, 'to': 42, 'it': 24, 'on': 31, 'an': 2, 'all': 1, 'have': 17, 'so': 35, 'more': 27, 'but': 9, 'who': 47, 'out': 34, 'if': 21, 'can': 11, 'very': 43, 'he': 18, 'by': 10, 'his': 20, 'some': 36, 'when': 46, 'at': 6, 'her': 19, 'there': 39, 'movie': 28, 'film': 12, 'like': 26, 'good': 15}


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression()
lr.fit(training_review_vector,training_sents)

prediction = lr.predict(testing_review_vector)

#accuracy
print("accuracy score of logistic regression = ", accuracy_score(prediction,testing_sents)*100)


accuracy score of logistic regression =  65.9


In [17]:
from sklearn.metrics import classification_report
print(classification_report(testing_sents,prediction))
print(accuracy_score(testing_sents,prediction))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      7502
           1       0.66      0.67      0.66      7498

    accuracy                           0.66     15000
   macro avg       0.66      0.66      0.66     15000
weighted avg       0.66      0.66      0.66     15000

0.6604666666666666


In [16]:
#building classifier with Support Vector Machine SVM
from sklearn import model_selection, svm
from sklearn.metrics import classification_report
svm = svm.SVC(kernel = 'linear')
svm.fit(training_review_vector,training_sents)

prediction = svm.predict(testing_review_vector) #predicting labels on validation dataset

#accuracy
print("accuracy score of svm = ", accuracy_score(prediction,testing_sents)*100)

accuracy score of svm =  66.04666666666667


In [18]:
from sklearn.metrics import classification_report
print(classification_report(testing_sents,prediction))
print(accuracy_score(testing_sents,prediction))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      7502
           1       0.66      0.67      0.66      7498

    accuracy                           0.66     15000
   macro avg       0.66      0.66      0.66     15000
weighted avg       0.66      0.66      0.66     15000

0.6604666666666666
