In [1]:
# importing dataset collection libraries
import pandas as pd
import numpy as np
import glob, os, string, re

# importing Natural Language Processing Libraries
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer

# importing Evaluation Metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# importing classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# importing dataset
from os import listdir
from os.path import isfile, join

# os.listdir returns entries in arbitrary order, so sort the names to keep the
# row order of the resulting DataFrame reproducible across machines
onlyfiles = sorted(f for f in listdir("aclImdb/train/pos") if isfile(join("aclImdb/train/pos", f)))

In [3]:
# sanity check: how many positive training files were found
len(onlyfiles)

12500

In [4]:
#extracting names
import os
# file name without its extension, one entry per file in `onlyfiles`
# (the earlier loop also called os.path.splitext once per file and threw the result away)
name = [os.path.splitext(j)[0] for j in onlyfiles]
    

In [5]:
# one stem per file, so this should equal len(onlyfiles)
len(name)

12500

In [6]:
# split every name at its LAST underscore: the left part is kept as the movie id,
# the right part as the rating (both remain strings)
# NOTE(review): movieId and rating are not read again in the visible cells - confirm they are still needed
movieId, rating = [], []
for stem in name:
    parts = stem.rsplit("_", 1)
    movieId.append(parts[0])
    rating.append(parts[1])

In [7]:
#reading all the reviews for the dataframe
review = []
for path in onlyfiles:
    # the with-block closes each file right after it is read; a bare open()
    # would leave one file handle open per review
    with open("aclImdb/train/pos/" + path, encoding="utf8") as txt:
        review.append(txt.read())

In [8]:
#creating dataframe
# "target" is passed as the scalar 1 (positive label); the resulting frame
# carries that value on every review row
movie_dict = {"review": review, "target":1}
movie_df = pd.DataFrame(data = movie_dict)

In [9]:
# preview the first rows of the positive-review frame
movie_df.head()

Unnamed: 0,review,target
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [9]:
#doing same for negatives
from os import listdir
from os.path import isfile, join

# os.listdir returns entries in arbitrary order, so sort the names to keep the
# row order of the resulting DataFrame reproducible across machines
onlyfiles = sorted(f for f in listdir("aclImdb/train/neg") if isfile(join("aclImdb/train/neg", f)))

In [10]:
import os
# file name without its extension, one entry per negative-review file
# (the earlier loop also called os.path.splitext once per file and threw the result away)
# NOTE(review): `name` is not read again in the visible cells after this point - confirm it is still needed
name = [os.path.splitext(j)[0] for j in onlyfiles]
    

In [11]:
# read the text of every negative training review
review = []
for path in onlyfiles:
    # the with-block closes each file right after it is read; a bare open()
    # would leave one file handle open per review
    with open("aclImdb/train/neg/" + path, encoding="utf8") as txt:
        review.append(txt.read())

In [12]:
# same layout as the positive frame, but labelled with target 0 (negative)
movie_dict = {"review": review, "target":0}
movie_df_neg = pd.DataFrame(data = movie_dict)

In [13]:
# preview the first rows of the negative-review frame
movie_df_neg.head()

Unnamed: 0,review,target
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [14]:
#merging both datasets
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. Negatives come first, positives after, and
# ignore_index=True renumbers the rows from 0.
# NOTE: this cell rebinds movie_df, so running it a second time stacks the
# negatives onto the already merged frame.
movie_df = pd.concat([movie_df_neg, movie_df], ignore_index=True)

In [15]:
# the merged frame starts with the negative (target 0) reviews
movie_df.head()


Unnamed: 0,review,target
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [16]:
# ...and ends with the positive (target 1) reviews
movie_df.tail()

Unnamed: 0,review,target
24995,"Seeing as the vote average was pretty low, and...",1
24996,"The plot had some wretched, unbelievable twist...",1
24997,I am amazed at how this movie(and most others ...,1
24998,A Christmas Together actually came before my t...,1
24999,Working-class romantic drama from director Mar...,1


In [17]:
 # download the 'stopwords' and 'wordnet' NLTK data packages
 # (packages that are already present are reported as up-to-date)
 import nltk
 nltk.download('stopwords')
 nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\averd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\averd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
#data preprocessing function :
# module-level helpers used by text_prep: a WordNet lemmatizer and the English
# stop-word list, held in a set for fast membership tests
lemma = WordNetLemmatizer()
stops = set(stopwords.words('english'))

def text_prep(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: delete every character listed in ``string.punctuation``,
    lower-case the result, split on whitespace, discard tokens found in
    ``stops`` and lemmatize the remaining tokens as verbs (``pos='v'``) with
    ``lemma``.

    NOTE(review): punctuation is removed before the stop-word test, so a
    contraction reaches the test without its apostrophe (e.g. "couldnt") and
    is only filtered if ``stops`` holds that exact spelling - confirm this is
    intended.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # translation table that maps every punctuation character to "deleted"
    strip_punct = str.maketrans("", "", string.punctuation)
    tokens = text.translate(strip_punct).lower().split()
    kept = [lemma.lemmatize(token, pos='v') for token in tokens if token not in stops]
    return " ".join(kept)

In [19]:
#preprocessing training data
# text_prep takes a single review string, so it can be handed to apply directly
movie_df['prep_review'] = movie_df['review'].apply(text_prep)
movie_df[['prep_review', 'target']].head()

Unnamed: 0,prep_review,target
0,story man unnatural feel pig start open scene ...,0
1,airport 77 start brand new luxury 747 plane lo...,0
2,film lack something couldnt put finger first c...,0
3,sorry everyone know suppose art film wow hand ...,0
4,little parent take along theater see interiors...,0


In [20]:
#doing it for testing data now
# sorted: os.listdir order is arbitrary, and a fixed order keeps the rows reproducible
onlyfiles = sorted(f for f in listdir("aclImdb/test/pos") if isfile(join("aclImdb/test/pos", f)))
review = []
for path in onlyfiles:
    # the with-block closes each file right after it is read; a bare open()
    # would leave one file handle open per review
    with open("aclImdb/test/pos/" + path, encoding="utf8") as txt:
        review.append(txt.read())

# positive test reviews, labelled with target 1
movie_dict = {"review": review, "target":1}
movie_df_test = pd.DataFrame(data = movie_dict)

movie_df_test


Unnamed: 0,review,target
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
12495,I was extraordinarily impressed by this film. ...,1
12496,"Although I'm not a golf fan, I attended a snea...",1
12497,"From the start of ""The Edge Of Love"", the view...",1
12498,"This movie, with all its complexity and subtle...",1


In [21]:
# negative test reviews
# sorted: os.listdir order is arbitrary, and a fixed order keeps the rows reproducible
onlyfiles = sorted(f for f in listdir("aclImdb/test/neg") if isfile(join("aclImdb/test/neg", f)))
review = []
for path in onlyfiles:
    # the with-block closes each file right after it is read; a bare open()
    # would leave one file handle open per review
    with open("aclImdb/test/neg/" + path, encoding="utf8") as txt:
        review.append(txt.read())

# labelled with target 0
movie_dict = {"review": review, "target":0}
movie_df_test_neg = pd.DataFrame(data = movie_dict)

movie_df_test_neg

Unnamed: 0,review,target
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0
...,...,...
12495,I occasionally let my kids watch this garbage ...,0
12496,When all we have anymore is pretty much realit...,0
12497,The basic genre is a thriller intercut with an...,0
12498,Four things intrigued me as to this film - fir...,0


In [22]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. Negatives first, positives after, rows renumbered from 0.
# NOTE: this cell rebinds movie_df_test, so running it a second time stacks the
# negatives onto the already merged frame.
movie_df_test = pd.concat([movie_df_test_neg, movie_df_test], ignore_index=True)

In [23]:
# the merged test frame starts with the negative (target 0) reviews
movie_df_test.head()


Unnamed: 0,review,target
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [24]:
# ...and ends with the positive (target 1) reviews
movie_df_test.tail()

Unnamed: 0,review,target
24995,I was extraordinarily impressed by this film. ...,1
24996,"Although I'm not a golf fan, I attended a snea...",1
24997,"From the start of ""The Edge Of Love"", the view...",1
24998,"This movie, with all its complexity and subtle...",1
24999,I've seen this story before but my kids haven'...,1


In [25]:
# apply the same text_prep cleaning to the test reviews
movie_df_test['prep_review'] = movie_df_test['review'].apply(text_prep)
movie_df_test[['prep_review', 'target']].head()

Unnamed: 0,prep_review,target
0,mr costner drag movie far longer necessary asi...,0
1,example majority action film generic bore ther...,0
2,first hate moronic rappers couldnt act gun pre...,0
3,even beatles could write songs everyone like a...,0
4,brass picture movies fit word really somewhat ...,0


In [26]:
#now vectorizing train data
# fit_transform fits the vectorizer on the preprocessed training reviews and
# returns their TF-IDF matrix in one step
tfidf = TfidfVectorizer()

x_train = tfidf.fit_transform(movie_df['prep_review'])
y_train = movie_df['target']

In [27]:
#vectorizing test data
# transform (not fit_transform): reuse the vectorizer already fitted on the
# training reviews so train and test share the same feature columns
x_test = tfidf.transform(movie_df_test['prep_review'])
y_test = movie_df_test['target']

In [28]:
# now training prediction classifiers
# Logistic Regression
# Linear SVC Classifier
# Ada Boost Classifier
# Naive Bayes Classifier
# Random Forest Classifier
# sanity check first: both matrices should have the same number of columns
print(x_train.shape, x_test.shape)

(25000, 109496) (25000, 109496)


In [29]:
#function to fit various classification models
def model(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training data and evaluate it on the test data.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit``, ``predict`` and ``score``; it is fitted
        in place.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Test features and labels.

    Returns
    -------
    list
        ``[train_score, test_accuracy, report, matrix]`` where ``train_score``
        is ``classifier.score(x_train, y_train)`` and the other three are
        ``accuracy_score``, ``classification_report`` and ``confusion_matrix``
        computed on the test predictions.
    """
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    train_score = classifier.score(x_train, y_train)
    test_accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    return [train_score, test_accuracy, report, matrix]

In [30]:

# classifiers to compare, listed in the same order as their display names
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    AdaBoostClassifier(n_estimators=100),
    MultinomialNB(),
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
]
names = ["Logistic Regression", "SVM", "AdaBoost", "Naive Bayes", "Random Forests"]

# fit and evaluate each classifier in turn, printing its metrics under a banner
for label, clf in zip(names, classifiers):
    score, accuracy, report, matrix = model(clf, x_train, y_train, x_test, y_test)
    print('*****************************', label, '********************************')
    print("Classifier Score = ",  score)
    print("Accuracy score = ", accuracy)
    print("Classification Report \n  ")
    print(report)
    print("Confusion Matrix \n ")
    print(matrix)
    print('*************************************************************************** \n \n')

***************************** Logistic Regression ********************************
Classifier Score =  0.93488
Accuracy score =  0.88124
Classification Report 
  
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Confusion Matrix 
 
[[10987  1513]
 [ 1456 11044]]
*************************************************************************** 
 

***************************** SVM ********************************
Classifier Score =  0.9926
Accuracy score =  0.871
Classification Report 
  
              precision    recall  f1-score   support

           0       0.86      0.88      0.87     12500
           1       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg     