In [1]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
#Extracting the data
imdb_data =pd.read_csv("/home/ken/Desktop/imdb.csv", engine='python')
print (imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [3]:
#Splitting the dataset
#train dataset
train_reviews = imdb_data.review[:40000]
train_sentiments = imdb_data.sentiment[:40000]

#test dataset
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]
print (train_reviews.shape, train_sentiments.shape)
print (test_reviews.shape, test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


In [5]:
#Cleaning the corpus
from bs4 import BeautifulSoup

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)

In [6]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

In [8]:
#Stemming the text
import nltk.data
from nltk.stem.porter import PorterStemmer

def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

In [6]:
import nltk.data 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

In [10]:
#for Tokenizing the text
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer =ToktokTokenizer()

#set stopwords to english
from nltk.corpus import stopwords


english_stop_words=stopwords.words('english')
print(english_stop_words)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in english_stop_words]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in english_stop_words]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
norm_train_reviews[0]

'one review ha mention watch 1 Oz episod youll hook right thi exactli happen meth first thing struck Oz wa brutal unflinch scene violenc set right word GO trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call OZ nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda Em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast Oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

In [12]:
#Normalized test reviews
norm_test_reviews=imdb_data.review[40000:]
norm_test_reviews[45005]

'read review watch thi piec cinemat garbag took least 2 page find somebodi els didnt think thi appallingli unfunni montag wasnt acm humour 70 inde ani era thi isnt least funni set sketch comedi ive ever seen itll till come along half skit alreadi done infinit better act monti python woodi allen wa say nice piec anim last 90 second highlight thi film would still get close sum mindless drivelridden thi wast 75 minut semin comedi onli world semin realli doe mean semen scatolog humour onli world scat actual fece precursor joke onli mean thi handbook comedi tit bum odd beaver niceif pubesc boy least one hand free havent found playboy exist give break becaus wa earli 70 way sketch comedi go back least ten year prior onli way could even forgiv thi film even made wa gunpoint retro hardli sketch clown subtli pervert children may cut edg circl could actual funni come realli quit sad kept go throughout entir 75 minut sheer belief may save genuin funni skit end gave film 1 becaus wa lower scoreand

In [13]:
#Convering the text document to numerical vectors
#or Bag of Words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names


BOW_cv_train: (40000, 6209089)
BOW_cv_test: (10000, 6209089)


In [14]:
#TfidVectorizer
#to convert raw document a to matrix
from sklearn.feature_extraction.text import TfidfVectorizer

Tf =TfidfVectorizer()
#transformed train reviews
tv_train_reviews =Tf.fit_transform(norm_train_reviews)
tv_test_reviews =Tf.transform(norm_test_reviews)
print ('Tfidf_train',tv_train_reviews.shape)
print ('Tfidf_test',tv_test_reviews.shape)

Tfidf_train (40000, 156136)
Tfidf_test (10000, 156136)


In [15]:
#Binarize the sentiment data
from sklearn.preprocessing import LabelBinarizer

le =LabelBinarizer()
sentiment_data =le.fit_transform(imdb_data['sentiment'])
print (sentiment_data.shape)

(50000, 1)


In [16]:
#Splitting the sentiment data
train_sentiments =sentiment_data[:40000]
test_sentiments =sentiment_data[40000:]
print (train_sentiments)
print (test_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [18]:
#training the model
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=500, C=0.05, random_state=42)
#Fitting the model for Bag of Words
lr.fit(cv_train_reviews, train_sentiments)
print ("Accuracy for bag of words",accuracy_score(test_sentiments, lr.predict(cv_test_reviews)))
       
lr.fit(tv_train_reviews, train_sentiments)
print ("Accuracy for tfidf", accuracy_score(test_sentiments, lr.predict(tv_test_reviews)))

Accuracy for bag of words 0.7268
Accuracy for tfidf 0.8587


In [22]:
#Second model
#Linear SVM-Classifier
from sklearn.svm import LinearSVC
svm =LinearSVC(C=0.01, random_state=42, max_iter=500)

#Fitting the model for bag of words
svm.fit(cv_train_reviews, train_sentiments)
print ("Accuracy for bag of words",accuracy_score(test_sentiments, svm.predict(cv_test_reviews)))

svm.fit(tv_train_reviews, train_sentiments)
print ("Accuracy for tfidf", accuracy_score(test_sentiments, svm.predict(tv_test_reviews)))

Accuracy for bag of words 0.7211
Accuracy for tfidf 0.8697
