In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import re

In [2]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup 
import nltk
nltk.download('punkt') 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#importing training dataset
train_dataset=pd.read_csv( "labeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )

In [4]:
train_dataset.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
train_dataset.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [6]:
#removing html tags
#removing numbers
#words from upper case to lower case and split them
#removing stopwords
def clean_review_method(review):
    review_text=BeautifulSoup(review,'lxml').get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    words=review_text.lower().split()
    ps=PorterStemmer()
    #using set to spped up the process
    stops = set(stopwords.words("english"))
    words = [ps.stem(w) for w in words if not w in stops]
    return( " ".join( words ))   
  

In [7]:
#size of dataset
datasize=len(train_dataset["review"])


In [8]:
datasize

25000

In [9]:
#calling the clean_review method
corpus=[]
for review in train_dataset['review']:
    corpus.append(clean_review_method(review))
    

In [10]:
corpus[0]

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay 

In [11]:
#creating bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(analyzer="word", preprocessor=None, tokenizer=None, stop_words=None, max_features=5000)
X=cv.fit_transform(corpus).toarray()

In [12]:
train_dataset.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [13]:
y=train_dataset.iloc[:,1].values

In [14]:
y

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [15]:
#splitting the data set into trainig and test set
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)




In [16]:
# initialize a random forest classifier with 100 trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
rf_classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0) 

# Use cross validation to evaluate the performance of Random Forest
rf_classifier_error = 1 - cross_val_score(rf_classifier, X, train_dataset['sentiment'], 
                                   cv=5, scoring='accuracy', n_jobs=-1).mean()
print('Random Forest training error: {:.4}'.format(rf_classifier_error))

Random Forest training error: 0.1629


In [17]:
#fitting random forest to the trainig set
classifier= RandomForestClassifier(n_estimators = 100) 
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
#importing test dataset
test_dataset= pd.read_csv("testData.tsv", header=0, delimiter="\t",quoting=3 )

In [19]:
test_dataset.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [20]:
test_dataset.shape

(25000, 2)

In [21]:
#cleaning test data set
corpus_test=[]
for review in test_dataset['review']:
    corpus_test.append(clean_review_method(review))

In [22]:
#creating bagofwords for test dataset
test_feature=cv.fit_transform(corpus_test).toarray()

In [23]:
result = classifier.predict(test_feature)

In [24]:
classifier.score(X_test,y_test)

0.836

In [25]:
#Creating a Submission of Random Forest
Bagofwords_submission=pd.DataFrame( data={"id":test_dataset["id"], "sentiment":result} )

In [26]:
Bagofwords_submission.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )