##  Importing all necessary libraries

In [424]:
import pandas as pd     
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
# nltk.download()
from nltk.corpus import stopwords # Import the stop word list
from sklearn.model_selection import train_test_split

## Reading the data (Training & Testing data)

In [486]:
# Both files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so the double quotes around each review are kept as literal characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
main_train_data = pd.read_csv("labeledTrainData.tsv", **tsv_options)
main_test_data = pd.read_csv("testData.tsv", **tsv_options)

# Hold out 30% of the labelled reviews for local evaluation; the fixed seed
# makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    main_train_data[["review"]],
    main_train_data["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Re-attach the labels to their reviews (aligned on the original index) and
# renumber the rows from 0 so later cells can address them by position.
train_data = X_train.join(Y_train).reset_index(drop=True)
test_data = X_test.join(Y_test).reset_index(drop=True)

for frame in (train_data, test_data, main_test_data):
    print(frame.head())

                                              review  sentiment
0  "I saw this pilot when it was first shown, and...          1
1  "Now either you like Mr Carrey's humour or you...          1
2  "This motion picture has a steady, haunting pa...          1
3  "In my opinion, this movie's title should be c...          0
4  "Why, oh, why won't they learn? When you've go...          0
                                              review  sentiment
0  "I read that \"There's a Girl in My Soup\" cam...          0
1  "This film pulls you in from the get-go becaus...          1
2  "From the awful death scenes to guns that fire...          0
3  "I saw that movie few days ago. This movie is ...          1
4  "For a film with so much promise it was disapp...          0
           id                                             review
0  "12311_10"  "Naturally in a film who's main themes are of ...
1    "8348_2"  "This movie is a disaster within a disaster fi...
2    "5828_4"  "All in all, this is a

In [487]:
# (rows, columns) of the two labelled splits and of the main test set.
for frame in (train_data, test_data, main_test_data):
    print(frame.shape)

(17500, 2)
(7500, 2)
(25000, 2)


In [488]:
# Column names of each frame, in the same order as the shapes above.
for frame in (train_data, test_data, main_test_data):
    print(frame.columns.values)

['review' 'sentiment']
['review' 'sentiment']
['id' 'review']


In [489]:
# train_data['review'][0]

# gives 

# '"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call..."'

## Preprocessing a single review
###### Stripping HTML/XML markup from the text


In [490]:

# Parse the first training review so that any HTML/XML markup can be stripped.
# The parser is named explicitly: without it BeautifulSoup guesses one based on
# what is installed, emits a warning, and may behave differently across machines.
dummyExample = BeautifulSoup(train_data["review"][0], "html.parser")
print(dummyExample.get_text())

"I saw this pilot when it was first shown, and I'm sure countless \"Spirit\" fans hate it, because, like Batman, the Green Hornet etc., it took the character in the direction of \"camp\". But I evidently never got enough of Batman, because I thought it was entertaining, in some of the same ways as that show. There are two parts that stay with me. First, when Denny's partner has been fatally wounded, and he makes a dramatic speech about how he always stood for the law, and obeying the exact letter of it. Then, he says something like, \"Boy, was I stupid!\" Which is his way of telling Denny to become a vigilante instead, which he does (though the TV Batman kind). Then, there's the scene where he tries to seduce the villainess into letting him go by kissing her, but she isn't fooled, because he's too honest to kiss her convincingly ! This was a great example of \"camp\", that was also \"underplayed\", by both the actor and actress."


In [491]:
# Every character that is not an ASCII letter (digits, punctuation, quotes)
# becomes a single space, so words stay separated.
non_letter_pattern = re.compile("[^a-zA-Z]")
letters_only = non_letter_pattern.sub(" ", dummyExample.get_text())
print(letters_only)

 I saw this pilot when it was first shown  and I m sure countless   Spirit   fans hate it  because  like Batman  the Green Hornet etc   it took the character in the direction of   camp    But I evidently never got enough of Batman  because I thought it was entertaining  in some of the same ways as that show  There are two parts that stay with me  First  when Denny s partner has been fatally wounded  and he makes a dramatic speech about how he always stood for the law  and obeying the exact letter of it  Then  he says something like    Boy  was I stupid    Which is his way of telling Denny to become a vigilante instead  which he does  though the TV Batman kind   Then  there s the scene where he tries to seduce the villainess into letting him go by kissing her  but she isn t fooled  because he s too honest to kiss her convincingly   This was a great example of   camp    that was also   underplayed    by both the actor and actress  


In [492]:
# Lower-case the text, then tokenise on runs of whitespace.
lower_case = str.lower(letters_only)
words = str.split(lower_case)
print(words)

['i', 'saw', 'this', 'pilot', 'when', 'it', 'was', 'first', 'shown', 'and', 'i', 'm', 'sure', 'countless', 'spirit', 'fans', 'hate', 'it', 'because', 'like', 'batman', 'the', 'green', 'hornet', 'etc', 'it', 'took', 'the', 'character', 'in', 'the', 'direction', 'of', 'camp', 'but', 'i', 'evidently', 'never', 'got', 'enough', 'of', 'batman', 'because', 'i', 'thought', 'it', 'was', 'entertaining', 'in', 'some', 'of', 'the', 'same', 'ways', 'as', 'that', 'show', 'there', 'are', 'two', 'parts', 'that', 'stay', 'with', 'me', 'first', 'when', 'denny', 's', 'partner', 'has', 'been', 'fatally', 'wounded', 'and', 'he', 'makes', 'a', 'dramatic', 'speech', 'about', 'how', 'he', 'always', 'stood', 'for', 'the', 'law', 'and', 'obeying', 'the', 'exact', 'letter', 'of', 'it', 'then', 'he', 'says', 'something', 'like', 'boy', 'was', 'i', 'stupid', 'which', 'is', 'his', 'way', 'of', 'telling', 'denny', 'to', 'become', 'a', 'vigilante', 'instead', 'which', 'he', 'does', 'though', 'the', 'tv', 'batman', '

In [493]:
# Show the NLTK English stop-word list that the next step filters against.
english_stop_words = stopwords.words("english")
print(english_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [494]:
# Build the stop-word collection once, as a set. The original expression
# re-evaluated stopwords.words("english") for every single word and then did a
# linear scan of the resulting list.
stop_word_set = set(stopwords.words("english"))
words = [w for w in words if w not in stop_word_set]
print(words)

['saw', 'pilot', 'first', 'shown', 'sure', 'countless', 'spirit', 'fans', 'hate', 'like', 'batman', 'green', 'hornet', 'etc', 'took', 'character', 'direction', 'camp', 'evidently', 'never', 'got', 'enough', 'batman', 'thought', 'entertaining', 'ways', 'show', 'two', 'parts', 'stay', 'first', 'denny', 'partner', 'fatally', 'wounded', 'makes', 'dramatic', 'speech', 'always', 'stood', 'law', 'obeying', 'exact', 'letter', 'says', 'something', 'like', 'boy', 'stupid', 'way', 'telling', 'denny', 'become', 'vigilante', 'instead', 'though', 'tv', 'batman', 'kind', 'scene', 'tries', 'seduce', 'villainess', 'letting', 'go', 'kissing', 'fooled', 'honest', 'kiss', 'convincingly', 'great', 'example', 'camp', 'also', 'underplayed', 'actor', 'actress']


## Preprocessing all of the data (training, hold-out test and main test sets)

In [495]:
# Number of reviews in each data set; used as loop bounds and in the progress
# messages of the cleaning cells.
training_data_size = len(train_data["review"])
testing_data_size = len(test_data["review"])
main_testing_data_size = len(main_test_data["review"])

for review_count in (training_data_size, testing_data_size, main_testing_data_size):
    print(review_count)

17500
7500
25000


In [496]:
# Cache for the English stop words so the NLTK list is loaded only once.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def clean_text_data(data_point, data_size, index=None):
    """Turn one raw review into a space-separated string of meaningful words.

    The review is stripped of HTML/XML markup, every non-letter character is
    replaced by a space, the text is lower-cased and split on whitespace, and
    NLTK English stop words are dropped.

    Parameters
    ----------
    data_point : str
        Raw review text, possibly containing markup.
    data_size : int
        Total number of reviews being cleaned; only used in the progress message.
    index : int, optional
        Position of this review within the data set, used for progress
        reporting. When omitted, the module-level loop variable ``i`` is used
        if it exists (this is what the existing cleaning loops rely on). If
        neither is available, no progress is printed instead of raising
        ``NameError``.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    review_text = BeautifulSoup(data_point, "html.parser").get_text()
    review_words = re.sub("[^a-zA-Z]", " ", review_text).lower().split()

    stop_words = _english_stop_words()
    meaningful_words = [word for word in review_words if word not in stop_words]

    # Progress message every 2000 reviews. Falls back to the global `i` for
    # callers that do not pass `index`; skipped when data_size is 0 to avoid a
    # division by zero.
    position = index if index is not None else globals().get("i")
    if isinstance(position, int) and data_size and position % 2000 == 0:
        print("Cleaned %d of %d data (%d %%)." % (position, data_size, (position / data_size) * 100))

    return " ".join(meaningful_words)
    

In [497]:
# Three independent, initially empty lists that will hold the cleaned review
# strings for the training split, the hold-out split and the main test set.
clean_train_data_list, clean_test_data_list, clean_main_test_data_list = [], [], []

##### cleaning training data.

In [498]:
# Start from an empty list so that re-running this cell cannot append a second
# copy of every review (which would leave the list longer than the label column).
clean_train_data_list = []
# The loop variable must remain the module-level name `i`: clean_text_data
# reads it to decide when to print a progress message.
for i in range(training_data_size):
    # .iloc selects by position, so this does not depend on the index labels.
    clean_train_data_list.append(clean_text_data(train_data["review"].iloc[i], training_data_size))
print("Cleaning training completed!")

Cleaned 0 of 17500 data (0 %).
Cleaned 2000 of 17500 data (11 %).
Cleaned 4000 of 17500 data (22 %).
Cleaned 6000 of 17500 data (34 %).
Cleaned 8000 of 17500 data (45 %).
Cleaned 10000 of 17500 data (57 %).
Cleaned 12000 of 17500 data (68 %).
Cleaned 14000 of 17500 data (80 %).
Cleaned 16000 of 17500 data (91 %).
Cleaning training completed!


##### cleaning testing data.

In [499]:
# Start from an empty list so that re-running this cell cannot append a second
# copy of every review (which would leave the list longer than the label column).
clean_test_data_list = []
# The loop variable must remain the module-level name `i`: clean_text_data
# reads it to decide when to print a progress message.
for i in range(testing_data_size):
    # .iloc selects by position, so this does not depend on the index labels.
    clean_test_data_list.append(clean_text_data(test_data["review"].iloc[i], testing_data_size))
print("Cleaning testing completed!")

Cleaned 0 of 7500 data (0 %).
Cleaned 2000 of 7500 data (26 %).
Cleaned 4000 of 7500 data (53 %).
Cleaned 6000 of 7500 data (80 %).
Cleaning testing completed!


###### cleaning main testing data

In [500]:
# Start from an empty list so that re-running this cell cannot append a second
# copy of every review (which would leave the list longer than the id column).
clean_main_test_data_list = []
# The loop variable must remain the module-level name `i`: clean_text_data
# reads it to decide when to print a progress message.
for i in range(main_testing_data_size):
    # .iloc selects by position, so this does not depend on the index labels.
    clean_main_test_data_list.append(clean_text_data(main_test_data["review"].iloc[i], main_testing_data_size))
print("Cleaning main testing completed!")

Cleaned 0 of 25000 data (0 %).
Cleaned 2000 of 25000 data (8 %).
Cleaned 4000 of 25000 data (16 %).
Cleaned 6000 of 25000 data (24 %).
Cleaned 8000 of 25000 data (32 %).
Cleaned 10000 of 25000 data (40 %).
Cleaned 12000 of 25000 data (48 %).
Cleaned 14000 of 25000 data (56 %).
Cleaned 16000 of 25000 data (64 %).
Cleaned 18000 of 25000 data (72 %).
Cleaned 20000 of 25000 data (80 %).
Cleaned 22000 of 25000 data (88 %).
Cleaned 24000 of 25000 data (96 %).
Cleaning main testing completed!


## Preparing the features for model training

In [501]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-level count vectoriser. The reviews are already cleaned, so no extra
# tokenizer, preprocessor or stop-word list is supplied; the vocabulary is
# capped at 5000 entries.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [502]:
# Learn the vocabulary from the cleaned training reviews, count the words per
# review, and convert the sparse result into a dense NumPy array.
train_data_features = vectorizer.fit_transform(clean_train_data_list).toarray()
print(train_data_features.shape)

(17500, 5000)


In [503]:
# Reuse the already-fitted vocabulary (transform only, no re-fitting) for the
# hold-out reviews, then densify.
test_data_features = vectorizer.transform(clean_test_data_list).toarray()
print(test_data_features.shape)

(7500, 5000)


In [505]:
# Reuse the already-fitted vocabulary (transform only, no re-fitting) for the
# main test reviews, then densify.
main_test_data_features = vectorizer.transform(clean_main_test_data_list).toarray()
print(main_test_data_features.shape)

(25000, 5000)


In [506]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on versions that do not have it.
if hasattr(vectorizer, "get_feature_names_out"):
    # .tolist() converts the returned NumPy array into a plain list of str,
    # matching what get_feature_names() used to return.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)



In [507]:
# Column-wise sum of the training matrix: total occurrences of each vocabulary
# word across all training reviews.
dist = train_data_features.sum(axis=0)

# Print "<count> <word>" for every vocabulary entry.
for count, tag in zip(dist, vocab):
    print(count, tag)

138 abandoned
93 abc
64 abilities
306 ability
897 able
63 abraham
55 abrupt
72 absence
241 absolute
1007 absolutely
56 absorbed
206 absurd
137 abuse
66 abusive
74 abysmal
192 academy
336 accent
144 accents
215 accept
88 acceptable
99 accepted
65 access
218 accident
128 accidentally
57 acclaimed
64 accompanied
57 accomplish
87 accomplished
202 according
135 account
61 accuracy
177 accurate
91 accused
127 achieve
93 achieved
83 achievement
58 acid
679 across
842 act
448 acted
4541 acting
2391 action
221 actions
55 activities
1687 actor
3140 actors
833 actress
246 actresses
271 acts
553 actual
2981 actually
97 ad
213 adam
64 adams
318 adaptation
114 adapted
572 add
284 added
65 addicted
113 adding
249 addition
55 additional
227 adds
81 adequate
85 admire
450 admit
90 admittedly
55 adopted
67 adorable
363 adult
264 adults
76 advance
100 advantage
342 adventure
133 adventures
58 advertising
188 advice
55 advise
260 affair
68 affect
82 affected
73 afford
97 aforementioned
243 afraid
138 afri

455 comment
227 commentary
66 commented
537 comments
166 commercial
79 commercials
94 commit
121 committed
332 common
71 communist
205 community
62 companies
76 companion
357 company
234 compare
361 compared
69 comparing
175 comparison
56 compassion
68 compelled
268 compelling
105 competent
98 competition
77 complain
62 complaining
100 complaint
59 complaints
707 complete
1339 completely
299 complex
62 complexity
130 complicated
79 composed
61 composer
325 computer
129 con
88 conceived
355 concept
73 concern
187 concerned
73 concerning
101 concerns
89 concert
341 conclusion
105 condition
58 confess
73 confidence
193 conflict
56 conflicts
57 confrontation
263 confused
256 confusing
111 confusion
87 connect
107 connected
176 connection
63 connery
59 conscience
66 conscious
92 consequences
64 conservative
326 consider
68 considerable
345 considered
390 considering
71 consistent
75 consistently
103 consists
88 conspiracy
199 constant
287 constantly
71 constructed
69 construction
104 contac

87 heat
226 heaven
121 heavily
345 heavy
149 heck
60 heights
57 heist
251 held
110 helen
75 helicopter
728 hell
60 hello
1377 help
228 helped
122 helping
254 helps
117 hence
288 henry
59 hepburn
701 hero
213 heroes
83 heroic
187 heroine
97 heston
284 hey
231 hidden
152 hide
63 hideous
106 hiding
1484 high
211 higher
79 highest
154 highlight
86 highlights
814 highly
709 hilarious
60 hilariously
176 hill
113 hills
98 hint
72 hints
124 hip
61 hippie
85 hire
135 hired
280 historical
58 historically
918 history
737 hit
144 hitchcock
238 hitler
195 hits
91 hitting
78 ho
147 hoffman
358 hold
140 holding
211 holds
122 hole
237 holes
96 holiday
77 hollow
92 holly
1310 hollywood
112 holmes
73 holy
102 homage
1311 home
89 homeless
81 homer
62 homosexual
352 honest
319 honestly
66 honesty
135 hong
121 honor
119 hood
67 hook
92 hooked
68 hop
1025 hope
98 hoped
163 hopefully
57 hopeless
59 hopelessly
187 hopes
299 hoping
79 hopper
88 horrendous
853 horrible
144 horribly
76 horrid
111 horrific
63 hor

92 piano
335 pick
228 picked
89 picking
114 picks
1027 picture
315 pictures
78 pie
1062 piece
319 pieces
61 pierce
77 pig
154 pile
221 pilot
64 pink
67 pit
103 pitch
57 pitiful
129 pitt
160 pity
1696 place
130 placed
277 places
111 plague
398 plain
286 plan
246 plane
356 planet
68 planned
93 planning
143 plans
66 plant
112 plastic
58 plausible
1539 play
1800 played
207 player
201 players
1143 playing
1582 plays
162 pleasant
80 pleasantly
754 please
90 pleased
216 pleasure
428 plenty
66 plight
4660 plot
200 plots
448 plus
70 poetic
62 poetry
103 poignant
2246 point
89 pointed
357 pointless
592 points
61 poison
74 polanski
778 police
61 polished
435 political
71 politically
150 politics
101 pool
1329 poor
513 poorly
205 pop
74 popcorn
57 pops
388 popular
60 popularity
83 population
261 porn
67 porno
79 portion
106 portrait
181 portray
347 portrayal
403 portrayed
165 portraying
162 portrays
59 posey
124 position
343 positive
55 positively
71 possessed
67 possibilities
76 possibility
733 p

208 toward
439 towards
908 town
122 toy
76 toys
280 track
66 tracks
109 tracy
92 trade
57 trademark
115 tradition
166 traditional
268 tragedy
255 tragic
64 trail
281 trailer
74 trailers
296 train
68 trained
157 training
66 transfer
66 transformation
64 transition
77 translation
73 trap
128 trapped
361 trash
66 trashy
163 travel
72 traveling
81 travels
58 travesty
121 treasure
230 treat
193 treated
170 treatment
72 treats
119 tree
74 trees
189 trek
100 tremendous
117 trial
62 tribe
99 tribute
115 trick
98 tricks
533 tried
898 tries
148 trilogy
82 trio
318 trip
55 tripe
100 trite
86 triumph
62 troops
362 trouble
98 troubled
60 troubles
136 truck
1663 true
1186 truly
227 trust
489 truth
1285 try
1718 trying
59 tube
112 tune
86 tunes
128 turkey
971 turn
627 turned
91 turner
245 turning
894 turns
1933 tv
78 twelve
190 twenty
271 twice
60 twilight
92 twin
430 twist
136 twisted
309 twists
4842 two
783 type
164 types
519 typical
89 typically
245 ugly
157 uk
190 ultimate
365 ultimately
108 ultr

## Training the model

In [517]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with 150 trees. random_state is fixed (same seed as the
# train/test split) so the fitted model, and therefore the reported accuracy
# and the submission file, are reproducible from run to run.
forest = RandomForestClassifier(n_estimators=150, random_state=42)
forest = forest.fit(train_data_features, train_data["sentiment"])

## Testing the model

In [516]:
# Score the fitted forest on the hold-out split: fraction of reviews whose
# predicted sentiment equals the true label.
predictions = forest.predict(test_data_features)
holdout_accuracy = accuracy_score(test_data["sentiment"], predictions)
print(holdout_accuracy)

0.8414666666666667


## Creating the output submission file

In [511]:
# Predict sentiment for the main test set and pair each prediction with its id.
result = forest.predict(main_test_data_features)
submission_columns = {"id": main_test_data["id"], "sentiment": result}
output = pd.DataFrame(submission_columns)

# quoting=3 is csv.QUOTE_NONE: the ids were read with their surrounding double
# quotes kept as literal characters, so they are written back unchanged.
output.to_csv("paras_submission.csv", quoting=3, index=False)
