In [1]:
import os
import re
import numpy as np
import pandas as pd
import logging
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
import nltk
from nltk.corpus import stopwords

In [3]:
# Relative location of the labeled training set.
datafile = os.path.join('all', 'labeledTrainData.tsv')

In [4]:
# The file is tab-separated; backslash is passed to the parser as the escape character.
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
n_reviews = df.shape[0]
print('Number of reviews:{}'.format(n_reviews))
df.head()

Number of reviews:25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# Peek at the raw review text column.
df.loc[:, 'review']

0        With all this stuff going down at the moment w...
1        "The Classic War of the Worlds" by Timothy Hin...
2        The film starts with a manager (Nicholas Bell)...
3        It must be assumed that those who praised this...
4        Superbly trashy and wondrously unpretentious 8...
5        I dont know why people think this is such a ba...
6        This movie could have been very good, but come...
7        I watched this video at a friend's house. I'm ...
8        A friend of mine bought this film for £1, and ...
9        <br /><br />This movie is full of references. ...
10       What happens when an army of wetbacks, towelhe...
11       Although I generally do not like remakes belie...
12       "Mr. Harvey Lights a Candle" is anchored by a ...
13       I had a feeling that after "Submerged", this o...
14       note to George Litman, and others: the Mystery...
15       Stephen King adaptation (scripted by King hims...
16       `The Matrix' was an exciting summer blockbuste.

In [6]:
# just to check the data
def display(text, title):
    """Print ``text`` under a separator line and a ``title`` header.

    Written with single-argument ``print(...)`` calls so the cell runs, and
    produces the same output, on both Python 2 and Python 3 (the previous
    print-statement form is a SyntaxError on Python 3).

    NOTE(review): inside a notebook this name shadows IPython's own
    ``display`` helper; kept as-is because later cells call it by this name.

    Parameters
    ----------
    text : object
        Value to show; it is printed as-is (lists print as their repr).
    title : object
        Label printed after ``TITLE : ``.

    Returns
    -------
    None
    """
    print("~~~~~~~~~~~~~~~~~~~~")
    # Two spaces after the colon reproduce what `print "TITLE : ", title`
    # emitted (the literal's trailing space plus print's own separator).
    print("TITLE :  %s" % (title,))
    print("TEXT : ")
    print(text)
    # Trailing blank line, as the bare `print` statement produced.
    print("")

In [7]:
# Use the review stored under index label 3 as the running example.
raw_example = df.loc[3, 'review']
display(text=raw_example, title='Original Data')

~~~~~~~~~~~~~~~~~~~~
TITLE :  Original Data
TEXT : 
It must be assumed that those who praised this film ("the greatest filmed opera ever," didn't I read somewhere?) either don't care for opera, don't care for Wagner, or don't care about anything except their desire to appear Cultured. Either as a representation of Wagner's swan-song, or as a movie, this strikes me as an unmitigated disaster, with a leaden reading of the score matched to a tricksy, lugubrious realisation of the text.<br /><br />It's questionable that people with ideas as to what an opera (or, for that matter, a play, especially one by Shakespeare) is "about" should be allowed anywhere near a theatre or film studio; Syberberg, very fashionably, but without the smallest justification from Wagner's text, decided that Parsifal is "about" bisexual integration, so that the title character, in the latter stages, transmutes into a kind of beatnik babe, though one who continues to sing high tenor -- few if any of the actors in t

In [8]:
# Parse the sample review with BeautifulSoup and keep only its text content,
# which drops the HTML tags.
html_example = BeautifulSoup(raw_example, features='html.parser').get_text()
display(text=html_example, title="Text after cleaning the html tags")

~~~~~~~~~~~~~~~~~~~~
TITLE :  Text after cleaning the html tags
TEXT : 
It must be assumed that those who praised this film ("the greatest filmed opera ever," didn't I read somewhere?) either don't care for opera, don't care for Wagner, or don't care about anything except their desire to appear Cultured. Either as a representation of Wagner's swan-song, or as a movie, this strikes me as an unmitigated disaster, with a leaden reading of the score matched to a tricksy, lugubrious realisation of the text.It's questionable that people with ideas as to what an opera (or, for that matter, a play, especially one by Shakespeare) is "about" should be allowed anywhere near a theatre or film studio; Syberberg, very fashionably, but without the smallest justification from Wagner's text, decided that Parsifal is "about" bisexual integration, so that the title character, in the latter stages, transmutes into a kind of beatnik babe, though one who continues to sing high tenor -- few if any of the act

In [9]:
# Remove symbols: every character that is not an ASCII letter becomes a space.
letter_example = re.compile(r'[^a-zA-Z]').sub(' ', html_example)
display(text=letter_example, title='Without simbols')

~~~~~~~~~~~~~~~~~~~~
TITLE :  Without simbols
TEXT : 
It must be assumed that those who praised this film   the greatest filmed opera ever   didn t I read somewhere   either don t care for opera  don t care for Wagner  or don t care about anything except their desire to appear Cultured  Either as a representation of Wagner s swan song  or as a movie  this strikes me as an unmitigated disaster  with a leaden reading of the score matched to a tricksy  lugubrious realisation of the text It s questionable that people with ideas as to what an opera  or  for that matter  a play  especially one by Shakespeare  is  about  should be allowed anywhere near a theatre or film studio  Syberberg  very fashionably  but without the smallest justification from Wagner s text  decided that Parsifal is  about  bisexual integration  so that the title character  in the latter stages  transmutes into a kind of beatnik babe  though one who continues to sing high tenor    few if any of the actors in the film ar

In [10]:
# Lower-case the text, then break it into a list of words on whitespace.
words = letter_example.lower().split(None)
display(text=words, title='Words list')

~~~~~~~~~~~~~~~~~~~~
TITLE :  Words list
TEXT : 
[u'it', u'must', u'be', u'assumed', u'that', u'those', u'who', u'praised', u'this', u'film', u'the', u'greatest', u'filmed', u'opera', u'ever', u'didn', u't', u'i', u'read', u'somewhere', u'either', u'don', u't', u'care', u'for', u'opera', u'don', u't', u'care', u'for', u'wagner', u'or', u'don', u't', u'care', u'about', u'anything', u'except', u'their', u'desire', u'to', u'appear', u'cultured', u'either', u'as', u'a', u'representation', u'of', u'wagner', u's', u'swan', u'song', u'or', u'as', u'a', u'movie', u'this', u'strikes', u'me', u'as', u'an', u'unmitigated', u'disaster', u'with', u'a', u'leaden', u'reading', u'of', u'the', u'score', u'matched', u'to', u'a', u'tricksy', u'lugubrious', u'realisation', u'of', u'the', u'text', u'it', u's', u'questionable', u'that', u'people', u'with', u'ideas', u'as', u'to', u'what', u'an', u'opera', u'or', u'for', u'that', u'matter', u'a', u'play', u'especially', u'one', u'by', u'shakespeare', u'is', 

In [11]:
# Fetch only the stop-word corpus used below. A bare nltk.download() opens the
# interactive downloader and waits for the user, which stalls "Run All".
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [12]:
# English stop words from the NLTK corpus, held in a set for membership tests.
stopWords = set(stopwords.words('english'))
# Keep only the words of the example that are not stop words.
words_nostop = list(filter(lambda w: w not in stopWords, words))
display(text=words_nostop, title="Clean the stop words")

~~~~~~~~~~~~~~~~~~~~
TITLE :  Clean the stop words
TEXT : 
[u'must', u'assumed', u'praised', u'film', u'greatest', u'filmed', u'opera', u'ever', u'read', u'somewhere', u'either', u'care', u'opera', u'care', u'wagner', u'care', u'anything', u'except', u'desire', u'appear', u'cultured', u'either', u'representation', u'wagner', u'swan', u'song', u'movie', u'strikes', u'unmitigated', u'disaster', u'leaden', u'reading', u'score', u'matched', u'tricksy', u'lugubrious', u'realisation', u'text', u'questionable', u'people', u'ideas', u'opera', u'matter', u'play', u'especially', u'one', u'shakespeare', u'allowed', u'anywhere', u'near', u'theatre', u'film', u'studio', u'syberberg', u'fashionably', u'without', u'smallest', u'justification', u'wagner', u'text', u'decided', u'parsifal', u'bisexual', u'integration', u'title', u'character', u'latter', u'stages', u'transmutes', u'kind', u'beatnik', u'babe', u'though', u'one', u'continues', u'sing', u'high', u'tenor', u'actors', u'film', u'singers', u'g

In [13]:
# define one function that applies every cleaning step
def clean_text(text):
    """Normalise one raw review into a space-separated string of words.

    Steps, in order:
      1. parse ``text`` as HTML and keep only the text content;
      2. replace every character that is not an ASCII letter with a space;
      3. lower-case and split on whitespace;
      4. drop words found in the module-level ``stopWords`` collection.

    Parameters
    ----------
    text : str
        Raw review, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none).
    """
    # Join the text fragments with a space: with the default empty separator,
    # words on either side of a tag ("end<br />Start") are fused into one
    # token ("endStart"). Extra spaces are harmless, split() collapses them.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in stopWords]
    return ' '.join(words)

In [14]:
# Run the full cleaning function on the sample review.
clean_text(text=raw_example)

u'must assumed praised film greatest filmed opera ever read somewhere either care opera care wagner care anything except desire appear cultured either representation wagner swan song movie strikes unmitigated disaster leaden reading score matched tricksy lugubrious realisation text questionable people ideas opera matter play especially one shakespeare allowed anywhere near theatre film studio syberberg fashionably without smallest justification wagner text decided parsifal bisexual integration title character latter stages transmutes kind beatnik babe though one continues sing high tenor actors film singers get double dose armin jordan conductor seen face heard voice amfortas also appears monstrously double exposure kind batonzilla conductor ate monsalvat playing good friday music way transcendant loveliness nature represented scattering shopworn flaccid crocuses stuck ill laid turf expedient baffles theatre sometimes piece imperfections thoughts think syberberg splice parsifal gurnema

In [15]:
# Add a column holding the cleaned version of every review.
df['clean_review'] = df['review'].map(clean_text)
df.head()

Unnamed: 0,id,sentiment,review,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...",classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...


In [16]:
# Bag-of-words counts, vocabulary capped at 5000 features.
vectorizer = CountVectorizer(max_features=5000)
train_data_features = vectorizer.fit_transform(df['clean_review'])
# Convert the vectorizer output to a dense NumPy array.
train_data_features = train_data_features.toarray()
train_data_features.shape

(25000L, 5000L)

In [17]:
# Train a random forest of 100 trees on the bag-of-words features.
# A fixed random_state makes the fitted forest, and therefore the predictions
# written to the submission, reproducible from one run to the next.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

In [18]:
# Confusion matrix of the forest's predictions on the training features
# themselves, so it shows fit to the training data, not held-out accuracy.
confusion_matrix(y_true=df['sentiment'],
                 y_pred=forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]], dtype=int64)

In [19]:
# Drop the names bound to the training frame and its dense feature matrix.
del df, train_data_features

In [20]:
# Load the test set and run it through the same cleaning function.
datafile = os.path.join('all', 'testData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
n_reviews = df.shape[0]
print('Number of reviews: {}'.format(n_reviews))
df['clean_review'] = df['review'].map(clean_text)
df.head()

Number of reviews: 25000


Unnamed: 0,id,review,clean_review
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main themes mortality nostalgia...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kids saw tonight child loved one point k...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression several different ...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction small time mob life filmed ...


In [21]:
# Encode the test reviews with the already-fitted vectorizer (transform only).
test_data_features = vectorizer.transform(df['clean_review'])
test_data_features = test_data_features.toarray()
test_data_features.shape

(25000L, 5000L)

In [22]:
# Predict a sentiment label for every test review and pair it with its id.
result = forest.predict(test_data_features)
output = pd.DataFrame({
    'id': df['id'],
    'sentiment': result,
})

In [23]:
# Preview the first five rows of the submission frame.
output.head(n=5)

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [24]:
# Write the frame built and previewed above instead of constructing a second,
# identical one, so the file on disk always matches what was inspected.
output.to_csv('submission.csv', index=False)