In [0]:
from google.colab import files

import pandas as pd
import numpy as np
import re
import io
import os
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup

In [0]:
def print_lines(lst, words_per_line=10):
  for i, word in enumerate(lst, 1):
    print word,
    if i % words_per_line==0:
      print

<center>**This demo is very much based on the this [Kaggle solution by Angela Chapman][ac].**</center>

[ac]: https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words "Bag-ofWords Kaggle solution"

**NOTE:** At this moment I can't import NLTK\_data into Google Colab, so the entire implementation is sk-learn-based. Relevant referrences to NLTK are given throughout.

# Get the data

## Train data

Our data is located somewhere on our local machine, and Google Colab allows us to upload it into our remote Colab server. Reading in the [official Google Colab docs][ogcd] and some more in Stack Overflow I got the following script.

[ogcd]: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=vz-jH8T_Uk2c "Google Colab and data resources"

<center>**WARNING:** This upload takes time</center>

In [62]:
uploaded = files.upload()

Saving labeledTrainData.tsv to labeledTrainData (2).tsv


In [0]:
train_orig = pd.read_csv(io.StringIO(uploaded['labeledTrainData.tsv'].decode('utf-8')), 
                         header=0, delimiter="\t", quoting=3)

In [64]:
train_orig.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


We sample the data to reduce execution time (for class only).

In [0]:
train = train_orig.sample(n=1000, random_state=0)

## Stopwords

since I couldn't import the corpus into Google Colab (hence the comment), I put instead a file with stopwords. Finally, *en_sw* is a set of English stopwords.

In [66]:
uploaded = files.upload()  # import the CORRECT file!
en_sw = set(uploaded['en_sw.txt'].split('\n')[:-1])
print(list(en_sw)[::10])

Saving en_sw.txt to en_sw (1).txt
['all', 'hadn', 'has', 'nor', 'because', 'while', 'were', 's', 'from', 'weren', 'those', 'can', 'an', 'when', 'most', 'the']


Alternative code with NLTK\_data installed:

In [0]:
# from nltk.corpus import stopwords
# en_sw = set(stopwords.words("english"))

# Text manipulation

## Step by Step

We can demonstrate the process with actual reviews.

In [84]:
my_review = train.iloc[1, 2]
print(my_review.replace('.', '\n'))

"I really like Miikes movies about Yakuza, this one I saw about 2 years ago and it really fu**ed my head
 Never before seen such a sick and twisted thing
 The Story is good and the actors do their thing very well
 I haven't seen the UK or Japan version, but I have to say that I believe that the German DVD is a bit censored
 If you haven't seen the movie already and live in Germany maybe you better look out for a DVD from the Nederlands or Austria
 The I-ON DVD contains a lot of very hard and nasty scenes, but at the showdown I felt that something was missing, about one or two very short scenes
<br /><br />All in all a good perverted movie with crazy characters and a high level of violence, that's what I like Miike for!!"


### Lower case

Nothing to explain here...

In [0]:
my_review = my_review.lower()

### Cleaning HTML

When the source of the data is web scraping, we should first remove HTML leftovers, and the best way to do it is with [BeautifulSoup][bs].

We apply here the [*get_text()*][gt] method of the *Tag* class.

[bs]: https://www.crummy.com/software/BeautifulSoup/ "BeautifulSoup Home Page"
[gt]: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text "get_text() API"

In [86]:
my_review = BeautifulSoup(my_review, 'html5lib').get_text()
print(my_review.replace('.', '\n'))  # For convenience

"i really like miikes movies about yakuza, this one i saw about 2 years ago and it really fu**ed my head
 never before seen such a sick and twisted thing
 the story is good and the actors do their thing very well
 i haven't seen the uk or japan version, but i have to say that i believe that the german dvd is a bit censored
 if you haven't seen the movie already and live in germany maybe you better look out for a dvd from the nederlands or austria
 the i-on dvd contains a lot of very hard and nasty scenes, but at the showdown i felt that something was missing, about one or two very short scenes
all in all a good perverted movie with crazy characters and a high level of violence, that's what i like miike for!!"


### Tokenizing

Most texts include many non-text characters (periods, commas, parentheses, etc.), which should be either cleaned or transformed.

In this demo we simply remove all characters which are not letters, and it is performed by the [re.sub()][sub] method.

[sub]: https://docs.python.org/3.5/library/re.html#re.regex.sub "re.sub docs"

In [0]:
my_review = re.sub("don\'t like","dont-like", my_review).split()
my_review = re.sub("\W"," ", my_review).split()


In [88]:
print_lines(my_review, 15)

i really like miikes movies about yakuza this one i saw about 2 years ago
and it really fu ed my head never before seen such a sick and twisted
thing the story is good and the actors do their thing very well i haven
t seen the uk or japan version but i have to say that i believe
that the german dvd is a bit censored if you haven t seen the movie
already and live in germany maybe you better look out for a dvd from the
nederlands or austria the i on dvd contains a lot of very hard and nasty
scenes but at the showdown i felt that something was missing about one or two
very short scenes all in all a good perverted movie with crazy characters and a
high level of violence that s what i like miike for


This step can also be performed by available tokenizers, and when NLTK\_data will work, then we can do the following:

In [0]:
# from nltk.tokenize import word_tokenize
# my_review = word_tokenize(my_review)

### Stopwords

We remove the stopwords from the bag of words.

In [90]:
my_review = [w for w in my_review if w not in en_sw]
print_lines(my_review, 15)

really like miikes movies yakuza one saw 2 years ago really fu ed head never
seen sick twisted thing story good actors thing well seen uk japan version say believe
german dvd bit censored seen movie already live germany maybe better look dvd nederlands austria
dvd contains lot hard nasty scenes showdown felt something missing one two short scenes good
perverted movie crazy characters high level violence like miike


## Applying the process

In [0]:
def review_to_wordlist(review, remove_stopwords=False):
    review = review.lower()
    review = BeautifulSoup(review, 'html5lib').get_text()
    review = re.sub("\W"," ", review).split()
    if remove_stopwords:
        review = [w for w in review if w not in en_sw]
    return ' '.join(review)

**NOTE:** A single review is processed in a very short time, but we should be careful when applying it to thousands of reviews.

In [92]:
clean_train_reviews = train['review'].apply(review_to_wordlist, remove_stopwords=True)
clean_train_reviews.head()

14149    vaguely remember ben sci fi fandom days 60s se...
8946     really like miikes movies yakuza one saw 2 yea...
22378    natural born killers 1994 cinema cut r directo...
12162    tobe hooper made great movies certain bad read...
4879     santa movie starts strange think santa might p...
Name: review, dtype: object

# Feature engineering

We use the [*CountVectorizer*][cv] class to transform our bag of words into numbers. read the docs to learn the various vectorizer parameters.

[cv]: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html "CountVectorizer API"

In [0]:
vectorizer = CountVectorizer(max_features=1000, binary=False, max_df=0.3)

In [0]:
train_data_features = vectorizer.fit(clean_train_reviews).transform(clean_train_reviews)

In [0]:
words, counts = [], []
for word, ind in vectorizer.vocabulary_.items():
  words.append(word)
  counts.append(train_data_features[:, ind].sum())

df = pd.DataFrame({'Word': words, 'Count': counts})

In [155]:
df.sort_values('Count', ascending=False).head(100)

Unnamed: 0,Count,Word
452,472,story
529,409,people
24,397,much
455,394,well
596,385,great
291,368,first
788,359,bad
302,356,get
75,342,also
489,338,made


Potentially we can apply another transformation using the tf-idf extraction.

# Modelling

In [0]:
forest = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=0.1)

In [157]:
forest.fit(train_data_features, train["sentiment"] )


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [0]:
pred_train = forest.predict(train_data_features)

In [159]:
forest.score(train_data_features, pred_train)

1.0

In [50]:
# uploaded = files.upload()
# test_orig = pd.read_csv(io.StringIO(uploaded['testData.tsv'].decode('utf-8')), 
#                         header=0, delimiter="\t", quoting=3)

Saving testData.tsv to testData.tsv


In [0]:
test = test_orig.copy()

In [92]:
# test = test_orig.sample(n=1000, random_state=1)

In [160]:
t1 = time()
clean_test_reviews = test['review'].apply(review_to_wordlist, remove_stopwords=True)
print time()-t1

154.368343115


In [0]:
test_data_features = vectorizer.transform(clean_test_reviews)

In [162]:
test_data_features

<25000x1000 sparse matrix of type '<type 'numpy.int64'>'
	with 1199976 stored elements in Compressed Sparse Row format>

In [0]:
result = forest.predict(test_data_features)

In [0]:
output = pd.DataFrame({"id": test["id"], "sentiment": result})

In [0]:
output.to_csv('test_submission2.csv', index=False, quoting=3)

In [0]:
files.download('test_submission2.csv')