In [0]:
from google.colab import files

import pandas as pd
import numpy as np
import re
import io
import os
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup

<center>**This demo is very much based on the this [Kaggle solution by Angela Chapman][ac].**</center>

[ac]: https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words "Bag-ofWords Kaggle solution"

**NOTE:** At this moment I can't import NLTK\_data into Google Colab, so the entire implementation is sk-learn-based. Relevant referrences to NLTK are given throughout.

# Get the data

## Train data

Our data is located somewhere on our local machine, and Google Colab allows us to upload it into our remote Colab server. Reading in the [official Google Colab docs][ogcd] and some more in Stack Overflow I got the following script.

[ogcd]: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=vz-jH8T_Uk2c "Google Colab and data resources"

<center>**WARNING:** This upload takes time</center>

In [110]:
uploaded = files.upload()

Saving labeledTrainData.tsv to labeledTrainData (3).tsv


In [0]:
train_orig = pd.read_csv(io.StringIO(uploaded['labeledTrainData.tsv'].decode('utf-8')), 
                         header=0, delimiter="\t", quoting=3)

In [112]:
train_orig.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


We sample the data to reduce execution time (for class only).

In [0]:
train = train_orig.sample(n=1000, random_state=0)

## Stopwords

since I couldn't import the corpus into Google Colab (hence the comment), I put instead a file with stopwords. Finally, *en_sw* is a set of English stopwords.

In [114]:
uploaded = files.upload()  # import the CORRECT file!
en_sw = set(uploaded['en_sw.txt'].split('\n')[:-1])
print(list(en_sw)[::10])

Saving en_sw.txt to en_sw (7).txt
['all', 'hadn', 'has', 'nor', 'because', 'while', 'were', 's', 'from', 'weren', 'those', 'can', 'an', 'when', 'most', 'the']


Alternative code with NLTK\_data installed:

In [0]:
# from nltk.corpus import stopwords
# en_sw = set(stopwords.words("english"))

# Text manipulation

## Step by Step

We can demonstrate the process with actual reviews.

In [116]:
my_review = train.iloc[1, 2]
print(my_review.replace('.', '\n'))

"I really like Miikes movies about Yakuza, this one I saw about 2 years ago and it really fu**ed my head
 Never before seen such a sick and twisted thing
 The Story is good and the actors do their thing very well
 I haven't seen the UK or Japan version, but I have to say that I believe that the German DVD is a bit censored
 If you haven't seen the movie already and live in Germany maybe you better look out for a DVD from the Nederlands or Austria
 The I-ON DVD contains a lot of very hard and nasty scenes, but at the showdown I felt that something was missing, about one or two very short scenes
<br /><br />All in all a good perverted movie with crazy characters and a high level of violence, that's what I like Miike for!!"


### Lower case

Nothing to explain here...

In [0]:
my_review = my_review.lower()

### Cleaning HTML

When the source of the data is web scraping, we should first remove HTML leftovers, and the best way to do it is with [BeautifulSoup][bs].

We apply here the [*get_text()*][gt] method of the *Tag* class.

[bs]: https://www.crummy.com/software/BeautifulSoup/ "BeautifulSoup Home Page"
[gt]: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text "get_text() API"

In [118]:
my_review = BeautifulSoup(my_review, 'html5lib').get_text()
print(my_review.replace('.', '\n'))  # For convenience

"i really like miikes movies about yakuza, this one i saw about 2 years ago and it really fu**ed my head
 never before seen such a sick and twisted thing
 the story is good and the actors do their thing very well
 i haven't seen the uk or japan version, but i have to say that i believe that the german dvd is a bit censored
 if you haven't seen the movie already and live in germany maybe you better look out for a dvd from the nederlands or austria
 the i-on dvd contains a lot of very hard and nasty scenes, but at the showdown i felt that something was missing, about one or two very short scenes
all in all a good perverted movie with crazy characters and a high level of violence, that's what i like miike for!!"


### Tokenizing

Most texts include many non-text characters (periods, commas, parentheses, etc.), which should be either cleaned or transformed.

In this demo we simply remove all characters which are not letters, and it is performed by the [re.sub()][sub] method.

[sub]: https://docs.python.org/3.5/library/re.html#re.regex.sub "re.sub docs"

In [119]:
my_review = re.sub("[^a-zA-Z]"," ", my_review).split()
print(my_review)

[u'i', u'really', u'like', u'miikes', u'movies', u'about', u'yakuza', u'this', u'one', u'i', u'saw', u'about', u'years', u'ago', u'and', u'it', u'really', u'fu', u'ed', u'my', u'head', u'never', u'before', u'seen', u'such', u'a', u'sick', u'and', u'twisted', u'thing', u'the', u'story', u'is', u'good', u'and', u'the', u'actors', u'do', u'their', u'thing', u'very', u'well', u'i', u'haven', u't', u'seen', u'the', u'uk', u'or', u'japan', u'version', u'but', u'i', u'have', u'to', u'say', u'that', u'i', u'believe', u'that', u'the', u'german', u'dvd', u'is', u'a', u'bit', u'censored', u'if', u'you', u'haven', u't', u'seen', u'the', u'movie', u'already', u'and', u'live', u'in', u'germany', u'maybe', u'you', u'better', u'look', u'out', u'for', u'a', u'dvd', u'from', u'the', u'nederlands', u'or', u'austria', u'the', u'i', u'on', u'dvd', u'contains', u'a', u'lot', u'of', u'very', u'hard', u'and', u'nasty', u'scenes', u'but', u'at', u'the', u'showdown', u'i', u'felt', u'that', u'something', u'was'

This step can also be performed by available tokenizers, and when NLTK\_data will work, then we can do the following:

In [0]:
# from nltk.tokenize import word_tokenize
# my_review = word_tokenize(my_review)

### Stopwords

We remove the stopwords from the bag of words.

In [121]:
my_review = [w for w in my_review if w not in en_sw]
print my_review

[u'really', u'like', u'miikes', u'movies', u'yakuza', u'one', u'saw', u'years', u'ago', u'really', u'fu', u'ed', u'head', u'never', u'seen', u'sick', u'twisted', u'thing', u'story', u'good', u'actors', u'thing', u'well', u'seen', u'uk', u'japan', u'version', u'say', u'believe', u'german', u'dvd', u'bit', u'censored', u'seen', u'movie', u'already', u'live', u'germany', u'maybe', u'better', u'look', u'dvd', u'nederlands', u'austria', u'dvd', u'contains', u'lot', u'hard', u'nasty', u'scenes', u'showdown', u'felt', u'something', u'missing', u'one', u'two', u'short', u'scenes', u'good', u'perverted', u'movie', u'crazy', u'characters', u'high', u'level', u'violence', u'like', u'miike']


## Applying the process

In [0]:
def review_to_wordlist(review, remove_stopwords=False):
    review = review.lower()
    review = BeautifulSoup(review).get_text()
    review = re.sub("[^a-zA-Z]"," ", review).split()
    if remove_stopwords:
        review = [w for w in review if w not in en_sw]
    return ' '.join(review)

**NOTE:** A single review is processed in a very short time, but we should be careful when applying it to thousands of reviews.

In [123]:
clean_train_reviews = train['review'].apply(review_to_wordlist, remove_stopwords=True)
clean_train_reviews.head()

14149    vaguely remember ben sci fi fandom days severa...
8946     really like miikes movies yakuza one saw years...
22378    natural born killers cinema cut r director cut...
12162    tobe hooper made great movies certain bad read...
4879     santa movie starts strange think santa might p...
Name: review, dtype: object

# Feature engineering

We use the [*CountVectorizer*][cv] class to transform our bag of words into numbers. read the docs to learn the various vectorizer parameters.

[cv]: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html "CountVectorizer API"

In [0]:
vectorizer = CountVectorizer(max_features=500)

In [0]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)

In [126]:
train_data_features

<1000x500 sparse matrix of type '<type 'numpy.int64'>'
	with 42010 stored elements in Compressed Sparse Row format>

Potentially we can apply another transformation using the tf-idf extraction.

# Modelling

In [0]:
forest = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=0.1)

In [128]:
forest.fit(train_data_features, train["sentiment"] )


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [0]:
pred_train = forest.predict(train_data_features)

In [130]:
forest.score(train_data_features, pred_train)

1.0

In [131]:
uploaded = files.upload()
test_orig = pd.read_csv(io.StringIO(uploaded['testData.tsv'].decode('utf-8')), 
                        header=0, delimiter="\t", quoting=3)

Saving testData.tsv to testData (1).tsv


In [0]:
test = test_orig.copy()

In [0]:
# test = test_orig.sample(n=1000, random_state=1)

In [134]:
t1 = time()
clean_test_reviews = test['review'].apply(review_to_wordlist, remove_stopwords=True)
print time()-t1

162.642829895


In [0]:
test_data_features = vectorizer.transform(clean_test_reviews)

In [136]:
test_data_features

<25000x500 sparse matrix of type '<type 'numpy.int64'>'
	with 1014904 stored elements in Compressed Sparse Row format>

In [0]:
result = forest.predict(test_data_features)

In [0]:
output = pd.DataFrame({"id": test["id"], "sentiment": result})

In [0]:
output.to_csv('test_submission.csv', index=False, quoting=3)

In [0]:
files.download('test_submission.csv')