In [67]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download("punkt")
from gensim.models import word2vec
from gensim.models import Word2Vec

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import describe

from sklearn.cluster import KMeans
import time

import logging

%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Read in Data

In [14]:
# Labeled training reviews: tab-separated file whose first row is the header.
# quoting=3 (csv.QUOTE_NONE) makes pandas treat double quotes inside the
# reviews as ordinary characters instead of field delimiters.
train = pd.read_csv(
    "../data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding="utf-8",
)

In [15]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
train.shape

(25000, 3)

In [16]:
# Unlabeled reviews, read with the same TSV settings as the labeled set
# (tab-separated, header row, quotes kept as literal characters).
unlabeled_train = pd.read_csv(
    "../data/unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding="utf-8",
)
unlabeled_train.shape

(50000, 2)

In [17]:
# Test reviews, read with the same TSV settings as the training files.
test = pd.read_csv(
    "../data/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding="utf-8",
)
test.shape

(25000, 2)

# Data Cleaning


## Review to Words

In [11]:
# Cache of stop-word sets keyed by language, filled on first use so the NLTK
# corpus is read once instead of once per review.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language="english"):
    # Return NLTK's stop words for `language` as a frozenset (fast membership tests).
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def review_to_wordlist(raw_review, remove_stopwords=False):
    """Convert one raw review into a list of lower-case word tokens.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, English stop words (NLTK list) are dropped from the result.

    Returns
    -------
    list of str
        The words of the review, in order. Note this is a list, not a joined
        string.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed, and so bs4
    #    does not warn about a missing parser argument.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace everything that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace.
    words = letters_only.lower().split()

    # 4. Optionally drop stop words (set lookup, loaded once and cached).
    if remove_stopwords:
        stops = _stopword_set("english")
        words = [w for w in words if w not in stops]

    return words

## Review to Sentences

In [10]:
# Load NLTK's pre-trained English Punkt sentence tokenizer
# (the "punkt" resource downloaded in the import cell).
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [12]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    `tokenizer` must offer a `tokenize(text)` method returning the sentences
    as strings. Each non-empty sentence is cleaned with `review_to_wordlist`,
    and `remove_stopwords` is forwarded to it unchanged.

    Returns a list with one list of words per sentence.
    """
    raw_sentences = tokenizer.tokenize(review.strip())

    # Empty sentences are skipped rather than producing empty word lists.
    return [
        review_to_wordlist(raw, remove_stopwords)
        for raw in raw_sentences
        if len(raw) > 0
    ]

## Prepare Sentences

In [21]:
# Build the Word2Vec training corpus: every sentence of every labeled and
# unlabeled review, each sentence being a list of words (stop words kept).
sentences = []

for banner, frame in (
    ("Parsing Training Data", train),
    ("Parsing Unlabeled Training Data", unlabeled_train),
):
    print(banner)
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))


Parsing Training Data
Parsing Unlabeled Training Data


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [27]:
len(sentences)

795538

## Prepare Training Data for Bag of Words

In [25]:
# Clean every labeled training review, reporting progress every 1000 reviews.
clean_train_reviews = []
nreviews = train["review"].size

# The loop used to call range() with no argument (a TypeError) and a helper
# named review_to_words that is not defined in this notebook; the cleaning
# helper defined above is review_to_wordlist.
# NOTE(review): review_to_wordlist returns a list of words; if these reviews
# are meant to feed a CountVectorizer, join each list with " " first.
for i, review in enumerate(train["review"]):
    if i % 1000 == 0:
        print("Processing review {}/{}".format(i, nreviews))
    clean_train_reviews.append(review_to_wordlist(review))

In [26]:
len(clean_train_reviews)

25000

# Word2Vec Training

In [24]:
# Show INFO-level log records in the notebook output, each prefixed with a
# timestamp and the level name (the Word2Vec training progress is reported this way).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

In [25]:
# Word2Vec hyper-parameters used by the training cell below
num_features = 300    # Dimensionality of each word vector
min_word_count = 40   # Words occurring fewer times than this are dropped from the vocabulary
num_workers = 8       # Number of worker threads used for training
context = 10          # Context window size
downsampling = 1e-3   # Downsampling threshold for very frequent words

In [26]:
print("Training Word2Vec Model...")
# NOTE(review): `size` is the keyword of the gensim release this notebook was
# run with; newer gensim releases call it `vector_size` -- confirm against the
# installed version before re-running.
model = Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# If no further training is planned, the vectors can be normalised in place
# to save memory:
# model.init_sims(replace=True)

2017-11-11 10:14:40,266 : INFO : collecting all words and their counts
2017-11-11 10:14:40,268 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-11-11 10:14:40,398 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types


Training Word2Vec Model...


2017-11-11 10:14:40,492 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2017-11-11 10:14:40,571 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2017-11-11 10:14:40,661 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2017-11-11 10:14:40,762 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2017-11-11 10:14:40,848 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2017-11-11 10:14:40,935 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2017-11-11 10:14:41,023 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2017-11-11 10:14:41,111 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2017-11-11 10:14:41,189 : INFO : PROGRESS: at sentence #100000, processed 2226967 words, keeping 50

2017-11-11 10:14:46,823 : INFO : PROGRESS: at sentence #740000, processed 16553079 words, keeping 119668 word types
2017-11-11 10:14:46,910 : INFO : PROGRESS: at sentence #750000, processed 16771406 words, keeping 120295 word types
2017-11-11 10:14:46,995 : INFO : PROGRESS: at sentence #760000, processed 16990810 words, keeping 120930 word types
2017-11-11 10:14:47,084 : INFO : PROGRESS: at sentence #770000, processed 17217947 words, keeping 121703 word types
2017-11-11 10:14:47,175 : INFO : PROGRESS: at sentence #780000, processed 17448093 words, keeping 122402 word types
2017-11-11 10:14:47,265 : INFO : PROGRESS: at sentence #790000, processed 17675169 words, keeping 123066 word types
2017-11-11 10:14:47,315 : INFO : collected 123504 word types from a corpus of 17798270 raw words and 795538 sentences
2017-11-11 10:14:47,315 : INFO : Loading a fresh vocabulary
2017-11-11 10:14:47,432 : INFO : min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2017-11-11 10:

2017-11-11 10:15:52,493 : INFO : PROGRESS: at 79.85% examples, 788386 words/s, in_qsize 16, out_qsize 0
2017-11-11 10:15:53,510 : INFO : PROGRESS: at 81.11% examples, 788374 words/s, in_qsize 15, out_qsize 0
2017-11-11 10:15:54,517 : INFO : PROGRESS: at 82.34% examples, 788151 words/s, in_qsize 15, out_qsize 3
2017-11-11 10:15:55,523 : INFO : PROGRESS: at 83.61% examples, 788180 words/s, in_qsize 14, out_qsize 1
2017-11-11 10:15:56,531 : INFO : PROGRESS: at 84.86% examples, 788162 words/s, in_qsize 16, out_qsize 0
2017-11-11 10:15:57,533 : INFO : PROGRESS: at 86.14% examples, 788517 words/s, in_qsize 15, out_qsize 0
2017-11-11 10:15:58,540 : INFO : PROGRESS: at 87.42% examples, 788823 words/s, in_qsize 15, out_qsize 0
2017-11-11 10:15:59,540 : INFO : PROGRESS: at 88.65% examples, 788765 words/s, in_qsize 16, out_qsize 2
2017-11-11 10:16:00,541 : INFO : PROGRESS: at 89.88% examples, 788733 words/s, in_qsize 16, out_qsize 0
2017-11-11 10:16:01,550 : INFO : PROGRESS: at 91.11% examples, 7

In [28]:
# Persist the trained model so later sessions can skip training;
# reload it with Word2Vec.load(model_name).
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-11-11 10:17:01,919 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2017-11-11 10:17:01,921 : INFO : not storing attribute syn0norm
2017-11-11 10:17:01,923 : INFO : not storing attribute cum_table
2017-11-11 10:17:02,145 : INFO : saved 300features_40minwords_10context


### playing with model

In [29]:
model.doesnt_match("man woman child kitchen".split())

2017-11-11 10:17:45,770 : INFO : precomputing L2-norms of word weight vectors


'kitchen'

In [30]:
model.doesnt_match("france england germany berlin".split())

'berlin'

In [31]:
model.doesnt_match("paris berlin london austria".split())

'paris'

In [32]:
model.most_similar("man")

[(u'woman', 0.6210911870002747),
 (u'lady', 0.5788845419883728),
 (u'lad', 0.5675628185272217),
 (u'monk', 0.5514678955078125),
 (u'guy', 0.523241400718689),
 (u'men', 0.5219612121582031),
 (u'millionaire', 0.5186538696289062),
 (u'farmer', 0.5138608813285828),
 (u'doctor', 0.5085681676864624),
 (u'soldier', 0.5004254579544067)]

In [33]:
model.most_similar("queen")

[(u'princess', 0.6849184632301331),
 (u'stepmother', 0.6160538792610168),
 (u'bride', 0.6057089567184448),
 (u'belle', 0.6033458113670349),
 (u'diana', 0.5968631505966187),
 (u'victoria', 0.5925002098083496),
 (u'ling', 0.5852207541465759),
 (u'goddess', 0.5832862257957458),
 (u'mistress', 0.5803508758544922),
 (u'maria', 0.580348014831543)]

In [34]:
model.most_similar("awful")

[(u'terrible', 0.7664473056793213),
 (u'atrocious', 0.7393538355827332),
 (u'horrible', 0.7315835356712341),
 (u'abysmal', 0.728158175945282),
 (u'dreadful', 0.7171405553817749),
 (u'horrendous', 0.6993017196655273),
 (u'appalling', 0.6877903938293457),
 (u'horrid', 0.6677767634391785),
 (u'lousy', 0.6274197697639465),
 (u'amateurish', 0.6212085485458374)]

# PART 3

# From Words To Paragraphs, Attempt 1: Vector Averaging

## Read in Model (if necessary)

In [39]:
# Reload the saved model from disk into `model0`.
# NOTE(review): the cells that follow keep using `model` (the in-memory model
# from the training cell), not `model0`. On a fresh kernel that skips
# training, use the commented line instead so that `model` is defined.
#model = Word2Vec.load("300features_40minwords_10context")
model0 = Word2Vec.load("300features_40minwords_10context")


2017-11-12 16:02:08,369 : INFO : loading Word2Vec object from 300features_40minwords_10context
2017-11-12 16:02:08,469 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2017-11-12 16:02:08,470 : INFO : setting ignored attribute syn0norm to None
2017-11-12 16:02:08,471 : INFO : setting ignored attribute cum_table to None
2017-11-12 16:02:08,472 : INFO : loaded 300features_40minwords_10context


In [41]:
model0.init_sims(replace=True)

2017-11-12 16:03:52,787 : INFO : precomputing L2-norms of word weight vectors


In [44]:
type(model.syn0_lockf)

numpy.ndarray

In [49]:
model.syn1neg.shape

(16490, 300)

In [50]:
model["flower"].shape

(300,)

## Create Feature Vectors for each review

In [62]:
def avgFeatureVec(words, model, num_features, vocab=None):
    """Average the word vectors of all in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    model : trained Word2Vec model
        Vectors are read from `model.wv[word]`.
    num_features : int
        Length of each word vector (and of the returned vector).
    vocab : set of str, optional
        Pre-built set of the model's vocabulary. When omitted it is built
        from `model.wv.index2word`; pass it in when calling this repeatedly
        so the set is not rebuilt for every review.

    Returns
    -------
    numpy.ndarray, shape (num_features,), dtype float32
        Mean of the word vectors. If no word of the review is in the
        vocabulary the zero vector is returned (instead of the NaN vector a
        0/0 division would produce, which the classifier cannot handle).
    """
    if vocab is None:
        vocab = set(model.wv.index2word)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in vocab:
            nwords += 1
            featureVec += model.wv[word]

    # Only divide when something was accumulated; otherwise keep the zeros.
    if nwords:
        featureVec /= nwords
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    `reviews` is a sequence of word lists. Returns a float32 array of shape
    (len(reviews), num_features). Progress is printed every 1000 reviews.
    """
    # Build the vocabulary set once instead of once per review.
    vocab = set(model.wv.index2word)

    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")

    for counter, review in enumerate(reviews):
        if counter % 1000 == 0:
            print("Processing review {} of {}".format(counter, len(reviews)))

        reviewFeatureVecs[counter] = avgFeatureVec(review, model, num_features, vocab)

    return reviewFeatureVecs

In [64]:
# ****************************************
# Calculate avg feature vectors for both training and test sets.
# Stop words are removed here so they do not dominate the averages.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

# print() as a function call: the bare `print "..."` statement is a
# SyntaxError on Python 3 and inconsistent with the rest of the notebook.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Processing review 0 of 25000
Processing review 1000 of 25000
Processing review 2000 of 25000
Processing review 3000 of 25000
Processing review 4000 of 25000
Processing review 5000 of 25000
Processing review 6000 of 25000
Processing review 7000 of 25000
Processing review 8000 of 25000
Processing review 9000 of 25000
Processing review 10000 of 25000
Processing review 11000 of 25000
Processing review 12000 of 25000
Processing review 13000 of 25000
Processing review 14000 of 25000
Processing review 15000 of 25000
Processing review 16000 of 25000
Processing review 17000 of 25000
Processing review 18000 of 25000
Processing review 19000 of 25000
Processing review 20000 of 25000
Processing review 21000 of 25000
Processing review 22000 of 25000
Processing review 23000 of 25000
Processing review 24000 of 25000
Creating average feature vecs for test reviews
Processing review 0 of 25000
Processing review 1000 of 25000
Processing review 2000 of 25000
Processing review 3000 of 25000
Processing revie

In [65]:
# Fit a 100-tree random forest on the averaged word vectors.
# oob_score=True makes the fitted forest expose an out-of-bag score.
# NOTE(review): no random_state is passed, so results vary between runs.
forest = RandomForestClassifier(n_estimators=100, oob_score=True)
forest.fit(trainDataVecs, train["sentiment"])

# Predict the test set and write the id/sentiment submission file.
result = forest.predict(testDataVecs)
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)

In [66]:
forest.oob_score_

0.82235999999999998

# From Words to Paragraphs, Attempt 2: Clustering 

## Creating Clusters

In [70]:
# One row per vocabulary word.
word_vectors = model.wv.syn0
# Aim for roughly 10 words per cluster. Floor division keeps the result an
# int on Python 3 as well: `/` would give a float there, which neither
# KMeans(n_clusters=...) nor np.zeros(...) accepts as a size.
num_clusters = word_vectors.shape[0] // 10
print(num_clusters)

1649


In [71]:
# Cluster the word vectors with k-means and report the wall-clock time taken.
# NOTE(review): no random_state is passed, so cluster ids can differ between runs.
start = time.time()

# idx holds one cluster label per row of word_vectors
kmeans_clustering = KMeans(n_clusters = num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

elapsed = time.time() - start
print("Time Elapsed: {}".format(elapsed))

Time Elapsed: 443.268747091


In [72]:
len(idx)

16490

In [75]:
# Look-up table: vocabulary word -> id of the k-means cluster it was assigned to
word_centroid_map = {
    word: cluster for word, cluster in zip(model.wv.index2word, idx)
}

## Peeking at a couple of clusters

In [78]:
# Tabular view of the word -> cluster assignment ("wrd" = word, "idx" = cluster id),
# convenient for inspecting individual clusters.
word_centroid_df = pd.DataFrame({"wrd":model.wv.index2word, "idx":idx})

In [83]:
word_centroid_df.head()

Unnamed: 0,idx,wrd
0,142,the
1,1023,and
2,1064,a
3,1293,of
4,731,to


In [82]:
word_centroid_df.values[:5,:5]

array([[142, u'the'],
       [1023, u'and'],
       [1064, u'a'],
       [1293, u'of'],
       [731, u'to']], dtype=object)

In [84]:
# Print the member words of the first ten clusters.
for c in range(10):
    print("Cluster{}".format(c))

    in_cluster = word_centroid_df["idx"] == c
    print(word_centroid_df.loc[in_cluster, "wrd"].values)

Cluster0
[u'within']
Cluster1
[u'minus' u'btw' u'mega' u'programme' u'clerks' u'newer' u'caper'
 u'paycheck' u'novelty' u'skit' u'imo' u'gentlemen' u'cheers' u'recipe'
 u'hallmark' u'sleeper' u'sonic' u'bio' u'watcher' u'shocker' u'pixar'
 u'bogus' u'noting' u'mimic' u'queer' u'peanuts' u'thirteen' u'csi'
 u'hound' u'demographic' u'filipino' u'congratulations' u'crop' u'pic'
 u'muppets' u'voyager' u'hrs' u'mockumentary' u'draft' u'marginally'
 u'copying' u'splash' u'slew' u'smallville' u'keystone' u'consisted'
 u'continuation' u'deadwood' u'pauly' u'telly' u'cliffhanger' u'blackadder'
 u'ff' u'sans' u'slot' u'hanna' u'monstrosity' u'scoring' u'clunker'
 u'maury' u'futurama' u'relaxing' u'newest' u'preferably' u'kindergarten'
 u'contender' u'math' u'wb' u'pinhead' u'xxx' u'escapist' u'fondly'
 u'yankees' u'vanilla' u'ranked' u'indy' u'tagline' u'beckham' u'viva'
 u'heavens' u'miracles' u'gong' u'addams' u'groove' u'watchers'
 u'bruckheimer' u'contestant' u'diversion' u'grader' u'thou' u

## create centroid count vectors

In [85]:
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of a review fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of a single review.
    word_centroid_map : dict
        Maps a vocabulary word to its integer cluster id.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is inferred as the
        highest cluster id in `word_centroid_map` plus one (0 for an empty
        map). Pass the real cluster count to avoid rescanning the map on
        every call and to keep the length fixed even if the highest-numbered
        cluster happens to have no word.

    Returns
    -------
    numpy.ndarray, shape (num_centroids,), dtype float32
        Entry k is the number of words in `wordlist` assigned to cluster k.
        Words missing from the map are ignored.
    """
    if num_centroids is None:
        # max() raises on an empty sequence, so an empty map is handled explicitly.
        if word_centroid_map:
            num_centroids = max( word_centroid_map.values() ) + 1
        else:
            num_centroids = 0

    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )

    # For every known word, increment the count of the cluster it belongs to.
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1

    return bag_of_centroids

In [86]:
# Bag-of-centroids matrices: one row per review, one column per cluster.
# Both arrays are pre-allocated and then filled row by row.
train_centroids = np.zeros( (train["review"].size, num_clusters), dtype="float32" )
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids( review, word_centroid_map )

# Same transformation for the test reviews
test_centroids = np.zeros(( test["review"].size, num_clusters), dtype="float32" )
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids( review, word_centroid_map )

In [87]:
# Fit a 100-tree random forest on the bag-of-centroids features.
# NOTE(review): no random_state is passed, so results vary between runs.
forest = RandomForestClassifier(n_estimators = 100, oob_score = True)

forest.fit(train_centroids, train["sentiment"])

# Predict the test set
result = forest.predict(test_centroids)

# Write the id/sentiment submission file
output = pd.DataFrame({"id":test["id"], "sentiment":result})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)

In [88]:
print(forest.oob_score_)

0.83036
