# Word2Vec Movie Review Example

Based on code by Angela Chapman at Kaggle ([here](https://github.com/igorbrigadir/kaggle-word2vec/blob/master/baseline/Word2Vec_AverageVectors.py))

In [1]:
#!/usr/bin/env python

#  Author: Angela Chapman
#  Date: 8/6/2014
#
#  This file contains code to accompany the Kaggle tutorial
#  "Deep learning goes to the movies".  The code in this file
#  is for Parts 2 and 3 of the tutorial, which cover how to
#  train a model using Word2Vec.
#
# *************************************** #

In [16]:
import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np
from tqdm import tqdm
import time

from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility

In [17]:
# ****** Read the two training sets and the test set
#

# All three files are tab-separated with a header row; quoting=3 is passed
# through to the parser unchanged for each of them.
tsv_kwargs = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv('labeledTrainData.tsv', **tsv_kwargs)
test = pd.read_csv('testData.tsv', **tsv_kwargs)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", **tsv_kwargs)

# Report how many reviews were read from each file (100,000 in total)
review_counts = (len(train["review"]),
                 len(test["review"]),
                 len(unlabeled_train["review"]))
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [18]:
# Show the first ten labeled reviews
train.head(10)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


In [10]:
# Sentence tokenizer loaded from the NLTK data directory
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Accumulates the sentences from both review sets

review_sets = (
    ("Parsing sentences from training set", train["review"]),
    ("Parsing sentences from unlabeled set", unlabeled_train["review"]),
)

for banner, reviews in review_sets:
    print(banner)
    # NOTE(review): the short pause presumably lets the message appear
    # before the progress bar is drawn -- confirm before removing.
    time.sleep(0.250)

    for review in tqdm(reviews, position=0):
        sentences.extend(
            KaggleWord2VecUtility.review_to_sentences(review, tokenizer))

Parsing sentences from training set


100%|██████████| 25000/25000 [01:31<00:00, 271.93it/s]


Parsing sentences from unlabeled set


100%|██████████| 50000/50000 [03:05<00:00, 270.15it/s]


In [None]:
## TRAIN MODEL ###

# ****** Set parameters and train the word2vec model
#
# Configure the built-in logging module so that Word2Vec
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time).
# print is called as a function, like the other cells in this notebook;
# the Python 2 print statement is a SyntaxError under Python 3.
print("Training Word2Vec model...")
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, seed=1)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

In [19]:
# Alternative to training: load a previously saved model from disk
saved_model_path = '300features_40minwords_10context'
model = Word2Vec.load(saved_model_path)

In [None]:
# A notebook cell only displays the value of its last expression, so each
# query result is printed explicitly; otherwise all but the final
# most_similar call would be computed and silently thrown away.

# Odd-one-out queries
for words in ("man woman child kitchen",
              "damon affleck hanks",
              "france england germany berlin",
              "paris berlin london austria"):
    print(words, "->", model.doesnt_match(words.split()))

# Nearest neighbours of single words
for word in ("man", "queen", "awful", "research", "affleck"):
    print(word, "->", model.most_similar(word))

# Analogy query: woman + king - man
print("woman + king - man ->",
      model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

In [26]:
# Similarity score the model assigns to this word pair
word_a, word_b = "talk", "speak"
model.similarity(word_a, word_b)

0.32553320908663158

In [29]:
# Similarity score the model assigns to this word pair
word_a, word_b = "talk", "criticize"
model.similarity(word_a, word_b)

0.1396777030401965

In [None]:
# ****** Create average vectors for the training and test sets
#
# NOTE(review): getAvgFeatureVecs and getCleanReviews are neither defined nor
# imported in the cells visible here, and num_features is only assigned in the
# training cell -- confirm all three are in scope before running this cell.
#
# print is called as a function throughout, like the executed cells of this
# notebook; the Python 2 print statement is a SyntaxError under Python 3.
print("Creating average feature vecs for training reviews")

trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), model, num_features)

print("Creating average feature vecs for test reviews")

testDataVecs = getAvgFeatureVecs(getCleanReviews(test), model, num_features)


# ****** Fit a random forest to the training set, then make predictions
#
# Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier(n_estimators=100)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, train["sentiment"])

# Test & extract results
result = forest.predict(testDataVecs)

# Write the test results: one row per test review id with its predicted label
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)
print("Wrote Word2Vec_AverageVectors.csv")