In [2]:
# Imports
import json, re, numpy as np, numpy.linalg as nplin, matplotlib.pyplot as plt, matplotlib.mlab as mlab, scipy.stats as spstat
from __future__ import division
from pyspark.mllib.feature import HashingTF, IDF, Normalizer
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
from operator import itemgetter
from time import time
%matplotlib inline

In [3]:
# Load data, split into test and training set
all_reviews = sc.textFile("s3n://stat-37601/ratings.json", minPartitions=1000).map(json.loads)
reviews, reviews_test = all_reviews.randomSplit([.7, .3])
reviews.cache()

PythonRDD[4] at RDD at PythonRDD.scala:42

Get the variable we are regressing on: a continious score out of 1

In [4]:
# Get the variable we are regressing on, the review as a score out of 1
def getLabel(review):
    """Get the overall rating from a review"""
    label, total = review["review_overall"].split("/")
    return float(label) / float(total)

labels      = reviews.map(getLabel)
labels_test = reviews_test.map(getLabel)

#### (a) Generating features (hashed TF-IDF)
We generate a list of words (the features) for each review, transform them into hashes, calculate term frequency-inverse document frequency accross corpus, and normalize

In [5]:
# Parser, mostly from earlier problem using SGD on tweets, except without code for emoticions

# words to ignore
stop = set(['the', 'and', 'you', 'your', 'for', 'por', 'que', 'las', 'los', 'les',\
       'una', 'del', 'este', 'usted', 'para', 'con', 'this', 'that', 'was', 'have', 'like',\
       'would', 'could', 'should', 'will', 'can', 'shall', 'just', 'all', 'it', 'its', 'per'])
eng_stop = set(['i', 'me', 'my', 'myself', 'we', 'our', \
             'ours', 'ourselves', 'you', 'your', 'yours', \
             'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', \
             'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', \
             'themselves', 'what', 'which', 'who', 'whom', 'this', \
             'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', \
             'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',\
             'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', \
             'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',\
             'by', 'for', 'with', 'about', 'against', 'between', 'into', \
             'through', 'during', 'before', 'after', \
            'above', 'below', 'to', 'from', 'up', 'down', 'in',\
            'out', 'on', 'off', 'over', 'under', 'again', 'further', \
            'then', 'once', 'here', 'there', 'when', 'where', 'why', \
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',\
            'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', \
            'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', \
            'just', 'dont', 'should', 'now','on'])
spa_stop = set()
all_stop = stop|eng_stop|spa_stop

# word processor function
def splitter(s,ignore=all_stop):
    s = re.sub("([a-zA-Z])'([a-zA-Z])","\g<1>\g<2>",s) # standardize to no apostrophe
    s = re.sub('[^a-zA-Z!\?]',' ',s)           # get rid of most punctuation 
    s = re.sub('\?![\?!]*|!\?[\?!]*',' !? ',s) # standardize ?!?!?!
    s = re.sub('!+','!',s)                    # standardize to single !
    s = re.sub('\?+','?',s)                   # standarize to single ?
    s = re.sub('([a-zA-z]{2,})([?!]+)(\s|$)','\g<1> \g<2> ',s) # single out punctuation
    s = re.sub('(?!http://)www\.\S+|http://\S+','',s) # get rid of urls
    return list([w.lower() for w in s.split() if w not in ignore])

In [6]:
# Do the hashing transform
revHTF = HashingTF(numFeatures=200)
reviewFrequency      = revHTF.transform(     reviews.map(lambda review: splitter(review["review_text"]))).cache()
review_testFrequency = revHTF.transform(reviews_test.map(lambda review: splitter(review["review_text"]))).cache()

# Do the inverse document frequency transform
revIDF = IDF().fit(reviewFrequency)
nor = Normalizer(p=2)
features      = nor.transform(revIDF.transform(reviewFrequency)).cache()
features_test = nor.transform(revIDF.transform(review_testFrequency)).cache()

# Un-cache unneeded data sets
reviewFrequency.unpersist()
review_testFrequency.unpersist()

PythonRDD[6] at RDD at PythonRDD.scala:42

In [7]:
# Join the labels back with the features
data = features.zip(labels).map(lambda (feature, label): LabeledPoint(label, feature))

#### (b) The regression model and evaluation
Write a function to compute MSE for training and test datasets

In [9]:
def treeMSE(tree,train_feat=features,train_label=labels,test_feat=features_test,test_label=labels_test):
    '''Evaluates training and test error for a pyspark mllib tree model'''
    train_MSE = train_label.zip(tree.predict(train_feat)).map(lambda (l,p):(l-p)**2).sum() / train_label.count()
    test_MSE  = test_label.zip(tree.predict(test_feat)).map(lambda (l,p):(l-p)**2).sum() / test_label.count()
    return(train_MSE,test_MSE)

####(c) train and test some trees
Test out some parameterizations of gradient boosted trees and random forrests

In [20]:
# Try with max depth 4 and max bins 25
tree_ct = [10,50,100,200]
model_set1 = []
for i in range(len(tree_ct)):
    model_set1.append(RandomForest.trainRegressor(data=data,categoricalFeaturesInfo={},numTrees=tree_ct[i],impurity='variance',maxDepth=4,maxBins=25))
    trerr,teerr = treeMSE(model_set1[i])
    print 'With %d trees, got %.4f training error MSE and %.4f testing error' % (tree_ct[i],trerr,teerr)

KeyboardInterrupt: 

In [22]:
data.first()

LabeledPoint(0.65, (200,[4,11,13,22,24,25,27,33,50,52,56,58,59,63,64,65,70,80,82,85,89,99,107,108,119,131,134,136,145,147,154,165,180,182,197],[0.141163122096,0.0848617805965,0.194507454629,0.0653701750114,0.206978944929,0.174202784802,0.178482338124,0.31602357641,0.0273267120337,0.0913483562267,0.137811956963,0.112282703109,0.189166311459,0.149165035609,0.09702746755,0.157565939941,0.202648900664,0.182884669945,0.102862297485,0.140775263599,0.146906972512,0.0952654612478,0.185043022887,0.0965378478626,0.155861932571,0.142806504543,0.422373013109,0.15792877968,0.169229685058,0.0856679785379,0.077250348088,0.190830034892,0.171815133163,0.191848073218,0.145055340388]))