In [1]:
# spark functions 
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# helper libraries 
from collections import defaultdict
import hashlib
import numpy as np
import datetime as DT

In [3]:
def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys will be integers which represent the buckets that the
            features have been hashed to.  The value for a given key will contain the count of the
            (featureID, value) tuples that have hashed to that key.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        mapping[featureString] = int(int(hashlib.md5(featureString).hexdigest(), 16) % numBuckets)
    if(printMapping): print mapping
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)

def parseHashPoint(point, numBuckets, delim):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint with a label (0.0 or 1.0) and a SparseVector of hashed
            features.
    """
    splits = point.split(delim)
    fields = [ (i,v) for i,v in enumerate(splits[1:]) ]
    vec = SparseVector(numBuckets, hashFunction(numBuckets, fields))
    return LabeledPoint(splits[0], vec)

def computeLogLoss(p, y):
    """Calculates the value of log loss for a given probabilty and label.

    Note:
        log(0) is undefined, so when p is 0 we need to add a small value (epsilon) to it
        and when p is 1 we need to subtract a small value (epsilon) from it.

    Args:
        p (float): A probabilty between 0 and 1.
        y (int): A label.  Takes on the values 0 and 1.

    Returns:
        float: The log loss value.
    """
    epsilon = 10e-12
    if p==0:
        p+=epsilon
    elif p==1:
        p-=epsilon
    if y==1:
        return -log(p)
    elif y==0:
        return -log(1-p)
    else:
        raise Exception('y not in {0,1}')

def evaluateResults(model, data):
    """Calculates the log loss for the data given the model.

    Args:
        model (LogisticRegressionModel): A trained logistic regression model.
        data (RDD of LabeledPoint): Labels and features for each observation.

    Returns:
        float: Log loss for the data.
    """
    probs = data.map(lambda x: (getP(x.features, model.weights, model.intercept), x.label))
    logloss = probs.map(lambda x: computeLogLoss(x[0],x[1])).reduce(lambda x,y: x+y) / probs.count()
    return logloss

def getP(x, w, intercept):
    """Calculate the probability for an observation given a set of weights and intercept.

    Note:
        We'll bound our raw prediction between 20 and -20 for numerical purposes.

    Args:
        x (SparseVector): A vector with values of 1.0 for features that exist in this
            observation and 0.0 otherwise.
        w (DenseVector): A vector of weights (betas) for the model.
        intercept (float): The model's intercept.

    Returns:
        float: A probability between 0 and 1.
    """
    rawPrediction = x.dot(w)+intercept

    # Bound the raw prediction value
    rawPrediction = min(rawPrediction, 20)
    rawPrediction = max(rawPrediction, -20)
    return 1.0/(1+exp(-rawPrediction))

In [4]:
train_data_location = 's3n://criteo-dataset/rawdata/train'
test_data_location = 's3n://criteo-dataset/rawdata/test'
validation_data_location = 's3n://criteo-dataset/rawdata/validation'

In [None]:
train_data = sc.textFile(train_data_location).cache()
validation_data = sc.textFile(validation_data_location).cache()
test_data = sc.textFile(test_data_location).cache()

In [5]:
hashTrainData = train_data.map(lambda x: parseHashPoint(x, 1000, '\t')).cache()
hashValidationData = validation_data.map(lambda x: parseHashPoint(x, 1000, '\t')).cache()
hashTestData = test_data.map(lambda x: parseHashPoint(x, 1000, '\t')).cache()

In [6]:
from pyspark.mllib.classification import LogisticRegressionWithSGD

# fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 0.
regType = 'l2'
includeIntercept = True

starttime = DT.datetime.now()
model0 = LogisticRegressionWithSGD.train( data= hashTrainData
                                         , iterations= numIters
                                         , step= stepSize
                                         , regParam= regParam
                                         , regType= 'l2'
                                         , intercept= includeIntercept )
endtime = DT.datetime.now()
runtime = (endtime-starttime).seconds
runtime_hours = runtime // 3600
runtime_minutes = (runtime % 3600) // 60
runtime_seconds = (runtime % 3600) % 60
print 'job finished in {} hours, {} minutes and {} seconds'.\
        format(runtime_hours, runtime_minutes, runtime_seconds)

job finished in 0 hours, 15 minutes and 5 seconds


#### EMR Setup:
<table style='align: left'>
<tr>
<td><strong>Node type</strong></td>
<td><strong>Count</strong></td>
<td><strong>Intance</strong></td>
</tr>
<tr>
<td>Master</td>
<td>1</td>
<td>r3.xlarge</td>
</tr>
<tr>
<td>Core</td>
<td>7</td>
<td>r3.xlarge</td>
</tr>
</table>

In [9]:
# Compute raw scores on the validation set
predictionAndLabels = hashValidationData.map(lambda lp: (float(model0.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under ROC = 0.651397081537
