##  Kaggle -- Bimbo
Demand prediction for the Mexican food company Bimbo across stores, clients, and products. Spark is used because the training data is fairly large (>3 GB uncompressed) and there are a great many combinations of Agencies, Channels, Routes, Clients, and Products. 

### Load functions and args

In [None]:
import hashlib
import numpy as np
from math import log, sqrt
from collections import defaultdict
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


def parsePoint(point, start_i, end_i, sep=','):
    """Split a delimited record and index the selected columns.

    Args:
        point (str): One delimited record.
        start_i (int): Slice start; index of the first column to keep.
        end_i (int): Slice stop; the column at this index is excluded. Negative
            values count from the end of the record and None keeps everything
            up to the last column.
        sep (str): Column delimiter. Defaults to ','.

    Returns:
        list: (featureID, value) tuples where featureID counts up from 0 over
            the kept columns and value is the raw column string.
    """
    columns = point.split(sep)
    selected = columns[start_i:end_i]
    return list(enumerate(selected))


def createOneHotDict(inputData):
    """Build the one-hot-encoding lookup table from parsed observations.

    Args:
        inputData (RDD of lists of (int, str)): Each element is one observation,
            given as a list of (featureID, value) tuples.

    Returns:
        dict: Maps every distinct (featureID, value) tuple found in inputData to
            the index assigned to it by zipWithIndex.
    """
    return (inputData
            .flatMap(lambda observation: observation)  # one record per tuple
            .distinct()
            .zipWithIndex()
            .collectAsMap())


def oneHotEncoding(rawFeats, OHEDict, numOHEFeats):
    """Encode one observation as a sparse one-hot vector.

    Note:
        (featureID, value) tuples that are not keys of OHEDict are skipped.

    Args:
        rawFeats (list of (int, str)): The (featureID, value) tuples of a single
            observation.
        OHEDict (dict): Mapping of (featureID, value) to a unique integer index.
        numOHEFeats (int): Length of the resulting vector, i.e. the number of
            distinct (featureID, value) combinations.

    Returns:
        SparseVector: A vector of length numOHEFeats holding 1 at the index of
            every known (featureID, value) tuple in rawFeats.
    """
    active = {OHEDict[feat]: 1 for feat in rawFeats if feat in OHEDict}
    return SparseVector(numOHEFeats, active)


def parseOHEPoint(point, OHEDict, numOHEFeats, start_i=1, end_i=-5, sep=',', train=True):
    """Turn one raw record into a one-hot-encoded observation.

    Note:
        The encoding is delegated to `oneHotEncoding`.

    Args:
        point (str): A delimited record. The columns in [start_i:end_i] are the
            categorical features; when train is True the last column is the label.
        OHEDict (dict of (int, str) to int): Mapping of (featureID, value) to unique integer.
        numOHEFeats (int): The number of unique features in the training dataset.
        start_i (int): Slice start of the feature columns. Defaults to 1.
        end_i (int): Slice stop of the feature columns (None keeps everything up
            to the last column). Defaults to -5.
        sep (str): Column delimiter. Defaults to ','.
        train (bool): If True return a LabeledPoint, otherwise only the features.

    Returns:
        LabeledPoint: When train is True; the label is the last column converted
            to float and floored at 0.
        SparseVector: When train is False; the one-hot-encoded features only.

    Raises:
        ValueError: If train is True and the last column is not numeric.
    """
    splits = point.split(sep)
    fields = list(enumerate(splits[start_i:end_i]))
    features = oneHotEncoding(fields, OHEDict, numOHEFeats)
    if not train:
        return features
    # The raw field is a string, so it has to be converted before it can be
    # meaningfully compared with 0; negative labels are floored at 0.
    label = max(float(splits[-1]), 0.0)
    return LabeledPoint(label, features)


def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Each (featureID, value) tuple is turned into the string value + str(featureID),
    hashed with md5, and reduced modulo numBuckets.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.
        Identical feature strings within one observation are counted once, because
        the intermediate mapping is keyed by feature string.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys are the buckets that the features have been hashed
            to.  The value for a given key is the number of distinct feature strings that
            hashed to that bucket.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        # md5 needs bytes. Byte strings are passed through untouched (so the
        # buckets are unchanged for Python 2 str input); text is UTF-8 encoded.
        if isinstance(featureString, bytes):
            digestInput = featureString
        else:
            digestInput = featureString.encode('utf-8')
        mapping[featureString] = int(int(hashlib.md5(digestInput).hexdigest(), 16) % numBuckets)
    if printMapping:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print(mapping)
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)


def parseHashPoint(point, numBuckets, start_i=1, end_i=-5, sep=',', train=True):
    """Create a hashed-feature observation from one raw record.

    Args:
        point (str): A delimited record. The columns in [start_i:end_i] are the
            categorical features; when train is True the last column is the label.
        numBuckets (int): The number of buckets to hash to.
        start_i (int): Slice start of the feature columns. Defaults to 1.
        end_i (int): Slice stop of the feature columns (None keeps everything up
            to the last column). Defaults to -5.
        sep (str): The separator of the input features.
        train (bool): If True return a LabeledPoint, otherwise only the features.

    Returns:
        LabeledPoint: When train is True; the label is the last column converted
            to float and floored at 0, the features are a SparseVector of hashed
            features.
        SparseVector: When train is False; the hashed features only.

    Raises:
        ValueError: If train is True and the last column is not numeric.
    """
    data = point.split(sep)
    indexed_features = list(enumerate(data[start_i:end_i]))
    hashed = SparseVector(numBuckets, hashFunction(numBuckets, indexed_features))
    if not train:
        return hashed
    # The raw field is a string, so it has to be converted before it can be
    # meaningfully compared with 0; negative labels are floored at 0.
    label = max(float(data[-1]), 0.0)
    return LabeledPoint(label, hashed)


def predict(x, weights, intercept):
    """Score one labeled observation with a linear model.

    Args:
        x: Observation exposing `features` (with a `dot` method) and `label`.
        weights: Model weight vector, passed to `x.features.dot`.
        intercept: Value added to the dot product.

    Returns:
        tuple: (prediction, x.label).
    """
    score = x.features.dot(weights)
    return (score + intercept, x.label)


def rmsle(preds):
    """Compute the root mean squared logarithmic error of a set of predictions.

    Args:
        preds (RDD of (prediction, label)): Pairs as produced by `predict`.

    Returns:
        float: sqrt(mean((log(p + 1) - log(label + 1)) ** 2)), where p is the
            prediction floored at 0.

    Raises:
        ValueError: If preds contains no elements.
    """
    N = preds.count()
    if N == 0:
        raise ValueError('rmsle() needs at least one (prediction, label) pair')

    def squared_log_error(pair):
        # Explicit unpacking: tuple parameters in lambdas are Python 2 only.
        p, x = pair
        # A linear model can produce negative predictions; log(p + 1) is
        # undefined for p <= -1, so predictions are floored at 0 first.
        return (log(max(p, 0.0) + 1) - log(x + 1)) ** 2

    total_sq_error = preds.map(squared_log_error).reduce(lambda a, b: a + b)
    return sqrt(total_sq_error / N)


#### declare args

In [None]:
# Directory prefix for every path below; input files are read from it and the
# submission output is written under it.
DATA_DIR = '/Data/'
# Input file names (appended to DATA_DIR when loaded).
TRAIN_FILE = 'train.csv.gz'
TEST_FILE = 'test.csv.gz'
PRODUCT_FILE = 'producto_tabla.csv.gz'
# NOTE(review): PRODUCT_FILE is spelled '..._tabla' while this one is '..._table';
# confirm the file name on disk before using it.
CLIENT_FILE = 'cliente_table.csv.gz'
# Output location for the predictions, relative to DATA_DIR.
OUTPUT_DIR = 'submission'
# The RDDs are repartitioned into NUM_CPUS * 2 partitions when loaded.
NUM_CPUS = 6

### Load data

In [None]:
import re
def skip_header(x):
    """Filter predicate that rejects lines containing any ASCII letter.

    Args:
        x (str): One line of the input file.

    Returns:
        The line itself when it contains no ASCII letter, otherwise None. An
        empty line is returned as '' and is therefore also falsy.
    """
    if re.search(r'[A-Za-z]', x) is None:
        return x
    return None

# Read the training file, drop every line that contains a letter (the header
# row), and spread the rows over two partitions per CPU.
train_raw = sc.textFile(DATA_DIR+TRAIN_FILE).filter(skip_header).repartition(NUM_CPUS*2)

# Seed the 80/20 split so that the train/validation sets -- and the
# hyper-parameters chosen from them -- can be reproduced between runs.
SPLIT_SEED = 22
train, val = train_raw.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
train.cache()
val.cache()

test = sc.textFile(DATA_DIR+TEST_FILE).filter(skip_header).repartition(NUM_CPUS*2).cache()

#### create OHE dict from training data

In [None]:
# Index the feature columns (slice 1:-5) of every training row.
parsedTrainData = train.map(lambda row: parsePoint(row, 1, -5))

# One index per distinct (featureID, value) pair seen in the training split.
oheDict = createOneHotDict(parsedTrainData)
numOHEFeats = len(oheDict)

# Encode both splits with the dictionary built from the training split.
oheTrainData = train.map(lambda row: parseOHEPoint(row, oheDict, numOHEFeats))
oheValData = val.map(lambda row: parseOHEPoint(row, oheDict, numOHEFeats))


### Ridge Linear Regression model

# Hyper-parameter grid: every step size is tried with every L2 penalty.
step_sizes = [1, 0.1, 1e-2]
regParams = [10, 1, 0.1, 1e-2]
results = []

for s in step_sizes:
    for r in regParams:
        # Short 30-iteration fit per grid cell.
        lr_model = LinearRegressionWithSGD.train(
            data=oheTrainData,
            iterations=30,
            step=s,
            regParam=r,
            regType='l2',
            intercept=True,
            initialWeights=np.ones(numOHEFeats),
            convergenceTol=1e-3,
        )
        # Score the validation split with the model fitted in this iteration.
        preds = oheValData.map(
            lambda x, w=lr_model.weights, b=lr_model.intercept: predict(x, w, b)
        )
        rm = rmsle(preds)
        results.append({'step_size': s, 'regParam': r, 'result': rm})

print(results)


In [None]:
s = 0.1  # from grid search
r = 1    # from grid search

# Refit with the chosen hyper-parameters and a larger iteration budget.
lr_model = LinearRegressionWithSGD.train(
    data=oheTrainData,
    iterations=600,
    step=s,
    regParam=r,
    regType='l2',
    intercept=True,
    initialWeights=np.ones(numOHEFeats),
    convergenceTol=1e-3,
)

# Validation error of the refitted model.
preds = oheValData.map(
    lambda x, w=lr_model.weights, b=lr_model.intercept: predict(x, w, b)
)
rm = rmsle(preds)
print('Model RMSLE: {}'.format(rm))

#### predict test data

In [None]:
# Encode the test rows. The feature columns start at index 2 and run to the end
# of the record; train=False yields bare feature vectors without a label.
oheTestData = test.map(
    lambda point: parseOHEPoint(point, oheDict, numOHEFeats,
                                start_i=2, end_i=None, train=False)
)

# The test vectors have no `.features` / `.label`, so `predict` cannot be used
# on them; apply the linear model to the vector directly. The result keeps the
# (prediction, label) tuple shape, with None as the unknown label, so that
# downstream code can keep reading the prediction from position 0.
test_preds = oheTestData.map(
    lambda x: (x.dot(lr_model.weights) + lr_model.intercept, None)
).cache()

#### save results to output directory

In [None]:
# Pair each prediction with the first column of its test row instead of a
# zipWithIndex position: `test` was repartitioned, so positions need not follow
# the file order. `test_preds` is derived from `test` through map() only, so
# both RDDs share the same partitioning and can be zipped.
# NOTE(review): assumes the first column of the test file is the row id -- confirm.
test.map(lambda line: line.split(',')[0])\
    .zip(test_preds.map(lambda x: x[0]))\
    .map(lambda x: '{},{}'.format(x[0], x[1]))\
    .saveAsTextFile(DATA_DIR+OUTPUT_DIR)

### Random Forest model

In [None]:
from pyspark.mllib.tree import RandomForest

In [None]:
# Random forest regressor on the one-hot-encoded training split. The empty
# categoricalFeaturesInfo means every feature is treated as continuous.
rf_model = RandomForest.trainRegressor(
    data=oheTrainData,
    categoricalFeaturesInfo={},
    numTrees=10,
    featureSubsetStrategy='onethird',
    maxDepth=6,
    seed=22,
)

# MLlib tree models cannot be called from inside an RDD transformation, and
# they expect feature vectors rather than LabeledPoints: predict on the RDD of
# features, then zip the predictions back with the labels.
val_predictions = rf_model.predict(oheValData.map(lambda lp: lp.features))
# rmsle() expects (prediction, label) pairs.
preds = val_predictions.zip(oheValData.map(lambda lp: lp.label))
rm = rmsle(preds)
print('Model RMSLE: {}'.format(rm))