##  Kaggle -- Bimbo
Demand prediction for the Mexican food company Bimbo across its store, client, and product combinations. Spark is used because the training data is fairly large (>3 GB uncompressed) and there are a great many combinations of the different Agencies, Channels, Routes, Clients, and Products.

In [None]:
import os
import sys #current as of 9/26/2015
import glob

# Point this session at the local Spark install and publish it through the
# environment so that pyspark's own start-up code can find it.
spark_home = os.environ['SPARK_HOME'] = '/opt/spark16'

spark_python = os.path.join(spark_home, 'python')

# The py4j archive bundled with Spark changes name from release to release,
# so look for whichever one this install ships instead of assuming a version.
# Fall back to the previously hard-coded name if nothing is found.
py4j_candidates = sorted(
    glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip'))
)
if py4j_candidates:
    py4j_zip = py4j_candidates[-1]
else:
    py4j_zip = os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip')

sys.path.insert(0, spark_python)
sys.path.insert(0, py4j_zip)

# Run pyspark's interactive start-up script in this namespace; the cells
# below rely on the `sc` SparkContext it is expected to define.
# NOTE: execfile() exists only on Python 2.
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

### Load functions and args

#### load functions 

In [None]:
import hashlib
import numpy as np
from math import log, sqrt
from collections import defaultdict
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Each (featureID, value) tuple is turned into the string ``value + str(featureID)``,
    hashed with MD5, and reduced modulo ``numBuckets`` to pick a bucket.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.
        Identical (featureID, value) tuples are only counted once, because the intermediate
        mapping is keyed by the feature string.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys will be integers which represent the buckets that the
            features have been hashed to.  The value for a given key will contain the count of the
            distinct feature strings that have hashed to that key.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        # md5 operates on bytes.  Byte strings are hashed as they are (so existing
        # Python 2 results are unchanged); text strings are encoded explicitly so that
        # non-ASCII values and Python 3 strings no longer raise.
        if isinstance(featureString, bytes):
            encoded = featureString
        else:
            encoded = featureString.encode('utf-8')
        mapping[featureString] = int(int(hashlib.md5(encoded).hexdigest(), 16) % numBuckets)
    if printMapping:
        # Single-argument call form prints the same text on Python 2 and Python 3.
        print(mapping)
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)


def parseHashPoint(point, numBuckets, start_i=1, end_i=-5, sep=',', train=True):
    """Turn one delimited text record into hashed features (optionally with its label).

    The record is split on ``sep``; the fields in ``data[start_i:end_i]`` are the
    categorical features and are hashed into ``numBuckets`` buckets.  When ``train``
    is true the LAST field of the record is used as the label.

    Args:
        point (str): One ``sep``-separated record.
        numBuckets (int): The number of buckets to hash to.
        start_i (int, optional): Index of the first field used as a feature.
        end_i (int or None, optional): Slice end (exclusive) of the feature fields;
            ``None`` means "through the last field".
        sep (str, optional): The separator of the input fields.
        train (bool, optional): If true return a LabeledPoint, otherwise return only
            the feature vector.

    Returns:
        LabeledPoint: when ``train`` is true, the numeric value of the last field paired
            with a SparseVector of hashed features.
        SparseVector: when ``train`` is false, the hashed features alone.

    Raises:
        ValueError: if ``train`` is true and the last field is not a number.
    """
    data = point.split(sep)
    # The position inside the selected slice serves as the feature ID.
    indexed_features = list(enumerate(data[start_i:end_i]))
    hashed = SparseVector(numBuckets, hashFunction(numBuckets, indexed_features))
    if not train:
        return hashed
    # Convert explicitly so that a malformed label fails here with a clear error.
    return LabeledPoint(float(data[-1]), hashed)


def predict(x, weights, intercept):
    """Score one feature vector with a linear model.

    Args:
        x: Feature vector exposing a ``dot`` method.
        weights: Model coefficients, passed to ``x.dot``.
        intercept: Constant term added to the dot product.

    Returns:
        tuple: ``(prediction, x)`` -- the score together with the untouched input vector.
    """
    linear_term = x.dot(weights)
    return (linear_term + intercept, x)


def rmsle(preds):
    """Compute the root mean squared logarithmic error of a set of predictions.

    Args:
        preds: RDD of ``(actual, predicted)`` numeric pairs.

    Returns:
        float: ``sqrt(mean((log(predicted + 1) - log(actual + 1)) ** 2))``.

    Raises:
        ValueError: if ``preds`` contains no elements.

    Note:
        Predictions below zero are floored at zero before taking the logarithm;
        otherwise any prediction <= -1 would abort the job with a math domain error.
    """
    def _squared_log_error(pair):
        # Explicit unpacking instead of a tuple-parameter lambda (Python 2 only syntax).
        actual, predicted = pair
        return (log(max(predicted, 0.0) + 1) - log(actual + 1)) ** 2

    # One pass over the data collecting (sum of errors, number of rows).  The
    # result of an RDD action is a plain Python value, so there is nothing to collect().
    total, n = preds.map(_squared_log_error).aggregate(
        (0.0, 0),
        lambda acc, err: (acc[0] + err, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1])
    )
    if n == 0:
        raise ValueError('rmsle() needs at least one (actual, predicted) pair')
    return sqrt(total / n)


#### declare args

In [None]:
# Degree of parallelism; the loading cell uses it to size its repartition calls.
NUM_CPUS = 4

# Directory that the train/test input names and the output name are appended to.
DATA_DIR = 'Data/'

# Main inputs.
TRAIN_FILE = 'train.csv.gz'
TEST_FILE = 'test.csv.gz'

# Lookup tables.
PRODUCT_FILE = 'producto_tabla.csv.gz'
# NOTE(review): spelled "table" here but "tabla" in PRODUCT_FILE -- confirm the
# name of the file on disk.
CLIENT_FILE = 'cliente_table.csv.gz'

# Name of the folder (under DATA_DIR) that receives the predictions.
OUTPUT_DIR = 'submission'

### Load data

In [None]:
import re
def skip_header(x):
    """Filter predicate that keeps data rows and rejects header rows.

    A line counts as a header when it contains at least one ASCII letter.

    Args:
        x (str): One line of text.

    Returns:
        The line itself when it contains no ASCII letter, otherwise ``None``.
        An empty line is returned unchanged and is therefore falsy as well.
    """
    has_letters = re.search(r'[A-Za-z]', x) is not None
    return None if has_letters else x

# Training file: drop the header line, then spread the rows over two
# partitions per CPU.
train_raw = (
    sc.textFile(DATA_DIR + TRAIN_FILE)
    .filter(skip_header)
    .repartition(NUM_CPUS * 2)
)

# Hold out roughly 20% of the training rows for validation; both parts are
# reused by later cells, so keep them cached.
train, val = train_raw.randomSplit([0.8, 0.2])
train.cache()
val.cache()

# Test file: same treatment, cached straight away.
test = (
    sc.textFile(DATA_DIR + TEST_FILE)
    .filter(skip_header)
    .repartition(NUM_CPUS * 2)
    .cache()
)

#### create hashed training and validation data

In [None]:
# Training rows become LabeledPoints (label + hashed features in 2**15 buckets).
hashTrainData = train.map(lambda line: parseHashPoint(line, 2**15))
hashTrainData.cache()

# Validation rows are hashed the same way but keep only the feature vector
# (train=False drops the label).
hashValData = val.map(lambda line: parseHashPoint(line, 2**15, train=False))
hashValData.cache()

### Ridge (L2-regularized) Linear Regression model

In [None]:
# Settings for the SGD-trained linear regression, gathered in one place.
sgd_options = {
    'iterations': 30,
    'step': 10.0,
    'miniBatchFraction': 0.75,
    'regType': 'l2',        # L2 (ridge) penalty; 'l1' would be lasso
    'regParam': 0.001,
    'convergenceTol': 1e-5,
    'intercept': True,
}

lr_model = LinearRegressionWithSGD.train(data=hashTrainData, **sgd_options)

In [None]:
# Read the model parameters once so the closures below capture two plain
# values rather than the whole model object.
model_weights = lr_model.weights
model_intercept = lr_model.intercept

# hashValData holds feature vectors only, so take the true demand (last
# field) from the raw validation lines.
val_labels = val.map(lambda line: float(line.split(',')[-1]))

# predict() returns (prediction, features); only the prediction is needed.
val_predictions = hashValData.map(
    lambda x: predict(x, model_weights, model_intercept)[0]
)

# hashValData is a plain map over `val`, so both RDDs have the same layout
# and can be zipped into the (actual, predicted) pairs that rmsle() expects.
preds = val_labels.zip(val_predictions)

# Score once and reuse the value (scoring is a full pass over the data).
rm = rmsle(preds)
print('Val RMSLE: {}'.format(rm))

#### predict test data

In [None]:
# Test rows carry no label: features start at field 2 and run through the
# last field (end_i=None), and only the feature vector is returned.
hashTestData = test.map(lambda x: parseHashPoint(x,
                                                 2**15,
                                                 start_i=2,
                                                 end_i=None,
                                                 train=False))

# Score every test row; each element is a (prediction, features) tuple.
# (The attribute is `intercept`; the misspelling used before would have raised
# AttributeError as soon as this RDD was evaluated.)
test_preds = hashTestData.map(
    lambda x: predict(x,
                      lr_model.weights,
                      lr_model.intercept)
).cache()

#### save results to output directory

In [None]:
# Label each prediction with the identifier from its own test row.  A running
# index over the repartitioned RDD says nothing about which row a prediction
# belongs to.
# NOTE(review): assumes the first field of every test line is the submission
# id -- confirm against the header of the test file.
test_ids = test.map(lambda line: line.split(',')[0])

# Keep only the prediction from each (prediction, features) tuple and floor
# it at zero, since a negative demand is not a meaningful forecast.
predicted_demand = test_preds.map(lambda x: max(x[0], 0.0))

# test_preds is derived from `test` through map() calls only, so both RDDs
# share the same partitioning and can be zipped element by element.
(test_ids.zip(predicted_demand)
         .map(lambda row: '{},{}'.format(row[0], row[1]))
         .saveAsTextFile(DATA_DIR + OUTPUT_DIR))