In [74]:
from collections import namedtuple, defaultdict
import hashlib

import numpy as np
import pandas as pd

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# allow long frames to be shown in full when inspecting data samples
pd.set_option('display.max_rows', 5000)

In [75]:
def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Each (featureID, value) tuple is turned into the string ``value + str(featureID)``,
    hashed with MD5 and reduced modulo ``numBuckets``.  Identical feature strings are only
    counted once because the intermediate mapping is keyed by the feature string.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.
        The feature string is a plain concatenation, so e.g. value '1' with featureID 12 and
        value '11' with featureID 2 both become '112' and are treated as the same feature.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys will be integers which represent the buckets that the
            features have been hashed to.  The value for a given key will contain the count of the
            (featureID, value) tuples that have hashed to that key.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        # hashlib digests bytes; text strings are encoded explicitly so the function works on
        # Python 3 and with non-ASCII unicode values, while byte strings are hashed unchanged.
        if isinstance(featureString, bytes):
            encoded = featureString
        else:
            encoded = featureString.encode('utf-8')
        # the outer int() keeps bucket ids plain ints (Python 2 would otherwise yield longs)
        mapping[featureString] = int(int(hashlib.md5(encoded).hexdigest(), 16) % numBuckets)
    if printMapping:
        # single-argument call form prints the same text under Python 2 and Python 3
        print(mapping)
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)


def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint whose label is the first field converted to float and whose
            features are a SparseVector of size numBuckets built from the hashed fields.

    Raises:
        ValueError: If the first field cannot be converted to a float.
    """
    splits = point.split(',')
    # convert the label up front so a malformed row fails before any hashing is done
    label = float(splits[0])
    # pair every feature value with its column position: (featureID, value)
    fields = list(enumerate(splits[1:]))
    # SparseVector comes from pyspark.mllib.linalg (imported with the other dependencies)
    vec = SparseVector(numBuckets, hashFunction(numBuckets, fields))
    return LabeledPoint(label, vec)

In [93]:
# lightweight record types so the three data sources can be addressed by name
Files = namedtuple('filelist', 'categorical date numeric')
Header = namedtuple('header', 'categorical date numeric')
Data = namedtuple('data', 'categorical date numeric outcome')

# locations of the gzipped train / test CSV files
train_files = Files(
    categorical='./Data/train_categorical.csv.gz',
    date='./Data/train_date.csv.gz',
    numeric='./Data/train_numeric.csv.gz',
)
test_files = Files(
    categorical='./Data/test_categorical.csv.gz',
    date='./Data/test_date.csv.gz',
    numeric='./Data/test_numeric.csv.gz',
)

In [94]:
# raw text RDDs, one per source file, each marked for caching; no outcome at this stage
train_raw = Data(
    categorical=sc.textFile(train_files.categorical).cache(),
    date=sc.textFile(train_files.date).cache(),
    numeric=sc.textFile(train_files.numeric).cache(),
    outcome=None,
)
test_raw = Data(
    categorical=sc.textFile(test_files.categorical).cache(),
    date=sc.textFile(test_files.date).cache(),
    numeric=sc.textFile(test_files.numeric).cache(),
    outcome=None,
)


# column names
def get_header(x):
    """Return the column names of the CSV file at path ``x`` as a list (reads zero data rows)."""
    return list(pd.read_csv(x, nrows=0).columns.values)


train_headers = Header(
    categorical=get_header(train_files.categorical),
    date=get_header(train_files.date),
    numeric=get_header(train_files.numeric),
)
test_headers = Header(
    categorical=get_header(test_files.categorical),
    date=get_header(test_files.date),
    numeric=get_header(test_files.numeric),
)

In [114]:
# predicate used to drop the header row, whose first field is the literal 'Id'
def remove_header(x):
    """Return True for every line whose first comma-separated field is not 'Id'."""
    first_field = x.partition(',')[0]
    return first_field != 'Id'


# return outcome from numeric data
def get_outcome(kv):
    """Map a (key, fields) pair to (key, outcome).

    Args:
        kv (tuple): A (key, fields) pair where fields is a sequence of strings.

    Returns:
        tuple: (key, int(fields[-1])) -- the last field parsed as an integer.

    Raises:
        ValueError: If the last field is not an integer literal.
    """
    # unpack inside the body: tuple parameters in the signature are Python 2-only syntax
    k, v = kv
    return (k, int(v[-1]))


# separate the leading id from the remaining fields of a CSV line
def id_split(x):
    """Return (id, fields): the first comma-separated value as int and the rest as a list of str."""
    parts = x.split(',')
    return (int(parts[0]), parts[1:])
def key_split(x, headers):
    ''' Split the id from the fields of one CSV line and emit one record per field.

        Args:
            x (str): a comma separated data row whose first value is the integer id.
            headers (list of str): column names of the whole row, including the
                leading id column (the form returned by get_header).

        Yields:
            tuple: (column name, (id, [field value])) for every non-id field.
    '''
    parts = x.split(',')
    id_int = int(parts[0])
    fields = parts[1:]
    for i, f in enumerate(fields):
        # fields starts after the id column, so its position i corresponds to
        # headers[i + 1]; headers[i] would label every value with the previous column
        h = headers[i + 1]
        yield (h, (id_int, [f]))


# gather numeric features
def convert_numeric(x):
    ''' Convert the string value of one exploded record to a float.

        Args:
            x (tuple): a (key, (id, [value])) record where value is a string.

        Returns:
            tuple: (key, (id, [number])) where number is float(value), or 0.0 when
                value is the empty string.

        Raises:
            ValueError: if value is non-empty and not a valid float literal.
    '''
    k, v = x
    raw = v[1][0]
    # empty fields fall back to 0.0 (not int 0) so every emitted value is a float
    num = float(raw) if len(raw) > 0 else 0.0
    return (k, (v[0], [num]))

In [115]:
# filter header from data and get outcome
# The first three members stay as raw CSV lines: key_split (applied when the data is
# exploded by column) needs strings, and convert_numeric only understands the
# (key, (id, [value])) records that key_split emits, so it must not be mapped here.
train_filtered = Data(
    train_raw.categorical.filter(remove_header),
    train_raw.date.filter(remove_header),
    train_raw.numeric.filter(remove_header),
    # (id, outcome) pairs: outcome is the last field of each numeric row parsed as int
    train_raw.numeric.filter(remove_header).map(id_split).map(get_outcome).repartition(100)
)
test_filtered = Data(
    test_raw.categorical.filter(remove_header),
    test_raw.date.filter(remove_header),
    test_raw.numeric.filter(remove_header),
    # NOTE(review): get_outcome parses the last column as an int -- confirm the test
    # numeric file really ends with an integer label column before using this member
    test_raw.numeric.filter(remove_header).map(id_split).map(get_outcome).repartition(100)
)


# group data by column
# Every row is exploded into (column name, (id, [value])) records via key_split;
# numeric values are additionally converted to floats by convert_numeric.
train_explode = Data(
    train_filtered.categorical.flatMap(lambda x: key_split(x, train_headers.categorical)).repartition(100),
    train_filtered.date.flatMap(lambda x: key_split(x, train_headers.date)).repartition(100),
    train_filtered.numeric.flatMap(lambda x: key_split(x, train_headers.numeric)).map(convert_numeric).repartition(100),
    train_filtered.outcome
)
# test rows are labelled with the headers read from the test files themselves, so a
# difference in columns between the train and test files cannot mislabel values
test_explode = Data(
    test_filtered.categorical.flatMap(lambda x: key_split(x, test_headers.categorical)).repartition(100),
    test_filtered.date.flatMap(lambda x: key_split(x, test_headers.date)).repartition(100),
    test_filtered.numeric.flatMap(lambda x: key_split(x, test_headers.numeric)).map(convert_numeric).repartition(100),
    test_filtered.outcome
)

In [None]:
# Collect the values of each categorical column into one list per column name.
# The (id, [value]) pairs are reduced to their value lists first, so the reducer
# receives and returns the same type (a list) at every merge step; reducing the raw
# pairs with x[1] + y[1] breaks as soon as an already merged list is merged again.
(train_explode.categorical
    .mapValues(lambda v: v[1])
    .reduceByKey(lambda x, y: x + y)
    .first())