In [1]:
import hashlib
from collections import namedtuple, defaultdict

import numpy as np
import pandas as pd

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# for viewing data samples: raise the pandas row display limit to 5000
pd.options.display.max_rows = 5000

# Path prefix shared by every input file.  NOTE: despite the name this is not a
# directory -- file names are appended directly to it (DIR + 'train_numeric.csv.gz'
# and so on), so the trailing 'testfile_' becomes part of each file name.
DIR = '/Data/testfile_'

In [2]:
def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Each (featureID, value) tuple is turned into the string ``value + str(featureID)``,
    hashed with MD5, and reduced modulo ``numBuckets`` to pick a bucket.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.
        Identical (featureID, value) tuples produce the same feature string and are therefore
        counted only once.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys will be integers which represent the buckets that the
            features have been hashed to.  The value for a given key will contain the count of the
            distinct feature strings that have hashed to that key.

    Raises:
        ZeroDivisionError: If ``numBuckets`` is 0 and ``rawFeats`` is not empty.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        # hashlib digests bytes.  Byte strings are passed through untouched (the
        # Python 2 ``str`` case); text strings are encoded explicitly so that the
        # function also works on Python 3 and with non-ASCII unicode values.
        if isinstance(featureString, bytes):
            digestInput = featureString
        else:
            digestInput = featureString.encode('utf-8')
        digest = int(hashlib.md5(digestInput).hexdigest(), 16)
        # The outer int() turns a Python 2 ``long`` remainder back into a plain int.
        mapping[featureString] = int(digest % numBuckets)
    if printMapping:
        # Single-argument call form prints identically on Python 2 and Python 3.
        print(mapping)
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)


def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint whose label is the first value converted to float
            (expected to be 0.0 or 1.0) and whose features are a SparseVector of size
            ``numBuckets`` built from the hashed feature counts.

    Raises:
        ValueError: If the first value cannot be converted to a float.
    """
    splits = point.split(',')
    # Feature IDs are the 0-based positions of the values that follow the label.
    fields = list(enumerate(splits[1:]))
    # SparseVector comes from pyspark.mllib.linalg and must be imported at file level.
    vec = SparseVector(numBuckets, hashFunction(numBuckets, fields))
    # Convert the label explicitly so a malformed label fails with a clear error here.
    return LabeledPoint(float(splits[0]), vec)

In [3]:
# Lightweight records so the three file kinds (plus the outcome) are addressed by
# name instead of by position.
Files = namedtuple('filelist', ['categorical', 'date', 'numeric'])
Header = namedtuple('header', ['categorical', 'date', 'numeric'])
Data = namedtuple('data', ['categorical', 'date', 'numeric', 'outcome'])

# Locations of the gzipped CSV inputs; every name is appended to the DIR prefix.
train_files = Files(
    categorical=DIR + 'train_categorical.csv.gz',
    date=DIR + 'train_date.csv.gz',
    numeric=DIR + 'train_numeric.csv.gz'
)
test_files = Files(
    categorical=DIR + 'test_categorical.csv.gz',
    date=DIR + 'test_date.csv.gz',
    numeric=DIR + 'test_numeric.csv.gz'
)

In [4]:
# Raw line RDDs, one per input file, each marked for caching.  The outcome slot
# is left as None at this stage.
train_raw = Data(
    categorical=sc.textFile(train_files.categorical).cache(),
    date=sc.textFile(train_files.date).cache(),
    numeric=sc.textFile(train_files.numeric).cache(),
    outcome=None
)
test_raw = Data(
    categorical=sc.textFile(test_files.categorical).cache(),
    date=sc.textFile(test_files.date).cache(),
    numeric=sc.textFile(test_files.numeric).cache(),
    outcome=None
)


def get_header(x):
    """Return the column names of the CSV file at path ``x`` as a list.

    Only the header is parsed (``nrows=0``), so no data rows are loaded.
    """
    return list(pd.read_csv(x, nrows=0).columns.values)


# Column names for every train and test file.
train_headers = Header(
    categorical=get_header(train_files.categorical),
    date=get_header(train_files.date),
    numeric=get_header(train_files.numeric)
)
test_headers = Header(
    categorical=get_header(test_files.categorical),
    date=get_header(test_files.date),
    numeric=get_header(test_files.numeric)
)

In [58]:
# filter out first row
def remove_header(x):
    """Return True for data lines, False for the header line (first field is 'Id')."""
    return x.split(',')[0] != 'Id'


# return outcome from numeric data
def get_outcome(kv):
    """Map an (id, fields) pair to (id, outcome), the outcome being the last field as int.

    Takes the pair as a single argument and unpacks it in the body: tuple
    parameters in the signature are a syntax error on Python 3.
    """
    k, v = kv
    return (k, int(v[-1]))


# split key and fields
def id_split(x):
    """Split a comma separated line into (integer id, list of remaining fields)."""
    parts = x.split(',')  # split once and reuse for both the id and the fields
    return (int(parts[0]), parts[1:])
def header_key(x, headers):
    ''' Explode one comma separated row into per-column records.

        x:       a line whose first field is the integer row id
        headers: column names for the whole line, headers[0] naming the id column

        Yields one (column name, [(row id, field text)]) pair per field after the
        id, in column order.  Raises IndexError if the row has more fields than
        there are non-id headers.
    '''
    parts = x.split(',')   # split once; the original split the line twice
    id_int = int(parts[0])
    # Drop the id column name once, instead of re-slicing the list for every field.
    names = headers[1:]
    for i, f in enumerate(parts[1:]):
        yield (names[i], [(id_int, f)])


# gather numeric features
def convert_numeric(x):
    ''' Parse the text value of a (column, [(id, text)]) record into a number.

        A non-empty text is converted with float(); an empty text becomes 0.
        The record shape is kept: (column, [(id, number)]).
    '''
    column, entries = x
    first = entries[0]
    raw = first[1]
    if len(raw) > 0:
        value = float(raw)
    else:
        value = 0
    return (column, [(first[0], value)])

In [59]:
# Drop the header line from every raw RDD, and derive the outcome RDD of
# (id, last numeric field as int) pairs from the header-free numeric rows.
train_numeric_rows = train_raw.numeric.filter(remove_header)
train_filtered = Data(
    categorical=train_raw.categorical.filter(remove_header),
    date=train_raw.date.filter(remove_header),
    numeric=train_numeric_rows,
    outcome=train_numeric_rows.map(id_split).map(get_outcome).repartition(100)
)

# NOTE(review): get_outcome calls int() on the last field of each numeric row.
# This presumes the test numeric file also ends with an integer label column --
# confirm, otherwise test_filtered.outcome will fail once it is evaluated.
test_numeric_rows = test_raw.numeric.filter(remove_header)
test_filtered = Data(
    categorical=test_raw.categorical.filter(remove_header),
    date=test_raw.date.filter(remove_header),
    numeric=test_numeric_rows,
    outcome=test_numeric_rows.map(id_split).map(get_outcome).repartition(100)
)

In [60]:
# group data by column: every row is exploded into one
# (column name, [(row id, value)]) record per field; numeric values are parsed
# to numbers by convert_numeric.  The outcome RDD is carried over unchanged.
train_explode = Data(
    train_filtered.categorical.flatMap(lambda x: header_key(x, train_headers.categorical)).repartition(100),
    train_filtered.date.flatMap(lambda x: header_key(x, train_headers.date)).repartition(100),
    train_filtered.numeric.flatMap(lambda x: header_key(x, train_headers.numeric)).map(convert_numeric).repartition(100),
    train_filtered.outcome
)
# Test rows are keyed with the headers read from the test files themselves, so
# column names stay correct even if the test layout differs from the train layout.
test_explode = Data(
    test_filtered.categorical.flatMap(lambda x: header_key(x, test_headers.categorical)).repartition(100),
    test_filtered.date.flatMap(lambda x: header_key(x, test_headers.date)).repartition(100),
    test_filtered.numeric.flatMap(lambda x: header_key(x, test_headers.numeric)).map(convert_numeric).repartition(100),
    test_filtered.outcome
)

In [61]:
# explore categorical columns
def groupby_col(x):
    """Turn a (column, [(id, value)]) record into a ((column, value), 1) counting pair."""
    column, entries = x
    return ((column, entries[0][1]), 1)
# Count occurrences of every non-empty (column, value) combination.  The filter
# indexes into the record instead of using a tuple-parameter lambda, which is a
# syntax error on Python 3.
col_counts = (train_explode.categorical
              .filter(lambda kv: len(kv[1][0][1]) > 0)
              .map(groupby_col)
              .reduceByKey(lambda x, y: x + y)
              .collect())
# Order the collected counts by the categorical value.
col_counts.sort(key=lambda x: x[0][1])
# Total number of exploded categorical records, empty values included.
total_count = train_explode.categorical.count()

In [62]:
# First ten ((column, value), count) entries after sorting by value.
col_counts[:10]

[(('L3_S32_F3854', u'T-21474819'), 1),
 (('L3_S32_F3854', u'T-2147482816'), 2),
 (('L3_S32_F3854', u'T-2147483648'), 4),
 (('L2_S26_F3104', u'T1'), 1856),
 (('L1_S24_F842', u'T1'), 90),
 (('L0_S22_F577', u'T1'), 1),
 (('L3_S29_F3429', u'T1'), 6606),
 (('L0_S21_F503', u'T1'), 1),
 (('L0_S9_F196', u'T1'), 3),
 (('L1_S25_F2865', u'T1'), 66)]

In [63]:
# Number of exploded categorical records, counted before empty values are filtered out.
total_count

21397860

In [66]:
# explore numeric columns
def groupby_col(x):
    """Reduce a (column, [(id, value)]) record to a (column, value) pair.

    NOTE: this redefines the groupby_col used for the categorical exploration,
    which returned ((column, value), 1) instead.
    """
    column, entries = x
    return (column, entries[0][1])
# Per-column sum and record count, gathered in a single pass: each value is
# paired with a 1 and the pairs are added component-wise per column.  This runs
# one Spark job over the exploded numeric data instead of two, and avoids the
# Python-2-only tuple-parameter lambda.
col_stats = (train_explode.numeric
             .map(groupby_col)
             .mapValues(lambda value: (value, 1))
             .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
             .collect())
col_stats.sort()  # column names are unique after reduceByKey, so this orders by column
col_sums = [(stat[0], stat[1][0]) for stat in col_stats]
col_totals = [(stat[0], stat[1][1]) for stat in col_stats]
# float() guards against integer division when a column's sum is an int.
col_means = [(stat[0], stat[1][0] / float(stat[1][1])) for stat in col_stats]

In [82]:
# First ten (column, mean) pairs in column-name order; empty values count as 0 in the mean.
col_means[:10]

[('L0_S0_F0', -0.001087908790879088),
 ('L0_S0_F10', 0.0023501350135013543),
 ('L0_S0_F12', 0.00019881988198819863),
 ('L0_S0_F14', 0.0013297329732973305),
 ('L0_S0_F16', -0.0004527452745274525),
 ('L0_S0_F18', 0.0001511151115111509),
 ('L0_S0_F2', -0.0014882488248824884),
 ('L0_S0_F20', 0.0027981798179817986),
 ('L0_S0_F22', 0.0028061806180618063),
 ('L0_S0_F4', 0.0005809580958095808)]

In [92]:
# Baseline model: the overall mean of the 'Response' column.
# Assuming Response is a 0/1 label, baseline is [share of 0s, share of 1s].
response_records = train_explode.numeric.filter(lambda x: x[0]=='Response')
sum_response = response_records.map(lambda x: x[1][0][1]).reduce(lambda x,y: x+y)
count_response = response_records.count()
response_mean = sum_response*1.0/count_response
baseline = [ 1-response_mean, response_mean ]