In [103]:
%%writefile getfiles.sh
#!/usr/bin/env bash
# Prepare the scratch volume and stage the data under /Data:
#   1. format /dev/xvdb as ext4 and mount it at /Data,
#   2. download the six gzipped CSV archives,
#   3. for each data set, write a 10000-line sample archive (testfile_<name>.csv.gz)
#      and split the full CSV into 50000-line pieces (fp_*) under /Data/<name>/.

# WARNING: mkfs wipes whatever is currently on /dev/xvdb.
sudo mkfs -t ext4 /dev/xvdb
# The mount point has to exist before mounting.
sudo mkdir -p /Data
sudo mount /dev/xvdb /Data
sudo chown -R ubuntu:ubuntu /Data
mkdir -p /Data/tmp/logs /Data/tmp/jobs

urls=(
    https://www.dropbox.com/s/ikqguxu91yf8fiz/train_numeric.csv.gz
    https://www.dropbox.com/s/9kl3m7ssx7uhnda/test_numeric.csv.gz
    https://www.dropbox.com/s/59vl5rch97v7t6c/test_date.csv.gz
    https://www.dropbox.com/s/anxa8zqei68uidp/train_date.csv.gz
    https://www.dropbox.com/s/y6syumln123x4lb/train_categorical.csv.gz
    https://www.dropbox.com/s/eczq1y1uzzy66el/test_categorical.csv.gz
)

# Stop rather than download into (or later delete from) the wrong directory.
cd /Data || exit 1
for url in "${urls[@]}"
do
    wget "$url"
done

datasets=(train_numeric test_numeric test_date train_date train_categorical test_categorical)
for name in "${datasets[@]}"
do
    # Small sample of the first 10000 lines (header included).
    gzip -dc "$name.csv.gz" | head -n 10000 | gzip > "testfile_$name.csv.gz"
    # Keep the original archive in /Data; unpack and split a copy in /Data/<name>.
    mkdir -p "/Data/$name"
    cp "$name.csv.gz" "/Data/$name/"
    cd "/Data/$name" || exit 1
    gzip -d "$name.csv.gz"
    split -l 50000 "$name.csv" fp_
    rm "$name.csv"
    cd /Data || exit 1
done

Overwriting getfiles.sh


In [15]:
%%writefile emr_getheaders.sh
#!/usr/bin/env bash
# Download the six gzipped CSV archives into the current directory and shrink
# each one to an archive that contains only its first (header) line.

# -f: a first run with no archives present is not an error.
rm -f -- *.csv.gz
urls=(
    https://www.dropbox.com/s/ikqguxu91yf8fiz/train_numeric.csv.gz
    https://www.dropbox.com/s/9kl3m7ssx7uhnda/test_numeric.csv.gz
    https://www.dropbox.com/s/59vl5rch97v7t6c/test_date.csv.gz
    https://www.dropbox.com/s/anxa8zqei68uidp/train_date.csv.gz
    https://www.dropbox.com/s/y6syumln123x4lb/train_categorical.csv.gz
    https://www.dropbox.com/s/eczq1y1uzzy66el/test_categorical.csv.gz
)
for url in "${urls[@]}"
do
    wget "$url"
done

datasets=(train_numeric test_numeric train_date test_date train_categorical test_categorical)
for name in "${datasets[@]}"
do
    # Keep only the first line, then replace the full archive with the
    # re-compressed one-line file (same <name>.csv.gz file name).
    gzip -dc "$name.csv.gz" | head -n 1 > "$name.csv"
    rm "$name.csv.gz"
    gzip "$name.csv"
done

Overwriting emr_getheaders.sh


In [1]:
%%writefile emr_getfiles.sh
#!/usr/bin/env bash
# Copy the six data set directories from S3 into HDFS with distcp.
# $1 is the authority part of the hdfs:// destination URL.
nn1=$1

# Same copy order as before: the three train sets, then the three test sets.
for name in train_categorical train_numeric train_date \
            test_categorical test_numeric test_date
do
    hadoop distcp "s3a://brandonshurick/Data/$name" "hdfs://$nn1/user/hadoop/$name"
done

Writing emr_getfiles.sh


In [16]:
import hashlib
from collections import namedtuple, defaultdict
from math import log, exp, sqrt

import numpy as np
import pandas as pd
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# show up to 5000 rows when viewing data samples
pd.set_option('display.max_rows', 5000)

# data set paths are built as DIR + <data set name> + SUFFIX
SUFFIX = ''
DIR = 'hdfs:///user/hadoop/'

In [17]:
def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Each (featureID, value) pair is turned into the string ``value + str(featureID)``,
    hashed with MD5 and reduced modulo ``numBuckets``.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.
        Pairs that produce the same feature string are counted once, because the
        intermediate mapping is keyed by that string.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys will be integers which represent the buckets that the
            features have been hashed to.  The value for a given key will contain the count of
            the distinct feature strings that have hashed to that key.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        # md5 needs bytes: byte strings pass through untouched, text (Python 2
        # unicode / Python 3 str) is encoded explicitly so non-ASCII values work.
        if isinstance(featureString, bytes):
            digestInput = featureString
        else:
            digestInput = featureString.encode('utf-8')
        mapping[featureString] = int(int(hashlib.md5(digestInput).hexdigest(), 16) % numBuckets)
    if printMapping:
        print(mapping)
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)


def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets (int): The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint whose label is the first value converted to float and whose
            features are a SparseVector of size numBuckets built from the hashed features.
    """
    splits = point.split(',')
    # (position, value) pairs for every field after the label
    fields = list(enumerate(splits[1:]))
    # SparseVector comes from pyspark.mllib.linalg (imported with the other dependencies)
    vec = SparseVector(numBuckets, hashFunction(numBuckets, fields))
    return LabeledPoint(float(splits[0]), vec)


def getP(x, w, intercept):
    """Turn a linear score into a probability with the logistic function.

    Note:
        The score ``x.dot(w) + intercept`` is clamped to the range [-20, 20] before the
        logistic function is applied, for numerical purposes.

    Args:
        x (SparseVector): The observation's feature vector.
        w (DenseVector): A vector of weights (betas) for the model.
        intercept (float): The model's intercept.

    Returns:
        float: A probability between 0 and 1.
    """
    score = x.dot(w) + intercept
    # clamp: upper bound first, then lower bound
    score = max(min(score, 20), -20)
    return 1.0 / (1 + exp(-score))


def computeLogLoss(p, y):
    """Calculates the value of log loss for a given probability and label.

    Note:
        log(0) is undefined, so when p is exactly 0 a small value (epsilon) is added to it
        and when p is exactly 1 epsilon is subtracted from it.

    Args:
        p (float): A probability between 0 and 1.
        y (int): A label.  Takes on the values 0 and 1.

    Returns:
        float: The log loss value, -log(p) when y is 1 and -log(1-p) when y is 0.

    Raises:
        ValueError: If y is neither 0 nor 1.
    """
    epsilon = 10e-12
    if p == 0:
        p += epsilon
    elif p == 1:
        p -= epsilon
    if y == 1:
        return -log(p)
    elif y == 0:
        return -log(1 - p)
    else:
        # ValueError is still an Exception, so existing handlers keep working
        raise ValueError('y not in {0,1}')


def getMccData(p, x):
    """Classify one (probability, label) pair as a confusion-matrix cell.

    The predicted class is 1 when p > 0.5 and 0 otherwise; the actual class is int(x).

    Args:
        p (float): Predicted probability.
        x: Actual label; converted with int().

    Returns:
        tuple of int: (tp, tn, fp, fn) with exactly one entry set to 1.
    """
    predicted = 1 if p > 0.5 else 0
    actual = int(x)
    correct = predicted == actual
    if predicted == 1:
        # predicted positive: true positive when correct, false positive otherwise
        return (1, 0, 0, 0) if correct else (0, 0, 1, 0)
    # predicted negative: true negative when correct, false negative otherwise
    return (0, 1, 0, 0) if correct else (0, 0, 0, 1)


def getMcc(data_px):
    """Compute the Matthews correlation coefficient for an RDD of predictions.

    Args:
        data_px (RDD): Records whose first element is the predicted probability and whose
            second element is the actual label (the two arguments of getMccData).

    Returns:
        float: (tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)), or 0 when the
            denominator is zero.
    """
    # One pass over the data: add the per-record (tp, tn, fp, fn) indicator tuples
    # element-wise instead of running a separate sum job for each of the four counts.
    tp, tn, fp, fn = data_px.map(lambda px: getMccData(px[0], px[1])).fold(
        (0, 0, 0, 0),
        lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]))
    mcc_num = (tp*tn) - (fp*fn)
    mcc_base = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    try:
        return mcc_num*1.0/mcc_base
    except ZeroDivisionError:
        # a zero denominator means at least one row or column of the confusion matrix is empty
        return 0


def print_scores(model):
    """Print the log-loss, accuracy and MCC stored in ``model.scores``.

    Args:
        model: An object with a ``scores`` attribute exposing ``logloss``, ``accuracy``
            and ``mcc``.
    """
    # Single-argument print(...) produces the same output on Python 2 and Python 3.
    print('Model log-loss: {}'.format(model.scores.logloss))
    print('Model accuracy: {}'.format(model.scores.accuracy))
    print('Model MCC: {}'.format(model.scores.mcc))

In [18]:
# lightweight record types that give the tuples used below readable field names
Files = namedtuple('filelist', 'categorical date numeric')
Header = namedtuple('header', 'categorical date numeric')
Data = namedtuple('data', 'categorical date numeric outcome')
Model = namedtuple('model', 'model scores')
Scores = namedtuple('scores', 'logloss accuracy mcc')

# locations of the full data sets (DIR + name + SUFFIX)
train_files = Files(
    categorical=DIR + 'train_categorical' + SUFFIX,
    date=DIR + 'train_date' + SUFFIX,
    numeric=DIR + 'train_numeric' + SUFFIX,
)
test_files = Files(
    categorical=DIR + 'test_categorical' + SUFFIX,
    date=DIR + 'test_date' + SUFFIX,
    numeric=DIR + 'test_numeric' + SUFFIX,
)

# gzipped CSV files the column headers are read from (relative paths)
train_files_header = Files(
    categorical='train_categorical.csv.gz',
    date='train_date.csv.gz',
    numeric='train_numeric.csv.gz',
)
test_files_header = Files(
    categorical='test_categorical.csv.gz',
    date='test_date.csv.gz',
    numeric='test_numeric.csv.gz',
)

In [19]:
# raw text RDDs, one per data set, each marked for caching; the outcome slot is empty here
train_raw = Data(
    categorical=sc.textFile(train_files.categorical).cache(),
    date=sc.textFile(train_files.date).cache(),
    numeric=sc.textFile(train_files.numeric).cache(),
    outcome=None,
)
test_raw = Data(
    categorical=sc.textFile(test_files.categorical).cache(),
    date=sc.textFile(test_files.date).cache(),
    numeric=sc.textFile(test_files.numeric).cache(),
    outcome=None,
)


# headers
def get_header(x):
    """Return the column names of CSV file ``x`` as a list (no data rows are read)."""
    return list(pd.read_csv(x, nrows=0).columns.values)


train_headers = Header(
    categorical=get_header(train_files_header.categorical),
    date=get_header(train_files_header.date),
    numeric=get_header(train_files_header.numeric),
)
test_headers = Header(
    categorical=get_header(test_files_header.categorical),
    date=get_header(test_files_header.date),
    numeric=get_header(test_files_header.numeric),
)

In [20]:
# filter out first row
def remove_header(x):
    """Return False for the header line (first comma-separated field is 'Id'), True otherwise."""
    return x.split(',', 1)[0] != 'Id'


# return outcome from numeric data
def get_outcome(kv):
    """Map (key, fields) to (key, int(last field))."""
    k, v = kv
    return (k, int(v[-1]))


def subtract_outcome(kv):
    """Map (key, fields) to (key, fields without the last one)."""
    k, v = kv
    return (k, v[:-1])


# split key and fields
def id_split(x):
    """Split a CSV line into (int(first field), list of the remaining fields)."""
    parts = x.split(',')  # split once instead of once per tuple element
    return (int(parts[0]), parts[1:])
def header_key(x, headers):
    ''' Explode one row into per-column records.

        x is an (id, fields) pair; headers is the full list of column names,
        whose first entry is skipped (the id is carried separately in x[0]).
        Yields (column name, [(id, field value)]) for every field.
        Raises IndexError if there are more fields than headers after the first.
    '''
    id_int = x[0]
    fields = x[1]
    # field i is named by headers[i + 1]; index directly instead of
    # re-slicing headers[1:] for every field
    for pos, f in enumerate(fields, 1):
        yield (headers[pos], [(id_int, f)])


# gather numeric features
def convert_numeric(x):
    ''' Convert the value of a (column, [(id, text)]) record to a number.

        Non-empty text is parsed with float(); empty text becomes 0.
    '''
    column, pairs = x
    first = pairs[0]
    row_id = first[0]
    raw = first[1]
    if len(raw) > 0:
        return (column, [(row_id, float(raw))])
    return (column, [(row_id, 0)])

In [21]:
# drop the header line of every data set and split each row into (id, fields);
# the training outcome is the last numeric field of each row
train_filtered = Data(
    categorical=train_raw.categorical.filter(remove_header).map(id_split),
    date=train_raw.date.filter(remove_header).map(id_split),
    numeric=train_raw.numeric.filter(remove_header).map(id_split),
    outcome=train_raw.numeric.filter(remove_header).map(id_split).map(get_outcome),
)
test_filtered = Data(
    categorical=test_raw.categorical.filter(remove_header).map(id_split),
    date=test_raw.date.filter(remove_header).map(id_split),
    numeric=test_raw.numeric.filter(remove_header).map(id_split),
    outcome=None,  # unknown
)

In [22]:
# group data by column: every row is exploded into (column name, [(id, value)]) records
train_explode = Data(
    train_filtered.categorical.flatMap(lambda x: header_key(x, train_headers.categorical)),
    train_filtered.date.flatMap(lambda x: header_key(x, train_headers.date)),
    train_filtered.numeric.flatMap(lambda x: header_key(x, train_headers.numeric)).map(convert_numeric),
    train_filtered.outcome
)
# the test rows are labelled with the headers read from the test files,
# not with the training headers
test_explode = Data(
    test_filtered.categorical.flatMap(lambda x: header_key(x, test_headers.categorical)),
    test_filtered.date.flatMap(lambda x: header_key(x, test_headers.date)),
    test_filtered.numeric.flatMap(lambda x: header_key(x, test_headers.numeric)).map(convert_numeric),
    None #unknown
)

In [23]:
# (id, fields) RDDs used for modelling; the training numeric fields have
# their last entry (the outcome) removed
train_rdd = Data(
    categorical=train_filtered.categorical,
    date=train_filtered.date,
    numeric=train_filtered.numeric.map(subtract_outcome),
    outcome=train_filtered.outcome,
)
test_rdd = Data(
    categorical=test_filtered.categorical,
    date=test_filtered.date,
    numeric=test_filtered.numeric,
    outcome=None,  # unknown
)

In [None]:
# explore categorical columns
def groupby_col(x):
    """Map a (column, [(id, value)]) record to ((column, value), 1) for counting."""
    column, pairs = x
    return ((column, pairs[0][1]), 1)
# count how often each non-empty (column, value) pair occurs;
# the lambda indexes its argument instead of using Python 2-only tuple parameters
col_counts = (train_explode.categorical
              .filter(lambda kv: len(kv[1][0][1]) > 0)
              .map(groupby_col)
              .reduceByKey(lambda x, y: x + y)
              .collect())
col_counts.sort(key=lambda x: x[0])
# number of exploded records, empty values included
total_count = train_explode.categorical.count()

In [None]:
col_counts[:10]

In [None]:
total_count

In [None]:
# explore numeric columns
def groupby_col(x):
    """Map a (column, [(id, value)]) record to (column, value)."""
    column, pairs = x
    return (column, pairs[0][1])
# per-column sum of values and number of records, joined to compute the mean
col_sums = train_explode.numeric.map(groupby_col).reduceByKey(lambda x, y: x + y)
# the lambda indexes its argument instead of using Python 2-only tuple parameters
col_totals = train_explode.numeric.map(lambda kv: (kv[0], 1)).reduceByKey(lambda x, y: x + y)
col_totals = col_sums.join(col_totals).collect()
# each joined record is (column, (sum, count))
col_means = [(x, y[0] / y[1]) for x, y in col_totals]
col_means.sort()

In [None]:
col_means[:10]

In [None]:
# look at the first row: pair each numeric column name with that row's value.
# zip() stops at the shorter of the two sequences; list() materializes the pairs
# so the DataFrame gets a concrete list on Python 3 as well.
pd.DataFrame(list(zip(train_headers.numeric[1:],
                      train_rdd.numeric.first()[1])),
             columns=['Field', 'NumVal']).head(15)

In [None]:
# calculate baseline model: predict the overall positive rate for every row.
# The (id, outcome) pairs already hold the responses, so the fully exploded
# numeric RDD does not need to be scanned for the 'Response' column.
sum_response = train_filtered.outcome.map(lambda x: x[1]).sum()
count_response = train_filtered.outcome.count()
baseline = sum_response*1.0/count_response
logloss = train_filtered.outcome.map(lambda x: computeLogLoss(baseline, x[1])).sum() / count_response
# a constant prediction is thresholded at 0.5, so it is right for whichever class is
# in the majority: 1-baseline when the positive rate is below 0.5, baseline otherwise
accuracy = max(baseline, 1-baseline)
mcc = getMcc(train_filtered.outcome.map(lambda x: (baseline, x[1])))

In [None]:
# single-argument print(...) produces the same output on Python 2 and Python 3
print('Baseline log-loss: {}'.format(logloss))
print('Baseline accuracy: {}'.format(accuracy))
print('Baseline MCC: {}'.format(mcc))

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
def convertPointNumeric(x):
    """Build a LabeledPoint from a joined (id, (fields, response)) record.

    Non-empty fields are parsed with float(); empty fields become 0.
    """
    k, v = x
    v1, response = v
    features = [float(v_i) if len(v_i) > 0 else 0 for v_i in v1]
    return LabeledPoint(response, features)
    
# Cache the joined points: the split, the training run and the validation metrics
# all read them, and sampling a recomputed join is not guaranteed to see the rows
# in the same order each time.
data = train_rdd.numeric.join(train_rdd.outcome).map(convertPointNumeric).cache()
# fixed seed so the 70/30 split is the same on every run
training, val = data.randomSplit([0.7, 0.3], seed=42)
train_params = {
    'iterations':300,
    'initialWeights':[1]*len(training.first().features),
    'regParam':0.001,
    'regType':'l2',
    'intercept':True,
    'validateData':False,
    'corrections':10,
    'tolerance':1e-06,
    'numClasses':2
}
m = LogisticRegressionWithLBFGS.train(training, **train_params)
# (probability, label) pairs for the validation set; cached because every
# metric below makes at least one pass over them
p = val.map(lambda x: (getP(x.features, m.weights, m.intercept), x.label)).cache()
n_val = p.count()
s = Scores(
    p.map(lambda x: computeLogLoss(x[0], x[1])).sum() / n_val,
    p.map(lambda x: int(x[0]>0.5)==int(x[1])).sum()*1.0 / n_val,
    getMcc(p)
)
model1 = Model(m, s)

In [None]:
print_scores(model1)

In [None]:
from pyspark.mllib.tree import GradientBoostedTrees
# Cache the joined points: the split is read several times, and the label/prediction
# zip below needs both sides to see the validation rows in the same order.
data = train_rdd.numeric.join(train_rdd.outcome).map(convertPointNumeric).cache()
# fixed seed so the 70/30 split is the same on every run
training, val = data.randomSplit([0.7, 0.3], seed=42)
gb_params = {
    'categoricalFeaturesInfo':{},
    'numIterations':300,
    'loss':'leastSquaresError', #logLoss
    'learningRate':0.1,
    'maxDepth':3,
    'maxBins':32,
}
m = GradientBoostedTrees.trainClassifier(training, **gb_params)
p = m.predict(val.map(lambda x: x.features))
# (prediction, label) pairs; cached because every metric below re-reads them.
# NOTE(review): predict() on this classifier presumably returns class labels rather
# than probabilities, in which case the log-loss is computed on 0/1 values nudged
# by epsilon - confirm before comparing it with the other models.
lp = val.map(lambda lp: lp.label).zip(p).map(lambda x: (x[1],x[0])).cache()
n_val = lp.count()
s = Scores(
    lp.map(lambda x: computeLogLoss(x[0], x[1])).sum() / n_val,
    lp.map(lambda x: int(x[0]>0.5)==int(x[1])).sum()*1.0 / n_val,
    getMcc(lp)
)
model2 = Model(m, s)

In [None]:
print_scores(model2)