# Predict Blood Donations

https://www.drivendata.org/competitions/2/warm-up-predict-blood-donations/

In [None]:
import pandas as pd
import numpy as np

# Jupyter Specific
%matplotlib inline
from IPython.display import display

# Import plotly and enable offline mode
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

# Machine Learning Library 
from theano.sandbox import cuda
import theano
import keras
# from keras import backend as K
# from keras.utils.data_utils import get_file
# from keras.utils import np_utils
# from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
# from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
# from keras.regularizers import l2, activity_l2, l1, activity_l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
# from keras.utils.layer_utils import layer_from_config
# from keras.metrics import categorical_crossentropy, categorical_accuracy
# from keras.layers.convolutional import *
# from keras.preprocessing import image, sequence
# from keras.preprocessing.text import Tokenizer

# Grab Data

In [None]:
# Local cache directory for the downloaded competition CSV files.
dataDir = './data/BloodDonations/'
# IPython shell magic: create the cache directory (and parents) if missing.
%mkdir -p $dataDir

# (cache filename, download URL) pairs: entry 0 is the training set and
# entry 1 is the test set; the sample-submission entry is disabled.
dataDefintions = [
    ('trainingData.csv', 'https://s3.amazonaws.com/drivendata/data/2/public/9db113a1-cdbe-4b1c-98c2-11590f124dd8.csv'),
    ('testData.csv', 'https://s3.amazonaws.com/drivendata/data/2/public/5c9fa979-5a84-45d6-93b9-543d1a0efc41.csv'),
    #('sampleSubmission', 'https://s3.amazonaws.com/drivendata/data/2/public/BloodDonationSubmissionFormat.csv')
]

def getFromCSV(filename, url, cacheDir = ''):
    '''Return the CSV at `url` as a DataFrame, caching the raw file on disk.

    Params:
        filename: name of the cached file
        url: where to download the CSV from when it is not cached yet
        cacheDir: prefix prepended verbatim to `filename` to form the cache
            path (include the trailing path separator when it is a directory)

    Returns:
        pandas.DataFrame parsed from the cached file.

    Raises:
        requests.HTTPError: when the download answers with an error status;
            nothing is written to the cache in that case.
    '''
    import os
    cachePath = cacheDir + filename
    if os.path.isfile(cachePath):
        print('Loaded {} from cache'.format(filename))
    else:
        import requests
        r = requests.get(url)
        # Fail loudly on 4xx/5xx so an error page is never cached as if it
        # were the data set (it would be silently reused on every later run).
        r.raise_for_status()
        with open(cachePath, "wb") as f:
            f.write(r.content)
        print('Downloaded {} from {}'.format(filename, url))

    return pd.read_csv(cachePath)

# Load both data sets through the caching helper.
# Generic loop kept for reference (the helper is named getFromCSV, not getCSV):
#for filename, url in dataDefintions:
#    df = getFromCSV(filename, url, dataDir)
trainingData = getFromCSV(dataDefintions[0][0], dataDefintions[0][1], dataDir)
testData = getFromCSV(dataDefintions[1][0], dataDefintions[1][1], dataDir)
# Number of rows in the training set (shown as the cell output).
len(trainingData)

In [None]:
# Number of rows in the test set.
len(testData)

In [None]:
# Preview the first rows of the training data.
trainingData.head()

# Exploring Training Data

Use information about each donor's history
 * Months since Last Donation: this is the number of months since this donor's most recent donation.
 * Number of Donations: this is the total number of donations that the donor has made.
 * Total Volume Donated: this is the total amount of blood that the donor has donated in cubic centimeters.
 * Months since First Donation: this is the number of months since the donor's first donation.

In [None]:
# NOTE(review): scatterPlot is defined in a later cell of this file; that cell
# has to be run before this one.
scatterPlot('Number of Donations', 'Months since First Donation')

# Setup Data for Model

In [None]:
def createValidation(data, prop = 0.8):
    """Randomly split `data` into a training part and a validation part.

    Every row is drawn independently: it lands in the training part when a
    uniform random number is below `prop`, otherwise in the validation part.
    The split sizes therefore vary from call to call.

    Returns:
        (trn, val) tuple of the two selections from `data`.
    """
    inTraining = np.random.rand(len(data)) < prop
    inValidation = np.logical_not(inTraining)
    return (data[inTraining], data[inValidation])

def createModelData(data):
    """Turn a donations DataFrame into model input and target arrays.

    The row-id column ('Unnamed: 0') and the total-volume column are dropped
    from the inputs; the remaining columns are returned as a 2-D array.

    Params:
        data: DataFrame holding the donor columns, with or without the
            'Made Donation in March 2007' target column.

    Returns:
        (inp, out) tuple: `inp` is the input array; `out` is the target
        column as an array, or None when `data` has no target column.
    """
    targetColumn = 'Made Donation in March 2007'
    unusedColumns = ['Unnamed: 0', 'Total Volume Donated (c.c.)']

    # `.values` is used instead of `.as_matrix()`, which newer pandas
    # releases no longer provide; `.values` works on old and new versions.
    if targetColumn in data.columns:
        out = data[targetColumn].values
        inp = data.drop(unusedColumns + [targetColumn], axis=1).values
    else:
        out = None
        inp = data.drop(unusedColumns, axis=1).values

    return (inp, out)

def setupData(trainingData, testData):
    """Build the training, validation and test arrays for the models.

    Params:
        trainingData: labelled DataFrame; it is split randomly into a
            training part and a validation part by createValidation.
        testData: unlabelled DataFrame.

    Returns:
        (trn, val, test) tuple. `trn` and `val` are simple containers with
        `.x` (inputs) and `.y` (targets) attributes; `test` is the input
        array built from `testData`.
    """
    from types import SimpleNamespace

    training, valid = createValidation(trainingData)

    # Create the containers here: assigning `trn.x` / `val.x` directly only
    # works when `trn` and `val` already exist somewhere, which fails with a
    # NameError on a fresh kernel.
    trnX, trnY = createModelData(training)
    valX, valY = createModelData(valid)
    trn = SimpleNamespace(x=trnX, y=trnY)
    val = SimpleNamespace(x=valX, y=valY)
    test, _ = createModelData(testData)
    return (trn, val, test)
# Build the train/validation/test data used by the model cells, then show the
# training inputs and targets.
(trn, val, test) = setupData(trainingData, testData)
display(trn.x, trn.y)
# display(val.x, val.y)
# Shape of the training input array (shown as the cell output).
trn.x.shape


# Linear Model

In [None]:
# One sigmoid output unit fed by batch-normalised inputs (3 input features).
linearModel = Sequential()
linearModel.add(BatchNormalization(input_shape=(3,)))
linearModel.add(Dense(1))
linearModel.add(Activation('sigmoid'))

# Binary cross-entropy loss with a default-configured Adam optimizer.
linearModel.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
def trainModel(model):
    """Fit `model` for 10 epochs on the notebook-level training split.

    Trains on `trn.x` / `trn.y` with a batch size of 64 and reports metrics
    on `val.x` / `val.y` after each epoch. Returns nothing.
    """
    validation = (val.x, val.y)
    model.fit(trn.x, trn.y, batch_size=64, nb_epoch=10, validation_data=validation)
# Override the optimizer's learning-rate attribute, then train the linear model.
# NOTE(review): the comment below says "1 iteration", but trainModel runs
# 10 epochs — confirm which one is intended.
# Perform 1 iteration at low lr
linearModel.optimizer.lr = 0.01
trainModel(linearModel)

,Made Donation in March 2007
659,0.5
276,0.5
263,0.5
303,0.5
83,0.5
500,0.5
530,0.5
244,0.5

In [None]:
def createSubmission(model):
    """Build the submission array for `model` on the notebook-level test set.

    Params:
        model: object whose `predict(test)` returns one prediction per row
            of the notebook-level `test` inputs.

    Returns:
        2-D array with one row per test example: the id taken from the
        'Unnamed: 0' column of `testData`, followed by the prediction.
    """
    pred = np.squeeze(model.predict(test))
    # `.values` is used instead of `.as_matrix()`, which newer pandas
    # releases no longer provide.
    ids = testData['Unnamed: 0'].values
    return np.stack([ids, pred], axis=1)

def outputSubmission(filename, subm):
    """Write a submission array to `filename` in the competition CSV layout.

    Params:
        filename: path of the file to write.
        subm: 2-D array of (id, prediction) rows; the id is written as an
            integer and the prediction with five decimals.
    """
    headerLine = ',Made Donation in March 2007'
    rowFormat = '%d,%.5f'
    # comments='' keeps numpy from prefixing the header line with '# '.
    np.savetxt(filename, subm, fmt=rowFormat, header=headerLine, comments='')

# Write the linear model's test-set predictions as a submission file.
# NOTE(review): this file defines outputSubmission twice with different second
# arguments; this call matches the (filename, subm) version.
submissionFile = dataDir + 'submissionLinear.csv'
outputSubmission(submissionFile, createSubmission(linearModel) )
# Earlier inline form of the two steps above, kept for reference:
# subm = createSubmission(linearModel)
# np.savetxt(submissionFile, subm, fmt='%d,%.5f', header=',Made Donation in March 2007', comments='')

### DrivenData Evaluation

DrivenData uses binary log loss, defined as:

$$\textrm{LogLoss} = - \frac{1}{n} \sum_{i=1}^n \left[ y_i \log(h_i) + (1 - y_i) \log(1 - h_i)\right]$$
- $n$ is the number of samples in the test set
- $h_i$ is the predicted probability that the label is 1
- $y_i$ is the true label
- $\log()$ is the natural (base e) logarithm

As shown in the plot below, there is an "infinite" penalty for predicting the wrong label with high confidence, i.e. predicting 0 when it should be 1. A trick to improve the Kaggle score is to clip the confident predictions.

The clipping amount is random 

# Let's create an ensemble model

In [None]:
def ensambleModel():
    """Build, compile and fit one fresh 100-30-1 network and return it.

    Reads the notebook-level names `decay_rate`, `epochs`, `trn` and `val`,
    so the cell defining the hyper-parameters has to be run first.
    """
    # NOTE(review): the input width is 11 here but 3 in the linear and NN
    # models of this file; presumably trn.x holds engineered features when
    # this runs — confirm.
    model = Sequential()
    model.add(BatchNormalization(input_shape=(11,)))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dense(30))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    optimizer = Adam(decay=decay_rate)
    model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    validation = (val.x, val.y)
    model.fit(trn.x, trn.y, batch_size=256, nb_epoch=epochs, validation_data=validation)
    return model


# Train ten separate copies of the network for the ensemble.
models = [ensambleModel() for i in range(10)]

In [None]:
# Let's get the log loss for the ensemble model on the validation set
def ensamblePrediction(models, inp):
    """Average the predictions of every model in `models` on `inp`.

    Params:
        models: non-empty sequence of objects exposing `predict(inp)`.
        inp: input array handed unchanged to each model's `predict`.

    Returns:
        Element-wise mean of the squeezed per-model predictions.

    Raises:
        ValueError: if `models` is empty.
    """
    if len(models) == 0:
        raise ValueError('ensamblePrediction needs at least one model')
    # Iterate over the models actually supplied instead of assuming that
    # there are exactly ten of them.
    allPred = np.array([np.squeeze(model.predict(inp)) for model in models])
    return allPred.mean(axis=0)

# Validation log loss of the averaged ensemble prediction.
# NOTE(review): logLoss is defined in a later cell of this file; that cell has
# to be run before this one.
pred = ensamblePrediction(models, val.x)
logLoss(val.y, pred)

Not as good as expected, but again perhaps a few points better

# Let's add the validation data to the training set and retrain

In [None]:
def ensambleNoCVModel():
    """Fit one fresh 100-30-1 network on training plus validation data.

    Reads the notebook-level `trn` and `val`. The validation split is part
    of the data being fitted and is also passed as `validation_data`, so the
    validation metrics reported during fitting are computed on rows the
    model has trained on.

    Returns:
        The fitted model.
    """
    # Inputs are 2-D (stacked row-wise); targets are 1-D (joined end to end).
    features = np.vstack((trn.x, val.x))
    labels = np.concatenate((trn.y, val.y))

    # Adam is created with only `decay` set, so the 0.01 below is used solely
    # to derive the decay value.
    nEpochs = 300
    initialRate = 0.01
    decay = initialRate / nEpochs

    model = Sequential()
    model.add(BatchNormalization(input_shape=(11,)))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(Adam(decay=decay), loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(features, labels, batch_size=256, nb_epoch=nEpochs, validation_data=(val.x, val.y))

    return model


# Train ten copies on training + validation data and score their average.
# The validation rows were part of the training data here, so this log loss
# is not a held-out estimate.
models = [ensambleNoCVModel() for i in range(10)]
pred = ensamblePrediction(models,val.x)
logLoss(val.y, pred)

test

In [None]:
def outputSubmission(filename, pred):
    """Write per-row test predictions to `filename` in the competition CSV layout.

    Params:
        filename: path of the file to write.
        pred: 1-D array with one prediction per row of the notebook-level
            `testData`.

    Each output row is the id from the 'Unnamed: 0' column of `testData`
    (written as an integer) followed by the prediction with five decimals.
    """
    # `.values` is used instead of `.as_matrix()`, which newer pandas
    # releases no longer provide.
    ids = testData['Unnamed: 0'].values
    subm = np.stack([ids, pred], axis=1)
    np.savetxt(filename, subm, fmt='%d,%.5f', header=',Made Donation in March 2007', comments='')

# Write the ensemble's averaged test predictions; this call matches the
# (filename, pred) version of outputSubmission.
# NOTE(review): unlike the other submission files, this name has no '.csv'
# extension — confirm whether that is intended.
outputSubmission(dataDir+'ensambleNoCVModel', ensamblePrediction(models, test))

# Semi-Supervised Learning

Let's use the ensemble model to predict the labels of the test set and use them in training. This works as the test set is about 25% of the training set. We should shuffle the data so that batches don't contain only test-set examples.

In [None]:
#If we guess 0.5 throughout the loss would be:
# NOTE(review): `logloss`, `y` and `h` are not defined in the visible cells;
# presumably a symbolic expression and its symbols — confirm where they come from.
logloss.subs([(y,1), (h, 0.5)])

In [None]:
def logLoss(y, h, eps=1e-15):
    """Return the mean binary log loss of predictions `h` against labels `y`.

    Params:
        y: true labels (0 or 1); array or plain sequence.
        h: predicted probabilities of label 1; array or plain sequence of
            the same length as `y`.
        eps: predictions are clipped to [eps, 1 - eps] before the logarithm.

    Returns:
        -1/n * sum(y*log(h) + (1-y)*log(1-h)), natural logarithm.
    """
    y = np.asarray(y, dtype=float)
    # Without clipping, a prediction of exactly 0 or 1 yields log(0): the
    # result becomes inf for a wrong prediction and nan (0 * -inf) even for a
    # perfectly right one.
    h = np.clip(np.asarray(h, dtype=float), eps, 1 - eps)
    l = -np.sum(y*np.log(h) + (1-y)*np.log(1-h)) / len(h)
    return l

# Validation log loss of the linear model.
logLoss(val.y, np.squeeze(linearModel.predict(val.x)))

# Simple Neural Network

Exploring various network architectures

| Model | Val Accuracy |
|------ |------|
|20, 20 | 0.8151|
|10     | 0.8151|
|32     | 0.8319|
|32,32  | 0.8103|
|100    | 0.7847|
|3      | 0.7414|

Tried adding total volume of donation, but as expected no improvement

Overall network architecture doesn't seem to have an impact. So lets keep it simple.

With feature engineering, a large network seems to work a little better and can be trained more easily.


In [None]:
# Training hyper-parameters; `epochs` and `decay_rate` are notebook-level names
# that other cells read as well. Adam is created with only `decay` set, so
# `learning_rate` is used solely to derive the decay value.
epochs = 300
learning_rate = 0.01
decay_rate = learning_rate / epochs

# Two hidden ReLU layers (100 and 30 units) on batch-normalised inputs,
# followed by a single sigmoid output unit.
NNModel = Sequential()
NNModel.add(BatchNormalization(input_shape=(3,)))
NNModel.add(Dense(100))
NNModel.add(Activation('relu'))
NNModel.add(Dense(30))
NNModel.add(Activation('relu'))
NNModel.add(Dense(1))
NNModel.add(Activation('sigmoid'))

NNModel.compile(optimizer=Adam(decay=decay_rate), loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Train the network for `epochs` epochs, reporting validation metrics each epoch.
NNModel.fit(trn.x, trn.y, batch_size=256, nb_epoch=epochs, validation_data=(val.x, val.y)) 

In [None]:
# Inspect the training inputs.
trn.x

In [None]:
# Validation log loss of the neural network.
logLoss(val.y, np.squeeze(NNModel.predict(val.x)))

In [None]:
# Write the neural network's test-set predictions as a submission file.
# NOTE(review): this file defines outputSubmission twice with different second
# arguments; this call matches the (filename, subm) version.
submissionFile = dataDir + 'submissionNN.csv'
outputSubmission(submissionFile, createSubmission(NNModel) )

# Let's try some feature engineering

## Remove Skewness in data

In [None]:
# Semi-transparent histograms of the training-data columns in one plot.
trainingData.plot.hist(alpha=0.5)

In [None]:
# Distribution of the number of donations per donor.
trainingData['Number of Donations'].plot.hist()

In [None]:
# Distribution of the months since each donor's last donation.
trainingData['Months since Last Donation'].plot.hist()

In [None]:
# Distribution of the target label.
trainingData['Made Donation in March 2007'].plot.hist()

We have a lot more examples of people who didn't donate than of people who donated.

In [None]:
# Distribution of the volume donated per donation (total volume / number of donations).
(trainingData['Total Volume Donated (c.c.)'] / trainingData['Number of Donations']).plot.hist()

Everyone donates 250 cc per donation, so it isn't an interesting variable.

In [None]:
def scatterPlot(xLabel, yLabel):
    """Show an interactive scatter plot of two training-data columns.

    Params:
        xLabel: name of the `trainingData` column plotted on the x axis.
        yLabel: name of the `trainingData` column plotted on the y axis.

    Each point is coloured by the 'Made Donation in March 2007' column and a
    colour scale is shown next to the plot. The figure is rendered inline;
    nothing is returned.
    """
    trace = go.Scatter(
        x=trainingData[xLabel],
        y=trainingData[yLabel],
        mode='markers',
        marker=dict(
            # Marker size has to be numeric: current plotly releases reject
            # the string '8' during property validation.
            size=8,
            color=trainingData['Made Donation in March 2007'],  # colour by target label
            colorscale='RdBu',
            showscale=True
        ),
    )

    layout = go.Layout(
        title='Blood Donations',
        hovermode='closest',
        xaxis=dict(
            title=xLabel,
            zeroline=False,
        ),
        yaxis=dict(
            title=yLabel,
        ),
        showlegend=False
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)

In [None]:
# Number of donations against months since the last donation, coloured by the target label.
scatterPlot('Number of Donations', 'Months since Last Donation')

Seems like months since last donation is a better indication than number of donations