# DrivenData - Warm Up: Predict Blood Donations
### 1st Entry
#### Best Score: 0.4452
#### Current Rank: 165/1816 or 91st percentile



I use log loss: $$Log Loss = -\frac{1}{N} \sum_{i=1}^N [y_{i} \log p_{i} + (1 - y_{i}) \log (1 - p_{i})]$$ to evaluate my classifier (lower is better).

$N =$ number of samples

$y_{i} =$ binary indicator of whether a donation was made in March 2007 (0 = Did Not Make Donation, 1 = Made Donation)

$p_{i} =$ predicted probability that sample $i$ made a donation (i.e. that $y_{i} = 1$)


Log Loss heavily penalises classifiers that are confident about an incorrect classification



In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, neighbors, svm, metrics, tree, linear_model, grid_search, ensemble, naive_bayes
import matplotlib.pyplot as plt

# Load the training set; the first CSV column is used as the row index.
df = pd.read_csv('9db113a1-cdbe-4b1c-98c2-11590f124dd8.csv', index_col=0)
print(df.head())

# Feature matrix: every column except the target. The axis is passed by
# keyword because newer pandas releases reject a positional axis argument.
X = np.array(df.drop(['Made Donation in March 2007'], axis=1))
print(X[:5])

# Target vector: 1 if the donor gave blood in March 2007, else 0.
y = np.array(df['Made Donation in March 2007'])
print(y[:5])

     Months since Last Donation  Number of Donations  \
619                           2                   50   
664                           0                   13   
441                           1                   16   
160                           2                   20   
358                           1                   24   

     Total Volume Donated (c.c.)  Months since First Donation  \
619                        12500                           98   
664                         3250                           28   
441                         4000                           35   
160                         5000                           45   
358                         6000                           77   

     Made Donation in March 2007  
619                            1  
664                            1  
441                            1  
160                            1  
358                            0  
[[    2    50 12500    98]
 [    0    13  3250    28]
 [    1 

In [2]:
# Number of training samples.
n = len(X)

# Candidate models, all left at library defaults except where noted.
# n_jobs=-1 asks scikit-learn to use every available core.
knn = neighbors.KNeighborsClassifier(
    n_neighbors=30,  # neighbourhood size used for the k-NN model
    n_jobs=-1,
)
logit = linear_model.LogisticRegression(n_jobs=-1)
dt = tree.DecisionTreeClassifier()
rf = ensemble.RandomForestClassifier(n_jobs=-1)
gnb = naive_bayes.GaussianNB()

In [3]:
def classifier(alg, features, y, k=1):
    """Print the 10-fold cross-validated log loss of ``alg``.

    The estimator is fitted on each training split and its class
    probabilities for the held-out split are collected, so every sample
    receives exactly one out-of-fold prediction. Log loss is then computed
    once over all of those predictions.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place, so on return it holds the model from the final fold only.
    features : array-like, shape (n_samples, n_features)
        Feature matrix to cross-validate on.
    y : array-like, shape (n_samples,)
        Class labels aligned with ``features``.
    k : int, optional
        Accepted for backward compatibility; it has no effect on ``alg``.

    Returns
    -------
    None
        The score is printed as ``Log Loss Score = <value>``.
    """
    features = np.asarray(features)
    y = np.asarray(y)
    n_samples = len(features)

    # One row of out-of-fold probabilities per sample. Rows are written by
    # test index so they stay aligned with y whatever order the folds use.
    probs = np.empty((n_samples, len(np.unique(y))))

    folds = cross_validation.KFold(n_samples, n_folds=10)
    for train, test in folds:
        alg.fit(features[train], y[train])
        probs[test] = alg.predict_proba(features[test])

    log_loss = metrics.log_loss(y, probs)
    print('Log Loss Score =', log_loss)

In [4]:
# Cross-validate each candidate model and print its log loss.
# NOTE: the k argument is accepted by classifier() but never applied to the
# model passed in, so knn is scored with its own n_neighbors setting.
for model, neighbours in ((logit, 1), (knn, 40), (rf, 1)):
    classifier(model, X, y, neighbours)

Log Loss Score = 0.49135502008
Log Loss Score = 0.541184301552
Log Loss Score = 1.59332566334


In [5]:
# Load the competition test set; the first CSV column is used as the row index.
df2 = pd.read_csv('5c9fa979-5a84-45d6-93b9-543d1a0efc41.csv', index_col=0)
X2 = np.array(df2)

# classifier() leaves `logit` fitted on the last cross-validation training
# split only, so refit on every labelled sample before predicting.
logit.fit(X, y)

# predict_proba returns one column per class; column 1 is the probability
# of class 1 (a donation was made).
probability = logit.predict_proba(X2)

# Submission layout: an unnamed id column followed by the predicted
# probability, written without the DataFrame's own index.
target = 'Made Donation in March 2007'
p = pd.DataFrame(
    {'': df2.index, target: probability[:, 1]},
    columns=['', target],
)

p.to_csv('prediction.csv', index=False)