### Step 1: Import Libraries

In [408]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np

### Step 2: Load the CSV as a Pandas Dataframe

In [409]:
df = pd.read_csv('data/training_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


### Step 3: Rename columns for ease of assignment and manipulation

In [410]:
df=df.rename(columns = {'Unnamed: 0':'id'})
df=df.rename(columns = {'Months since Last Donation':'time_since_last'})
df=df.rename(columns = {'Number of Donations':'num_donations'})
df=df.rename(columns = {'Total Volume Donated (c.c.)':'volume'})
df=df.rename(columns = {'Months since First Donation':'time_since_first'})
df=df.rename(columns = {'Made Donation in March 2007':'donated'})
df.head()

Unnamed: 0,id,time_since_last,num_donations,volume,time_since_first,donated
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


### Step 4: Removing the Volume column due to multicollinearity

In [411]:
df.drop('volume', axis=1, inplace=True)
df.head()

Unnamed: 0,id,time_since_last,num_donations,time_since_first,donated
0,619,2,50,98,1
1,664,0,13,28,1
2,441,1,16,35,1
3,160,2,20,45,1
4,358,1,24,77,0


## Approach A: Use KNN on the data with neither feature engineering nor transformation

### Step A1: Setup the Dependent and Independent variables

In [412]:
X = df[ ['time_since_last','num_donations', 'time_since_first'] ].values
y = df['donated'].values

### Step A2: Split the training data into train and validation sets

In [413]:
from sklearn.cross_validation import train_test_split

seed = 8
test_size = 0.2
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)

#X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed)

print(X_train.shape)
print(X_valid.shape)

(460, 3)
(116, 3)


### Step A3: Obtaining the best accuracy score for the Validation Set using KNN

In [414]:
acc = []
best_score = 0
best_lm = 2
for k in range(2,20, 1):
    clf = KNeighborsClassifier(k)
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_valid)
    score = accuracy_score(y_valid, y_hat )
    acc.append(score)
    if score > best_score:
        best_lm = k
        best_score = score
        print("best accuracy score %s"% (score))
        print(best_lm)
    #print("k: %s accuracy: %0.4f missed %s"% (k, score, np.count_nonzero(y_hat - y_test))   )

best accuracy score 0.73275862069
2
best accuracy score 0.75
3
best accuracy score 0.775862068966
4
best accuracy score 0.793103448276
5


### Step A4: Rename columns on the "TEST" dataset for ease of assignment and manipulation

In [415]:

df_test = pd.read_csv('data/test_dataset.csv')
df_test=df_test.rename(columns = {'Unnamed: 0':'id'})
df_test=df_test.rename(columns = {'Months since Last Donation':'time_since_last'})
df_test=df_test.rename(columns = {'Number of Donations':'num_donations'})
df_test=df_test.rename(columns = {'Total Volume Donated (c.c.)':'volume'})
df_test=df_test.rename(columns = {'Months since First Donation':'time_since_first'})
df_test=df_test.rename(columns = {'Made Donation in March 2007':'donated'})
df_test.head()

Unnamed: 0,id,time_since_last,num_donations,volume,time_since_first
0,659,2,12,3000,52
1,276,21,7,1750,38
2,263,4,1,250,4
3,303,11,11,2750,38
4,83,4,12,3000,34


### Step A5: Removing the Volume colum on the "TEST" dataset due to multicollinearity

In [416]:
df_test.drop('volume', axis=1, inplace=True)
df_test.head()

Unnamed: 0,id,time_since_last,num_donations,time_since_first
0,659,2,12,52
1,276,21,7,38
2,263,4,1,4
3,303,11,11,38
4,83,4,12,34


### Step A6: Predicting on the "test" data

In [417]:
X_test = df_test[ ['time_since_last','num_donations', 'time_since_first'] ].values

#From the validation data
k = 7 
clf = KNeighborsClassifier(k)
clf.fit(X_train, y_train)
y_hat= clf.predict(X_test)

### Step A7: Writing the predictions to the CSV file

In [418]:
#df_test.id
#y_hat
y2 = y_hat.reshape(-1,1)
dfy2 = pd.DataFrame({'donated':y2[:,0],'': df_test['id']})
dfy2.head()

Unnamed: 0,Unnamed: 1,donated
0,659,0
1,276,0
2,263,0
3,303,0
4,83,1


In [419]:
import csv
dfy2.to_csv('BloodDonationSubmissionFormat.csv', index=False)

## Approach B: Feature Engineering and Transforming the Data


In [420]:
df.max()

id                  747
time_since_last      74
num_donations        50
time_since_first     98
donated               1
dtype: int64

###  Step B1: Since there are item with values of 0, the log values will be NaN. One acceptable way to add an infinitessimal number to the zeros to the Training Dataset.

In [421]:
from scipy import stats
small = 0.0001
df['time_since_last'].replace([0], [small],inplace=True)
df['num_donations'].replace([0], [small],inplace=True)
df['time_since_first'].replace([0], [small],inplace=True)
#df['donated'].replace([0], [small],inplace=True)
#df.replace([0], [small],inplace=True)
df.head()

Unnamed: 0,id,time_since_last,num_donations,time_since_first,donated
0,619,2.0,50,98,1
1,664,0.0001,13,28,1
2,441,1.0,16,35,1
3,160,2.0,20,45,1
4,358,1.0,24,77,0


### Step B2: add an infinitessimal number to the zeros to the Test Dataset.

In [422]:
# Add an infinitessimal number to the zeros to the Training Dataset
df_test['time_since_last'].replace([0], [small],inplace=True)
df_test['num_donations'].replace([0], [small],inplace=True)
df_test['time_since_first'].replace([0], [small],inplace=True)
#df_test['donated'].replace([0], [small],inplace=True)
#df.replace([0], [small],inplace=True)
df_test.head()

Unnamed: 0,id,time_since_last,num_donations,time_since_first
0,659,2.0,12,52
1,276,21.0,7,38
2,263,4.0,1,4
3,303,11.0,11,38
4,83,4.0,12,34


### Step B3: BoxCox Transformation for "Time Since Last Donation" on the Training Dataset

In [423]:
samples = df.time_since_last
df['BoxCoxTimeSinceLast'] = pd.DataFrame( data = samples, columns = ['Time Since Last']  )
boxcox_samples = stats.boxcox(samples.values)[0]
lambd = stats.boxcox(samples.values)[1]
df['BoxCoxTimeSinceLast'] = boxcox_samples
lambd

0.29660702156034996

### Step B4: BoxCox Transformation for "Time Since Last Donation" on the Test Dataset

In [424]:
samples = df_test.time_since_last
df_test['BoxCoxTimeSinceLast'] = pd.DataFrame( data = samples, columns = ['Time Since Last']  )
boxcox_samples = stats.boxcox(samples.values)[0]
df_test['BoxCoxTimeSinceLast'] = boxcox_samples
lambd

0.29660702156034996

### Step B5: BoxCox Transformation for "Time Since First Donation" on the TRAINING dataset

In [425]:
samples = df.time_since_first
df['BoxCoxTimeSinceFirst'] = pd.DataFrame( data = samples, columns = ['Time Since First']  )
boxcox_samples = stats.boxcox(samples.values)[0]
lambd = stats.boxcox(samples.values)[1]
df['BoxCoxTimeSinceFirst'] = boxcox_samples
lambd

0.43090171088426804

### Step B6: BoxCox Transformation for "Time Since First Donation" on the TEST Dataset

In [426]:
samples = df_test.time_since_first
df_test['BoxCoxTimeSinceFirst'] = pd.DataFrame( data = samples, columns = ['Time Since First']  )
boxcox_samples = stats.boxcox(samples.values)[0]
df_test['BoxCoxTimeSinceFirst'] = boxcox_samples
lambd

0.43090171088426804

### Step B7: BoxCox Transformation for "Number of Donations" on the TRAINING dataset

In [427]:
samples = df.num_donations
df['BoxCoxNumDonations'] = pd.DataFrame( data = samples, columns = ['Number of Donations']  )
boxcox_samples = stats.boxcox(samples.values)[0]
lambd = stats.boxcox(samples.values)[1]
df['BoxCoxNumDonations'] = boxcox_samples
lambd

-0.055944168122910892

### Step B8: BoxCox Transformation for "Number of Donations" on the TEST dataset

In [428]:
samples = df_test.num_donations
df_test['BoxCoxNumDonations'] = pd.DataFrame( data = samples, columns = ['Number of Donations']  )
boxcox_samples = stats.boxcox(samples.values)[0]
df_test['BoxCoxNumDonations'] = boxcox_samples
lambd

-0.055944168122910892

### Step B9: Seeking out the Maximum and Minimum values on the Training Dataset - Feature Engineering

In [429]:
df.head()

Unnamed: 0,id,time_since_last,num_donations,time_since_first,donated,BoxCoxTimeSinceLast,BoxCoxTimeSinceFirst,BoxCoxNumDonations
0,619,2.0,50,98,1,0.769545,14.415009,3.513533
1,664,0.0001,13,28,1,-3.151987,7.433777,2.389417
2,441,1.0,16,35,1,0.0,8.418273,2.56826
3,160,2.0,20,45,1,0.769545,9.646522,2.758155
4,358,1.0,24,77,0,0.0,12.763193,2.91156


In [430]:
df_test.head()

Unnamed: 0,id,time_since_last,num_donations,time_since_first,BoxCoxTimeSinceLast,BoxCoxTimeSinceFirst,BoxCoxNumDonations
0,659,2.0,12,52,0.768659,10.538364,2.237153
1,276,21.0,7,38,4.918432,8.899499,1.791687
2,263,4.0,1,4,1.710664,1.90348,0.0
3,303,11.0,11,38,3.479509,8.899499,2.166632
4,83,4.0,12,34,1.710664,8.370065,2.237153


In [431]:
df.max()

id                      747.000000
time_since_last          74.000000
num_donations            50.000000
time_since_first         98.000000
donated                   1.000000
BoxCoxTimeSinceLast       8.713513
BoxCoxTimeSinceFirst     14.415009
BoxCoxNumDonations        3.513533
dtype: float64

In [432]:
df.min()

id                      0.000000
time_since_last         0.000100
num_donations           1.000000
time_since_first        2.000000
donated                 0.000000
BoxCoxTimeSinceLast    -3.151987
BoxCoxTimeSinceFirst    0.807785
BoxCoxNumDonations      0.000000
dtype: float64

### Step B10: Seeking a connection between the Number of donations and the Likelihood of donating

In [433]:
#Trying for some feature engineering
#print("On the training Dataset, percentage of donors is %0.4f " % (df[df.donated == 1.000].num_donations.shape[0]/df.num_donations.shape[0]))
#print("On the test Dataset, percentage of donors is %0.4f " % (df_test[df_test.donated == 1.000].num_donations.shape[0]/df_test.num_donations.shape[0]))
df[(df.donated == 1.000) & (df.BoxCoxNumDonations > 2.0) ].num_donations.shape[0]/df[df.BoxCoxNumDonations > 2.0].num_donations.shape[0]
df[(df.donated == 1.000) & (df.num_donations > 30.0) ].num_donations.shape[0]/df[df.num_donations > 30.0].num_donations.shape[0]

percentage = []
for k in range(0,50, 5):
    chances = df[(df.donated == 1) & (df.num_donations > k) ].num_donations.shape[0]/df[df.num_donations > k].num_donations.shape[0]
    percentage.append(chances)
print (percentage)

percentage = []
for k1 in range(0,8,1):
    
    k = 0.5*k1
    percentage.append(k)
    chances = df[(df.donated == 1) & (df.BoxCoxNumDonations > k) ].BoxCoxNumDonations.shape[0]/df[df.BoxCoxNumDonations > k].BoxCoxNumDonations.shape[0]
    percentage.append(chances)
print (percentage)


[0.23958333333333334, 0.3645320197044335, 0.38028169014084506, 0.4642857142857143, 0.5454545454545454, 0.6666666666666666, 0.6666666666666666, 0.6, 0.75, 1.0]
[0.0, 0.27292576419213976, 0.5, 0.27292576419213976, 1.0, 0.2949061662198391, 1.5, 0.3515625, 2.0, 0.3627450980392157, 2.5, 0.45161290322580644, 3.0, 0.6666666666666666, 3.5, 1.0]


### There is a greater chance of a person donating as his/her donations go higher.  
### The idea is to create new features based on the Number of Donations column and see if the precitions improve

### Step B11: Convert the the Transformed Number of Donations to whole numbers and get Dummy columns

In [434]:
df['NDceil'] = np.ceil(df.BoxCoxNumDonations)
df['NDceil'] = df['NDceil'].astype(int)
dummy_ranks = pd.get_dummies(df['NDceil'], prefix='BCND')
dummy_ranks.head()

Unnamed: 0,BCND_0,BCND_1,BCND_2,BCND_3,BCND_4
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0


### Step B12: Create a new dataframe with only the required columns. Also, drop one of the dummy columns

In [435]:
cols_to_keep = ['id', 'BoxCoxTimeSinceLast', 'BoxCoxTimeSinceFirst','donated']
data = df[cols_to_keep].join(dummy_ranks.ix[:, 'BCND_1':])
print (data.head())

    id  BoxCoxTimeSinceLast  BoxCoxTimeSinceFirst  donated  BCND_1  BCND_2  \
0  619             0.769545             14.415009        1     0.0     0.0   
1  664            -3.151987              7.433777        1     0.0     0.0   
2  441             0.000000              8.418273        1     0.0     0.0   
3  160             0.769545              9.646522        1     0.0     0.0   
4  358             0.000000             12.763193        0     0.0     0.0   

   BCND_3  BCND_4  
0     0.0     1.0  
1     1.0     0.0  
2     1.0     0.0  
3     1.0     0.0  
4     1.0     0.0  


### Step B13: Split the training data into train and validation sets

In [436]:
X = data[ ['BoxCoxTimeSinceLast','BoxCoxTimeSinceFirst', 'BCND_1', 'BCND_2', 'BCND_3', 'BCND_4'] ].values
y = data['donated'].values

In [437]:
seed = 8
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)

print(X_train.shape)
print(X_valid.shape)

(460, 6)
(116, 6)


### Step B14: Obtaining the best accuracy score for the Validation Set using KNN

In [438]:
acc = []
best_score = 0
best_lm = 2
for k in range(2,20, 1):
    clf = KNeighborsClassifier(k)
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_valid)
    score = accuracy_score(y_valid, y_hat )
    acc.append(score)
    if score > best_score:
        best_lm = k
        best_score = score
        print("best accuracy score %s"% (score))
        print(best_lm)
    #print("k: %s accuracy: %0.4f missed %s"% (k, score, np.count_nonzero(y_hat - y_test))   )

best accuracy score 0.75
2
best accuracy score 0.758620689655
8
best accuracy score 0.76724137931
11


### Step B15: Repeating the process for Test Dataset

In [439]:
df_test['NDceil'] = np.ceil(df_test.BoxCoxNumDonations)
df_test['NDceil'] = df_test['NDceil'].astype(int)
dummy_ranks = pd.get_dummies(df_test['NDceil'], prefix='BCND')
dummy_ranks.head()

Unnamed: 0,BCND_0,BCND_1,BCND_2,BCND_3,BCND_4
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [440]:
cols_to_keep = ['id', 'BoxCoxTimeSinceLast', 'BoxCoxTimeSinceFirst']
data_test = df_test[cols_to_keep].join(dummy_ranks.ix[:, 'BCND_1':])
print (data_test.head())

    id  BoxCoxTimeSinceLast  BoxCoxTimeSinceFirst  BCND_1  BCND_2  BCND_3  \
0  659             0.768659             10.538364     0.0     0.0     1.0   
1  276             4.918432              8.899499     0.0     1.0     0.0   
2  263             1.710664              1.903480     0.0     0.0     0.0   
3  303             3.479509              8.899499     0.0     0.0     1.0   
4   83             1.710664              8.370065     0.0     0.0     1.0   

   BCND_4  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  


In [441]:
X_test = data_test[ ['BoxCoxTimeSinceLast','BoxCoxTimeSinceFirst', 'BCND_1', 'BCND_2', 'BCND_3', 'BCND_4'] ].values
#From the validation data
k = 11 
clf = KNeighborsClassifier(k)
clf.fit(X_train, y_train)
y_hat= clf.predict(X_test)
print(y_hat)

[0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [442]:
y3 = y_hat.reshape(-1,1)
dfy3 = pd.DataFrame({'donated':y3[:,0],'': df_test['id']})
dfy3.head()

Unnamed: 0,Unnamed: 1,donated
0,659,0
1,276,0
2,263,0
3,303,0
4,83,1


In [443]:
import csv
dfy3.to_csv('BloodDonationSubmissionFormat_FE_BC.csv', index=False)