In [None]:
import data_clean #this file does the preprocessing in the "Preparing the Data" section below. I threw it all in a function so I could easily perform the same steps on the holdout_.csv data as the training data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics.scorer import make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
%matplotlib inline

## Overview

From the "Created Date Time" variable, I added variables for the month and the day of the week. I also used zip-code-level median and mean income (the mean income came from a dataset I got from the IRS SOI database; the median income I scraped from http://www.incomebyzipcode.com). I discarded the variables "Unnamed: 0," "System ID," and "Applicant City." To deal with missing values in the categorical variables, I simply treated missing observations as another category. For the inherently continuous variables, I replaced missing email and birthdate observations with the median value, missing zip-code mean income with the sample mean, and missing zip-code median income with an approximate U.S. median income. To get the holdout data to look like the preprocessed training data (because of encoding issues), I first changed the 'Birthdate' variable from a date into days, then appended the 35,000 or so "holdout" observations to the end of the training data, preprocessed everything together, and finally separated the datasets again. To generate the predictions I did a (simple) cross-validation grid search using a random forest model. A description of the key features shown in the feature importance diagram is provided below. 

Included in the folder are the following files...
* this notebook
* the Python files containing all of the code below (I don't usually like working in notebooks) (brent_travis_howe_svg_ass_2017.01.13.py and data_clean.py)
* the python file used to scrape the income data (zipwho_scrape.py)
* the median income data set (income_by_zipcode.txt; the mean income dataset was too big...if you search for the file (14zpallagi.csv) online it will come up)
* and the feature importances image (feature_importances.png)
* the holdout\_.csv file with my predictions (holdout_predict.csv)

## Preparing the Data

In [None]:
# Load the labelled training data. The path is machine-specific; point it at
# wherever train_test_.csv lives before running.
df1 = pd.read_csv('/Users/brenthowe/datascience/data sets/svg/train_test_.csv')

# A single encoder instance that the later cells re-fit on one categorical
# column at a time.
le = LabelEncoder()

# Missing targets in the training file are filled with 0. The result is
# assigned back to the column: calling fillna(inplace=True) on a column
# selection is not guaranteed to write through to df1.
df1['target'] = df1['target'].fillna(0)

In [None]:
# Location of the unlabelled holdout file. `path2` was previously never
# defined, so this cell raised NameError on a fresh kernel; the value is the
# holdout path that the prediction cell reads later in the notebook.
path2 = '/Users/brenthowe/datascience/data sets/svg/holdout_.csv'
df2 = pd.read_csv(path2)


def _expand_two_digit_year(text):
    """Insert a century in front of the last two characters of a date string.

    The text 'nan' (a missing value rendered as a string) is returned as is.
    A string ending in '00' gets '20' inserted before those two characters;
    every other string gets '19'.
    """
    if text == 'nan':
        return text
    century = '20' if text[-2:] == '00' else '19'
    return text[:-2] + century + text[-2:]


# NOTE(review): assuming the strings end in a two-digit year, every year other
# than '00' is read as 19xx, so e.g. '05' becomes 1905 - confirm this is what
# the data needs.
df2['Birthdate'] = df2['Birthdate'].astype(str).apply(_expand_two_digit_year)
df2['Birthdate'] = pd.to_datetime(df2['Birthdate'])

# Re-express the birth date as a number of days before a fixed reference date
# (presumably to match the training file's Birthdate column - confirm).
df2['Birthdate'] = (pd.to_datetime('2017/01/17') - df2['Birthdate']).dt.days
df2['Birthdate'] = df2['Birthdate'].fillna(df2['Birthdate'].median())

# Stack the holdout rows beneath the training rows so both go through the same
# preprocessing. The original row labels are kept (no ignore_index), exactly
# as DataFrame.append did.
df = pd.concat([df1, df2])

In [None]:
# Exploration notes kept from the original cell:
#   'Unnamed: 0' looks like a unique identifier.
#   'System ID' has 137042 unique values and likely carries no useful information.
# Both columns are removed. `axis` is passed by keyword because the positional
# form is not accepted by newer pandas releases.
df = df.drop(['Unnamed: 0', 'System ID'], axis=1)


In [None]:
# assume Created Date Time is correlated with purchasing opportunity day

# Parse the creation timestamp once and derive two calendar features from it.
created = pd.to_datetime(df['Created Date Time'])
df['day_of_week'] = created.dt.dayofweek  # integer day-of-week code from pandas
df['month'] = created.dt.month            # calendar month number

# No holiday indicator is built: per the author's note there are no major
# holidays in the dataset's time frame.

# The raw timestamp is not used as a feature, so drop it.
df = df.drop(['Created Date Time'], axis=1)

In [None]:
# 'Neustar Result Code' has 6 unique values (author's exploration note).
# Missing codes are given the sentinel 999 so that "missing" becomes one more
# category. The result is assigned back instead of using inplace=True on a
# column selection.
df['Neustar Result Code'] = df['Neustar Result Code'].fillna(999)

In [None]:
# 'Lead Source' has 183 unique values (author's exploration note).
# Re-fit the shared encoder on this column, then swap the labels for their
# integer codes.
lead_source = df['Lead Source']
df['Lead Source'] = le.fit(lead_source).transform(lead_source)

In [None]:
# Collapse the different spellings found in 'Smoker' onto 'Yes' / 'No', and
# blank out two values that are not smoker answers at all. The pairs are
# applied in the same order as the original one-per-line assignments.
smoker_recodes = [
    ('FALSE', 'No'),
    ('OE State (Not Required)', np.nan),
    ('N', 'No'),
    ('1', 'Yes'),
    ('TZT.Leads.Runtime.Domain.Models.Field', np.nan),
    ('TRUE', 'Yes'),
    ('Y', 'Yes'),
]
for raw_value, clean_value in smoker_recodes:
    # .loc replaces the removed .ix indexer; a boolean mask plus a column
    # label selects the same cells.
    df.loc[df['Smoker'] == raw_value, 'Smoker'] = clean_value

# Missing answers become their own category, labelled '0'.
df['Smoker'] = df['Smoker'].fillna('0')

# Re-fit the shared encoder on this column and store the integer codes.
df['Smoker'] = le.fit(df['Smoker']).transform(df['Smoker'])

In [None]:
# Fill missing 'Emails' values with the column median (the author chose the
# median because the distribution is skewed). Assigned back rather than
# filled in place through a column selection.
df['Emails'] = df['Emails'].fillna(df['Emails'].median())

# NOTE(review): the column is then label-encoded, which replaces each value by
# its position among the sorted distinct values rather than keeping the raw
# number - confirm this is intended for a numeric column.
df['Emails'] = le.fit(df['Emails']).transform(df['Emails'])


In [None]:
# Author's note: 'Birthdate' looks like it may already be an age in days, so
# its values are left as they are.

# Fill missing values with the median (chosen because the distribution is
# skewed). Assigned back rather than filled in place through a column
# selection.
df['Birthdate'] = df['Birthdate'].fillna(df['Birthdate'].median())

In [None]:
# Expand the one-letter codes to the full words that also appear in the column.
# .loc replaces the removed .ix indexer.
for short_code, full_word in [('F', 'Female'), ('M', 'Male')]:
    df.loc[df['Gender'] == short_code, 'Gender'] = full_word

# Missing gender becomes its own category, labelled '0'.
df['Gender'] = df['Gender'].fillna('0')

# Re-fit the shared encoder on this column and store the integer codes.
df['Gender'] = le.fit(df['Gender']).transform(df['Gender'])

In [None]:
# Missing state/province becomes its own category, labelled '0'. Assigned back
# rather than filled in place through a column selection.
df['Applicant State/Province'] = df['Applicant State/Province'].fillna('0')

# Re-fit the shared encoder on this column and store the integer codes.
df['Applicant State/Province'] = (
    le.fit(df['Applicant State/Province']).transform(df['Applicant State/Province'])
)

In [None]:
def _normalize_zip(code):
    """Turn one stringified postal code into a zero-padded 5-character key.

    Applies, in order: keep only the part before the first '-' when the text
    is longer than 5 characters; left-pad 4-character text with '0'; left-pad
    3-character text with '00' unless it is the text 'nan'.
    """
    if len(code) > 5:
        code = code.split('-')[0]
    if len(code) == 4:
        code = '0' + code
    if len(code) == 3 and code != 'nan':
        code = '00' + code
    return code


# Length of each postal code once rendered as text (a missing value renders as
# 'nan', i.e. length 3). Held in a local Series instead of a temporary column.
zip_length = df['Applicant Zip/Postal Code'].astype(str).apply(len)

# Codes whose text is 6 to 9 characters long are treated as unusable and blanked.
# NOTE(review): if this column was parsed as floats, values render like
# '12345.0' (7 characters) and would be blanked here too - confirm the dtype.
df.loc[(zip_length > 5) & (zip_length < 10), 'Applicant Zip/Postal Code'] = np.nan

# 'zip' is the string key used to join the income tables; blanked or missing
# codes end up as the text 'nan'.
df['zip'] = df['Applicant Zip/Postal Code'].astype(str).apply(_normalize_zip)

df = df.drop(['Applicant Zip/Postal Code'], axis=1)

In [None]:
def zip_average_income(path='/Users/brenthowe/datascience/galvanize/project/data/14zpallagi.csv'):
    """Compute mean income per zip code from a zip-level CSV file.

    The file must contain the columns 'zipcode', 'N02650' and 'A02650'. Rows
    are summed per zip code and the mean is A02650 / N02650. The aggregate
    stored under zip code 0 (key '00000') is returned separately and removed
    from the table.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the author's local copy of
        14zpallagi.csv.

    Returns
    -------
    us_income : float
        The mean income of the '00000' row.
    df_sum : pandas.DataFrame
        Indexed by the zero-padded 5-character zip string (index name 'zip'),
        with columns 'mean_income' and 'zip' (a copy of the index).

    Raises
    ------
    KeyError
        If the file has no row for zip code 0.
    """
    soi = pd.read_csv(path)
    totals = soi.groupby('zipcode')[['N02650', 'A02650']].sum()
    totals['mean_income'] = totals['A02650'] / totals['N02650']
    df_sum = totals.drop(['N02650', 'A02650'], axis=1)

    # Re-key the table by the zero-padded zip string, e.g. 1001 -> '01001'.
    df_sum.index = df_sum.index.astype(str).str.zfill(5)
    df_sum.index.name = 'zip'

    # Pull out the '00000' aggregate as a scalar, then drop that row by label
    # rather than by position.
    us_income = float(df_sum.loc['00000', 'mean_income'])
    df_sum = df_sum.drop('00000')

    # Expose the key as an ordinary column too, so callers can merge on 'zip'.
    df_sum['zip'] = df_sum.index
    return us_income, df_sum

In [None]:
# NOTE(review): us_income is returned by the helper but is not used below.
us_income, df_sum = zip_average_income()

# Scale the per-zip mean by 1000 (presumably the source amounts are in
# thousands of dollars - confirm against the data dictionary).
df_sum['mean_income'] = df_sum['mean_income'] * 1000

# df_sum carries 'zip' both as its index name and as a column; newer pandas
# rejects that as ambiguous in merge. Dropping the right-hand index leaves only
# the column to join on, and the merged result is unchanged.
# The left merge also gives df a fresh 0..n-1 index, which the train/holdout
# split further down relies on when it uses index labels as positions.
df = df.merge(df_sum.reset_index(drop=True), how='left', on='zip')

# Rows whose zip found no match are filled with the mean over the rows that
# did match (the sample mean, not the national figure).
sample_mean_income = df['mean_income'].mean()
print(sample_mean_income)
df['mean_income'] = df['mean_income'].fillna(sample_mean_income)

In [None]:
# The scraped file is read as a single text column named
# 'zipcode, median_income'; the zip and the income are sliced out of each
# string by fixed character positions.
df_median = pd.read_csv('income_by_zipcode.txt', sep='\t', index_col=False)
raw_line = df_median['zipcode, median_income']

# First five characters are the zip; 4-character values are left-padded with '0'.
df_median['zip'] = raw_line.str[:5].apply(lambda code: '0' + code if len(code) == 4 else code)

# Everything from character 8 onwards is the income, with thousands separators removed.
df_median['median_income'] = raw_line.str[8:].apply(lambda amount: amount.replace(",", ""))

df_median = df_median.drop(['zipcode, median_income'], axis=1)

# Row labels of duplicate entries in this particular file, as identified by the author.
df_median = df_median.drop([11009, 13831, 16723, 16728, 16829, 17182, 17214, 17513])

df = df.merge(df_median, how='left', on='zip')

# Empty or single-space incomes count as missing; .loc replaces the removed .ix.
df.loc[(df.median_income == '') | (df.median_income == ' '), 'median_income'] = np.nan
df['median_income'] = df['median_income'].astype(float)

# Fill with the author's approximate U.S. median income, as a number rather
# than the string '51939' so the column stays float without a second cast.
df['median_income'] = df['median_income'].fillna(51939.0)

# The join key and the city name are not used as features.
df = df.drop(['zip', 'Applicant City'], axis=1)

In [None]:
# Separate the label from the features. Rows with no label are the holdout
# rows; rows with a label are the training rows.
y = df.pop('target')
label_missing = y.isnull()
ind_holdout = y[label_missing].index
ind_train = y[~label_missing].index

# The printout below helps with reading the feature-importance plot: starting
# from 266 and walking backwards, each line subtracts the listed width (the
# number of distinct values for an encoded column, 1 for a numeric column, 0
# for the first line) and prints the running value next to the column name.
print(df.info())
offset_steps = [
    ("median_income", 0),
    ("mean_income", 1),
    ("month", len(df.month.unique())),
    ("day_of_week", len(df.day_of_week.unique())),
    ("Smoker", len(df.Smoker.unique())),
    ("Neustar Result Code", len(df['Neustar Result Code'].unique())),
    ("Lead Source", len(df['Lead Source'].unique())),
    ("Gender", len(df.Gender.unique())),
    ("Emails", 1),
    ("Birthdate", 1),
    ("Applicant State/Province", len(df['Applicant State/Province'].unique())),
]
num = 266
for label, width in offset_steps:
    num -= width
    print("{0}: {1}".format(label, num))

In [None]:
# One-hot encode the columns at positions 0, 3, 4, 5, 6, 7 and 8 of df; the
# remaining columns are passed through unchanged.
# NOTE(review): `categorical_features` is tied to the scikit-learn version in
# use - confirm it is still accepted before upgrading.
enc = OneHotEncoder(categorical_features=[0, 3, 4, 5, 6, 7, 8])
X = enc.fit_transform(df).toarray()

# Split back into training and holdout rows. The index labels saved from y are
# used as row positions here.
encoded = pd.DataFrame(X)
X_test = encoded.iloc[ind_holdout]
X_train = encoded.iloc[ind_train]
y_train = y.iloc[ind_train]

## Training the Model

In [None]:
# Cross-validated grid search over forest size and features-per-split, scored
# by ROC AUC and using every available core. Expect this to take a while.
param_dict = {
    'n_estimators': [10, 25, 45, 55],
    'max_features': ['auto', 25, 35, 50],
}
rf = RandomForestClassifier()
gsCV_rf = GridSearchCV(rf, param_dict, scoring='roc_auc', n_jobs=-1)
gsCV_rf.fit(X_train, y_train)

print(gsCV_rf.best_params_)
print(gsCV_rf.best_score_)

## Generating Predictions and Appending to File

In [None]:
# Score the holdout rows: column 1 of predict_proba is taken as the prediction.
holdout_scores = gsCV_rf.predict_proba(X_test)[:, 1]

# Attach the scores to a fresh copy of the original holdout file and save it
# under a new name next to the input.
holdout = pd.read_csv('/Users/brenthowe/datascience/data sets/svg/holdout_.csv')
holdout['prediction_proba'] = holdout_scores
holdout.to_csv('/Users/brenthowe/datascience/data sets/svg/holdout_predict.csv')

## Feature Importances

In [None]:
# Feature-importance plot. The grid search refits the best parameter
# combination on the full training set (refit is on by default) and exposes
# that model as best_estimator_, so the importances are read from the same
# forest that produced the predictions. This replaces training a second forest
# with hard-coded n_estimators / max_features.
rf = gsCV_rf.best_estimator_
importances = rf.feature_importances_

# Spread of each feature's importance across the individual trees (error bars).
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for rank, feature in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature, importances[feature]))

# Plot the 15 most important features (fewer if the model has fewer inputs).
top_n = min(15, len(indices))
top_features = indices[:top_n]
plt.figure()
plt.title("Feature importances")
plt.bar(range(top_n), importances[top_features], color="r", yerr=std[top_features], align="center")
plt.xticks(range(top_n), top_features)
plt.xlim([-1, top_n])
plt.savefig('feature_importances.png')

Feature numbers 250 and 251 are Neustar Result Codes (I don't know what this variable corresponds to). Features 263 and 264 are months (I don't know exactly how they were encoded, but I think 263 is October and 264 is November). Features 265 and 266 are mean and median income (by zip code), respectively. Finally, feature 52 corresponds to age (i.e., Birthdate), in days. From this plot we see that the month and income variables are consistently influential in the classifier making good predictions. 