In [1]:
import pandas as pd
import numpy as np

# Sci-kit learn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Load the training data and split the data into a training and testing subset for validation.

In [2]:
training_data = pd.read_csv('./data/modellingProjectTraining.txt', delimiter=';')
training_data.head()

Unnamed: 0,Customer_ID,Customer_Value,Gender,Age,Income,CE_Pet_Model,CE_Frozen_Meal_Model,CE_Donations_Model,CE_Foreign_Travel_Model,CE_Physical_Fitness_Model,CE_Pro_Tax_Preparation_Model,CE_Golf_Model,CE_Higher_Education_Model,CE_Avid_TV_Model,CE_Social_Networking_Model
0,1,171.057,M,73,93000,4.084,14.942,3.944,4.938,1.591,-1.676,5.389,-3.724,4.046,6.918
1,2,51.686,M,42,64000,2.431,1.411,-0.362,6.078,1.93,0.851,1.539,-1.177,-2.237,0.109
2,3,88.558,M,60,74000,1.915,8.059,0.617,3.281,0.811,-0.838,2.239,0.231,-2.062,-5.429
3,4,24.079,M,25,65000,-0.331,3.18,-3.364,-3.395,0.401,0.178,1.11,-2.834,0.745,-3.763
4,5,46.308,M,39,62000,4.652,7.882,0.653,2.94,1.751,2.737,0.35,5.628,-5.427,-8.007


In [3]:
def gender_encoder(gender):
    ''' gender_encoder takes in either a string,'F' or 'M', and returns either 1 or 0, respectively. 
    If the string is neither 'M' or 'F', gender_encoder returns none.'''
    if gender == 'M':
        return 0
    elif gender == 'F':
        return 1
    else:
        return None

In [4]:
# Split into inputs and targets.
Xdata, Ydata = training_data.ix[:,2:], training_data.ix[:,1]
# Encode the cendor as a binary integer value.
Xdata['Gender'] = Xdata['Gender'].apply(gender_encoder)

In [5]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xdata, Ydata, test_size=.25)

# Build Model

In [6]:
# Grid search over estimators and depth of the random forest regressor.
grid_search = GridSearchCV(RandomForestRegressor(), param_grid={'n_estimators': [2, 5, 10], 'max_depth':[5,10]})
grid_search.fit(Xtrain, Ytrain)
grid_search.grid_scores_

[mean: 0.28555, std: 0.00672, params: {'n_estimators': 2, 'max_depth': 5},
 mean: 0.29031, std: 0.00526, params: {'n_estimators': 5, 'max_depth': 5},
 mean: 0.29308, std: 0.00479, params: {'n_estimators': 10, 'max_depth': 5},
 mean: 0.42507, std: 0.00622, params: {'n_estimators': 2, 'max_depth': 10},
 mean: 0.45049, std: 0.00498, params: {'n_estimators': 5, 'max_depth': 10},
 mean: 0.45767, std: 0.00488, params: {'n_estimators': 10, 'max_depth': 10}]

In [7]:
# Use best scoring parameters for the model
random_forest_model = RandomForestRegressor(**grid_search.best_params_)
random_forest_model.fit(Xtrain, Ytrain)

# Confirm model preformance on reserved testing data
random_forest_score = random_forest_model.score(Xtest, Ytest)
print "Model R^2 value %.2f" %random_forest_score

Model R^2 value 0.45


In [9]:
# Retrain the data on the full training set.
random_forest_model.fit(Xdata, Ydata)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# Produce Predictions

In [10]:
prediction_data =  pd.read_csv('./data/modellingProjectModelling.txt', delimiter=';')
prediction_data.head()

Unnamed: 0,Customer_ID,Gender,Age,Income,CE_Pet_Model,CE_Frozen_Meal_Model,CE_Donations_Model,CE_Foreign_Travel_Model,CE_Physical_Fitness_Model,CE_Pro_Tax_Preparation_Model,CE_Golf_Model,CE_Higher_Education_Model,CE_Avid_TV_Model,CE_Social_Networking_Model
0,2000000,M,70,37000,-0.948,-4.654,2.314,-0.962,3.077,1.218,2.613,1.19,-4.215,-0.534
1,2000001,F,27,135000,2.128,12.758,0.856,-1.899,1.155,1.797,6.129,-0.819,-0.123,1.606
2,2000002,M,31,88000,1.737,-0.312,-1.34,-6.317,4.857,4.189,0.897,2.467,-2.471,1.151
3,2000003,M,44,128000,3.18,-3.67,1.393,0.655,2.211,3.477,-5.705,4.287,-1.47,5.401
4,2000004,F,48,135000,1.929,8.7,1.614,-0.443,-2.909,1.752,1.016,0.408,-2.552,-6.224


In [12]:
prediction_Xdata = prediction_data.ix[:, 1:] # Drop the customer_id from the model
prediction_Xdata['Gender'] = prediction_data['Gender'].apply(gender_encoder)

In [14]:
results = random_forest_model.predict(prediction_Xdata)

In [15]:
# Write results to customer_model_out.txt as ; deliminated file with a header.
with open('customer_model_out.txt', 'w') as f:
    f.write('"Customer_ID";"Predicted_Score"\n')
    for customer_id, result in zip(prediction_data['Customer_ID'], results):
        f.write(";".join([str(customer_id), str(result)]) + "\n")