In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
%matplotlib inline

In [2]:
import sklearn.grid_search as gs
import sklearn.cross_validation as cv
import sklearn.ensemble as ens
import sklearn.metrics as mts

In [4]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(0, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

scorer = mts.make_scorer(Gini)

In [4]:
train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
test = pd.read_csv('./test.csv', sep = ',',index_col = 'Id')

full = pd.concat(objs = [train, test])

full.drop(['T1_V10', 'T1_V13','T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,num_mask]
full_cat = full.iloc[:, ~num_mask]
cat_names = full_cat.columns
form = ' + '.join(cat_names)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dummies = pd.concat([full_num, x_dummies], axis = 1)
train.shape, full_num.shape, full_cat.shape, x_dummies.shape, full_dummies.shape
split = np.isnan(full_dummies.Hazard)
train = full_dummies.loc[~split,:]
test  = full_dummies.loc[split ,:]
x = train.drop('Hazard', axis=1).values
y = train.Hazard.values


seed = 1

mod_rf = ens.RandomForestRegressor(n_estimators = 30, random_state = seed, n_jobs = -1)

params_rf = {'max_features': [38,40,42],
             'min_samples_leaf': [8,9,10]}

grid_rf = gs.GridSearchCV(mod_rf, param_grid=params_rf, n_jobs=-1, scoring = scorer, cv=5)

grid_rf.fit(x,y)

grid_rf.best_score_

0.36132444808104458