In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
%matplotlib inline

In [2]:
import sklearn.grid_search as gs
import sklearn.cross_validation as cv
import sklearn.ensemble as ens
import sklearn.metrics as mts

In [3]:
def Gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The true values are ranked twice, largest first: once by themselves
    (the ideal ranking) and once by the predictions.  For each ranking the
    cumulative share of the total (a Lorenz curve) is compared with the
    diagonal ``i / n``; the summed gap obtained with the prediction ranking
    is divided by the summed gap of the ideal ranking.

    Parameters
    ----------
    y_true : array-like, 1-D
        Observed target values.
    y_pred : array-like, 1-D, same shape as ``y_true``
        Predicted values; only their ordering matters.

    Returns
    -------
    float
        1.0 when the predictions rank the samples exactly like ``y_true``,
        -1.0 for the exactly reversed ranking, values near 0 for an
        uninformative ranking.  When the ideal gap is zero (e.g. all true
        values are equal) or the true values sum to zero, the ratio is
        undefined and the result is not meaningful (typically ``nan``).

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` do not have the same shape.

    Notes
    -----
    Ties in ``y_pred`` are ordered however ``argsort`` happens to order them.
    """
    # Coerce to float arrays: accepts lists/Series and guarantees true
    # division below even for integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort the true values on each column (from largest to smallest)
    arr = np.array([y_true, y_pred], dtype=float).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    # The k-th cumulative share corresponds to the fraction k/n of the
    # samples, so the diagonal runs from 1/n to 1 (not from 0 to 1);
    # starting at 0 shifts both sums by 0.5 and biases the ratio.
    L_ones = np.arange(1, n_samples + 1) / float(n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred / G_true

# Wrap Gini as a scorer object so it can be handed to cross_val_score via `scoring=`.
# NOTE(review): make_scorer is called with its defaults, so the metric presumably
# receives the estimator's predict() output (class labels, not probabilities) — confirm.
scorer = mts.make_scorer(Gini)

In [5]:
# Read both files with the 'Id' column as the row index.
train = pd.read_csv('./train.csv', sep=',', index_col='Id')
test = pd.read_csv('./test.csv', sep=',', index_col='Id')

# Stack train and test so the dummy encoding below is built from one frame.
full = pd.concat(objs=[train, test])

# Drop the unwanted columns by rebinding instead of `inplace=1`: `inplace`
# must be a real bool (current pandas rejects an int), and a non-mutating
# drop keeps the cell safe to re-run.
full = full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1)

# Separate non-object (numeric) columns from object (categorical) columns.
num_mask = np.array([dtype != 'object' for dtype in full.dtypes])
full_num = full.iloc[:, num_mask]
full_cat = full.iloc[:, ~num_mask]
cat_names = full_cat.columns

# patsy formula: every categorical column as a term, with the intercept removed.
form = ' + '.join(cat_names)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')

# Numeric columns and dummy columns side by side, aligned on the index.
full_dummies = pd.concat([full_num, x_dummies], axis=1)

# Rows with a missing Hazard form the test part, the rest the train part.
# NOTE(review): assumes test.csv has no 'Hazard' column, so the concat above
# fills it with NaN for those rows — confirm.
split = np.isnan(full_dummies.Hazard)
train = full_dummies.loc[~split, :]
test = full_dummies.loc[split, :]

# Feature matrix and target vector used by the model cells.
x = train.drop('Hazard', axis=1).values
y = train.Hazard.values

# Seed passed as random_state to the estimators.
seed = 1

In [14]:
# Random forest: 1000 trees, out-of-bag scoring enabled, class_weight='auto'.
mod_rf = ens.RandomForestClassifier(
    n_estimators=1000,
    max_features=36,
    min_samples_leaf=13,
    class_weight='auto',
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
)
# Last expression of the cell: display the configured estimator.
mod_rf

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features=36, max_leaf_nodes=None,
            min_samples_leaf=13, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [15]:
# Fit the forest on the full training matrix; as the last expression of the
# cell, the returned estimator is displayed.
mod_rf.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features=36, max_leaf_nodes=None,
            min_samples_leaf=13, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [17]:
# Out-of-bag score of the fitted forest (mod_rf was built with oob_score=True).
mod_rf.oob_score_

0.17539559599207827

In [9]:
# NOTE(review): repeat of the out-of-bag score lookup. This cell's execution
# count is out of sequence and the value displayed below it differs from the
# previous cell, so the recorded output presumably belongs to a fit with
# different settings — re-run on a fresh kernel to confirm.
mod_rf.oob_score_

0.3628110355105002

In [14]:
# 30-tree forest with class_weight='auto' (and oob_score=True), evaluated
# with the custom Gini scorer over 5 folds.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=30,
    max_features=36,
    min_samples_leaf=13,
    class_weight='auto',
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    verbose=0,
)



array([ 0.25759035,  0.25511524,  0.24731223,  0.24771962,  0.23960193])

In [15]:
# Same 30-tree forest without class weighting, evaluated with the custom
# Gini scorer over 5 folds.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=30,
    max_features=36,
    min_samples_leaf=13,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    verbose=0,
)



array([ 0.02481221, -0.00443001, -0.02565225, -0.01943017,  0.01835571])

In [12]:
# 30-tree forest with class_weight='auto'; no `scoring` argument is passed,
# so cross_val_score uses its default scoring.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=30,
    max_features=36,
    min_samples_leaf=13,
    class_weight='auto',
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    n_jobs=-1,
    verbose=0,
)



array([ 0.12502446,  0.12872257,  0.11972936,  0.11539594,  0.12465619])

In [13]:
# 30-tree forest without class weighting; no `scoring` argument is passed,
# so cross_val_score uses its default scoring.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=30,
    max_features=36,
    min_samples_leaf=13,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    n_jobs=-1,
    verbose=0,
)



array([ 0.36812757,  0.36912226,  0.37036674,  0.37003238,  0.37062868])

In [9]:
# 1000-tree forest with class_weight='auto', evaluated with the custom Gini
# scorer over 5 folds.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# so no scores were captured for this configuration.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=1000,
    max_features=36,
    min_samples_leaf=13,
    class_weight='auto',
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    verbose=0,
)



KeyboardInterrupt: 

In [10]:
# Extra-trees ensemble: 1000 bootstrapped trees with out-of-bag scoring,
# no class weighting.
mod_ext = ens.ExtraTreesClassifier(
    n_estimators=1000,
    max_features=36,
    min_samples_leaf=13,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
)
mod_ext.fit(x, y)
# Last expression of the cell: display the out-of-bag score.
mod_ext.oob_score_

0.37214455185395789

In [11]:
# Extra-trees ensemble: 1000 bootstrapped trees with out-of-bag scoring and
# class_weight='auto'.
mod_ext = ens.ExtraTreesClassifier(
    n_estimators=1000,
    max_features=36,
    min_samples_leaf=13,
    class_weight='auto',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
)
mod_ext.fit(x, y)
# Last expression of the cell: display the out-of-bag score.
mod_ext.oob_score_

0.14996372477891723

In [22]:
# Smaller extra-trees ensemble: 30 bootstrapped trees with out-of-bag
# scoring, no class weighting.
mod_ext2 = ens.ExtraTreesClassifier(
    n_estimators=30,
    max_features=36,
    min_samples_leaf=13,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
)
mod_ext2.fit(x, y)
# Last expression of the cell: display the out-of-bag score.
mod_ext2.oob_score_

0.36863467911135511

In [18]:
# 300-tree bootstrapped extra-trees ensemble; no `scoring` argument is
# passed, so cross_val_score uses its default scoring.
mod_ext3 = ens.ExtraTreesClassifier(
    n_estimators=300,
    max_features=36,
    min_samples_leaf=13,
    bootstrap=True,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_ext3, x, y,
    cv=5,
    n_jobs=-1,
    verbose=0,
)



array([ 0.37106241,  0.37176724,  0.37222985,  0.3727799 ,  0.37298625])

In [19]:
# 300-tree bootstrapped extra-trees ensemble, evaluated with the custom
# Gini scorer over 5 folds.
mod_ext3 = ens.ExtraTreesClassifier(
    n_estimators=300,
    max_features=36,
    min_samples_leaf=13,
    bootstrap=True,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_ext3, x, y,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    verbose=0,
)



array([ 0.00273653, -0.02055012, -0.04512233, -0.03514124, -0.00120721])

In [17]:
# 300-tree forest without class weighting; no `scoring` argument is passed,
# so cross_val_score uses its default scoring.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=300,
    max_features=36,
    min_samples_leaf=13,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    n_jobs=-1,
    verbose=0,
)



array([ 0.37037762,  0.37127743,  0.37213179,  0.37228927,  0.37318271])

In [20]:
# 300-tree forest without class weighting, evaluated with the custom Gini
# scorer over 5 folds.
mod_rf2 = ens.RandomForestClassifier(
    n_estimators=300,
    max_features=36,
    min_samples_leaf=13,
    n_jobs=-1,
    random_state=seed,
)

cv.cross_val_score(
    mod_rf2, x, y,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    verbose=0,
)



array([ 0.00496438, -0.01778813, -0.03808907, -0.02982595,  0.00165149])