In [178]:
import pandas as pd
import numpy as np
from patsy import dmatrix, dmatrices
%pylab inline
import warnings
warnings.filterwarnings('ignore')

pump_data = pd.read_csv('pump_training_data.csv')
pump_data['yr_recorded'] = pump_data['date_recorded'].apply(lambda x: x.split('-')[0])
test_data = pd.read_csv('pump_test_data.csv')
test_data['yr_recorded'] = test_data['date_recorded'].apply(lambda x: x.split('-')[0])
pump_status = pd.read_csv('pump_status_group.csv')


# district_dummies = pd.get_dummies(pump_data['district_code'], prefix='district_')
# pump_data = pd.concat([pump_data.drop('district_code', axis=1).T, district_dummies.T]).T
pump_data['district_code'] = pump_data['district_code'].apply(str)
test_data['district_code'] = test_data['district_code'].apply(str)

pump_data.loc[:, 'yr_recorded'] = pump_data.loc[:, 'yr_recorded'].convert_objects(convert_numeric=True)
pump_data['age'] = pump_data['yr_recorded'] - pump_data['construction_year']
test_data.loc[:, 'yr_recorded'] = test_data.loc[:, 'yr_recorded'].convert_objects(convert_numeric=True)
test_data['age'] = test_data['yr_recorded'] - test_data['construction_year']

pumps_train_clean = pd.read_csv('training_clean_dan.csv')
test_clean = pd.read_csv('test_clean_dan.csv')
test_clean['id'] = test_data['id']

pumps = pd.merge(pump_data, pump_status, on='id')

# no ward
pumps = pumps.drop(['date_recorded', 'quantity_group','wpt_name', 'subvillage', 'region_code', 'management_group',
                   'id', 'extraction_type_group', 'extraction_type_class', 'num_private', 'source_type', 'ward',
                   'quality_group', 'waterpoint_type_group', 'recorded_by', 'payment_type', 'scheme_name',
                   'yr_recorded', 'longitude', 'latitude', 'gps_height', 'population', 'construction_year'], axis=1)

test_data = test_data.drop(['date_recorded', 'quantity_group','wpt_name', 'subvillage', 'region_code', 'management_group',
                   'id', 'extraction_type_group', 'extraction_type_class', 'num_private', 'source_type', 'ward',
                   'quality_group', 'waterpoint_type_group', 'recorded_by', 'payment_type', 'scheme_name',
                   'yr_recorded', 'longitude', 'latitude', 'gps_height', 'population', 'construction_year'], axis=1)


# don't use latitude or longitude
# pumps = pumps.drop(['date_recorded', 'quantity_group','wpt_name', 'subvillage', 'region_code', 'management_group',
#                    'id', 'extraction_type_group', 'extraction_type_class', 'num_private', 'source_type', 'ward',
#                    'quality_group', 'waterpoint_type_group', 'recorded_by', 'payment_type', 'scheme_name',
#                    'latitude', 'longitude', 'yr_recorded], axis=1)


# Dan's list of dropped features
# "id", "subvillage","region_code", "village", "lga", "extraction_type_group", "extraction_type_class",
# "water_quality", "quality_group", "source", "source_class", "waterpoint_type_group", "recorded_by",
# "scheme_management", "scheme_name" , "public_meeting", "num_private", "amount_tsh", "dist_code",
# "gps_height", "wpt_name", "ward", "management_group", "payment_type", "quantity_group

# replace funder & installer with clean
pumps['funder'] = pumps_train_clean['funder']
pumps['installer'] = pumps_train_clean['installer']
test_data['funder'] = test_clean['funder']
test_data['installer'] = test_clean['installer']

# fill missing values
pumps.loc[pumps['permit'].isnull(), 'permit'] = 'Unknown'
pumps.loc[pumps['public_meeting'].isnull(), 'public_meeting'] = 'Unknown'
pumps.loc[pumps['scheme_management'].isnull(), 'scheme_management'] = 'Other'
test_data.loc[test_data['permit'].isnull(), 'permit'] = 'Unknown'
test_data.loc[test_data['public_meeting'].isnull(), 'public_meeting'] = 'Unknown'
test_data.loc[test_data['scheme_management'].isnull(), 'scheme_management'] = 'Other'

Populating the interactive namespace from numpy and matplotlib


In [179]:
formula = 'status_group ~ ' + ' + '.join(['Q("'+x+'")' for x in pumps.columns.values[:-1]])
formula

'status_group ~ Q("amount_tsh") + Q("funder") + Q("installer") + Q("basin") + Q("region") + Q("district_code") + Q("lga") + Q("public_meeting") + Q("scheme_management") + Q("permit") + Q("extraction_type") + Q("management") + Q("payment") + Q("water_quality") + Q("quantity") + Q("source") + Q("source_class") + Q("waterpoint_type") + Q("age")'

In [180]:
Y, X = dmatrices(formula, pumps, return_type='dataframe')
X.shape

(59400, 311)

In [181]:
test_formula = ' + '.join(['Q("'+x+'")' for x in test_data.columns.values[:-1]])
actual_test = dmatrix(test_formula, test_data, return_type='dataframe')
actual_test.shape

(14850, 308)

In [182]:
# remove columns from train and test that are not found in the other
x_not_actual = []
actual_not_x = []

for i in actual_test.columns.values:
    if i not in X.columns.values:
        actual_not_x.append(i)

for i in X.columns.values:
    if i not in actual_test.columns.values:
        x_not_actual.append(i)

X = X.drop(x_not_actual, axis=1)
actual_test = actual_test.drop(actual_not_x, axis=1)

print X.shape
print actual_test.shape

(59400, 305)
(14850, 305)


In [183]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn import metrics, tree

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

model_rf = RandomForestClassifier(n_estimators=50, random_state=1)
model_rf.fit(X_train, y_train)

prediction = model_rf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, prediction)
accuracy

0.76475869809203145

In [184]:
rf_pred = pd.DataFrame(prediction, columns=['functional', 'needs_repair', 'non_functional'])
y_true = pd.DataFrame(y_test, columns=['functional', 'needs_repair', 'non_functional'])

#pd.crosstab(rf_pred.index, y_true.columns, row_names=rf_pred.columns.values)
# pumps.groupby(['functional'])
y_true['status_group'] = 'functional'
y_true.loc[y_true['needs_repair']==1, 'status_group'] = 'needs_repair'
y_true.loc[y_true['non_functional']==1, 'status_group'] = 'non_functional'


rf_pred['status_group'] = 'functional'
rf_pred.loc[rf_pred['needs_repair']==1, 'status_group'] = 'needs_repair'
rf_pred.loc[rf_pred['non_functional']==1, 'status_group'] = 'non_functional'

rf_pred['true_status'] = y_true['status_group']


xtab = pd.crosstab(rf_pred['true_status'], rf_pred['status_group'], margins=True)
xtab

status_group,functional,needs_repair,non_functional,All
true_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
functional,8675,171,845,9691
needs_repair,757,326,187,1270
non_functional,1780,75,5004,6859
All,11212,572,6036,17820


In [185]:
feat_imp = pd.DataFrame(model_rf.feature_importances_, index=X.columns).sort(0, ascending=False)
no_importance = feat_imp[feat_imp[0]==0].index.values
no_importance

array(['Intercept'], dtype=object)

In [186]:
feat_imp[:10]

Unnamed: 0,0
"Q(""quantity"")[T.enough]",0.066501
"Q(""amount_tsh"")",0.050848
"Q(""waterpoint_type"")[T.other]",0.044582
"Q(""quantity"")[T.insufficient]",0.037885
"Q(""extraction_type"")[T.other]",0.029368
"Q(""quantity"")[T.seasonal]",0.019396
"Q(""funder"")[T.other]",0.017135
"Q(""waterpoint_type"")[T.communal standpipe]",0.01697
"Q(""permit"")[T.True]",0.015681
"Q(""public_meeting"")[T.True]",0.014563


In [187]:
feat_imp = pd.DataFrame(model_rf.feature_importances_, index=X.columns).sort(0, ascending=False)
no_importance = feat_imp[feat_imp[0]==0].index.values
no_importance

array(['Intercept'], dtype=object)

In [188]:
num_est = [50]
num_leaves = [500, 1000, 3000, 7000]
num_features = [75]
# still need to play with n_estimators

model_list = []
for i in num_est:
    for j in num_features:
        for k in num_leaves:
            # generate model to test
            model_tuple = (str(i)+'_trees_'+str(j)+'_feat_'+str(k)+'_leaf'
                           , RandomForestClassifier(n_estimators = i,
                                                    random_state = 58,
                                                    max_features = j,
                                                    max_leaf_nodes = k,
                                                    n_jobs=-1))
            # add to list
            model_list.append(model_tuple)

accuracy_train = {}
accuracy_test = {}
for (name, model) in model_list:
    model.fit(X_train, y_train)
    prediction_train = model.predict(X_train)
    accuracy_train[name] = metrics.accuracy_score(y_train, prediction_train)
    prediction_test = model.predict(X_test)
    accuracy_test[name] = metrics.accuracy_score(y_test, prediction_test)
    
df = pd.DataFrame({'Training Accuracy':accuracy_train, 'Test Accuracy':accuracy_test})
df.sort('Test Accuracy', ascending=False)
    


Unnamed: 0,Test Accuracy,Training Accuracy
50_trees_75_feat_3000_leaf,0.773401,0.847138
50_trees_75_feat_7000_leaf,0.771324,0.866114
50_trees_75_feat_1000_leaf,0.765376,0.798653
50_trees_75_feat_500_leaf,0.75651,0.775998


In [189]:
def convert_pred_to_df(np_array):
    pred_df = pd.DataFrame(test_clean['id'])
    pred_df['status_group'] = 'none'
    for i, ans in enumerate(np_array):
        if ans[0]==1:
            pred_df.loc[i, 'status_group'] = 'functional'
        elif ans[1]==1:
            pred_df.loc[i, 'status_group'] = 'functional needs repair'
        else:
            pred_df.loc[i, 'status_group'] = 'non functional'
    return pred_df

In [190]:
model_rf_test = RandomForestClassifier(n_estimators=90, 
                                       random_state=18, 
                                       max_features=75,
                                       max_leaf_nodes=6000)
model_rf_test.fit(X, Y)

actual_pred = model_rf.predict(actual_test)
submission = convert_pred_to_df(actual_pred)


In [191]:
submission.to_csv('submission1.csv', index=False)

In [192]:
df.sort('Training Accuracy', ascending=False)

Unnamed: 0,Test Accuracy,Training Accuracy
50_trees_75_feat_7000_leaf,0.771324,0.866114
50_trees_75_feat_3000_leaf,0.773401,0.847138
50_trees_75_feat_1000_leaf,0.765376,0.798653
50_trees_75_feat_500_leaf,0.75651,0.775998
