In [89]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [90]:
# Read the training and test tables (in that order) from CSV files located in
# the current working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("train_set.csv", "test_set.csv")
)

In [91]:
# Columns that are expanded into one-hot indicator columns. The same list is
# reused for the test set so both frames are encoded from the same source columns.
dummy_cols = [
    'funder', 'installer', 'basin', 'public_meeting', 'scheme_management',
    'permit', 'construction_year', 'extraction_type_class', 'payment',
    'water_quality', 'quantity', 'source', 'source_class', 'waterpoint_type',
    'waterpoint_type_group',
]

# One-hot encode the listed columns; all other columns pass through unchanged.
train = pd.get_dummies(train, columns=dummy_cols)

# Shuffle the rows and renumber the index from 0. The seed is fixed so the row
# order is identical on every run instead of changing each time the cell executes.
train = train.sample(frac=1, random_state=1).reset_index(drop=True)

In [92]:
# One-hot encode the test set from the same source columns as the training set.
test = pd.get_dummies(test, columns=dummy_cols)

# get_dummies only creates indicator columns for the categories present in the
# frame it is given, so the train and test encodings can differ in both column
# membership and column order. Align the test frame to the training feature
# columns (everything except the label): columns missing from test are added and
# filled with 0, columns the model never saw are dropped, and the order matches.
test = test.reindex(columns=train.columns.drop('status_group'), fill_value=0)

In [93]:
# Separate the label column from the model inputs.
target = train['status_group']
features = train.drop('status_group', axis=1)

# Hold out 20% of the rows for validation. train_test_split shuffles before
# splitting, so the seed is fixed to make the split -- and any validation score
# computed from it -- reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, train_size=0.8, random_state=1
)

In [94]:
def model_random_forest(X_train, X_val, y_train, y_val, test):
    """Grid-search the number of trees of a random forest and print the outcome.

    A RandomForestClassifier is tuned over ``n_estimators`` in {500, 750, 1000}
    with 2-fold cross-validation scored by accuracy. The search is fitted on the
    training split, then scored on the validation split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``GridSearchCV.fit``.
    X_val, y_val : held-out features and labels passed to ``GridSearchCV.score``.
    test : unused; kept so existing calls keep working.

    Returns
    -------
    None. The validation accuracy, the best parameters and the full
    ``cv_results_`` dictionary are printed.
    """
    # Preserved from the original: the body only runs when this code executes as
    # the main script / notebook kernel; otherwise the call is a no-op.
    if __name__ != '__main__':
        return

    # 'sqrt' is the setting the former 'auto' alias selected for classifiers;
    # 'auto' itself is no longer accepted by current scikit-learn releases.
    rf = RandomForestClassifier(criterion='gini',
                                max_features='sqrt',
                                min_samples_split=6,
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

    param_grid = {'n_estimators': [500, 750, 1000]}

    gs = GridSearchCV(estimator=rf,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=2,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)

    best_params = gs.best_params_
    cv_results = gs.cv_results_
    # Scored with the search's own scorer ('accuracy') on the held-out split.
    validation_accuracy = gs.score(X_val, y_val)

    print("Validation accuracy: ", validation_accuracy)
    print(best_params)
    print(cv_results)
    

In [95]:
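# The grid search is slow (mean fit times of roughly 59-81 s per candidate in the
# run recorded below), so the call is left commented out. Uncomment it to re-run
# the search; the output of a previous run is kept underneath for reference.
#model_random_forest(X_train, X_val, y_train, y_val, test)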

# ('Validation accuracy: ', 0.79436026936026938)
# {'n_estimators': 750}
# {'std_train_score': array([ 0.00113211,  0.00109002,  0.00100586]), 'rank_test_score': array([3, 1, 2])
#  , 'mean_score_time': array([ 5.005     ,  4.06449986,  4.84350002]), 
#  'std_test_score': array([ 0.00102219, 0.00104325,  0.00112742]), 
#  'split1_train_score': array([ 0.89781575,  0.89789992,  0.89832078]), 
#  'split0_test_score': array([ 0.78633054,  0.78666723,  0.78633054]), 
#  'mean_test_score': array([ 0.78735269,  0.78771044,  0.78745791]), 
#  'split0_train_score': array([ 0.90007997,  0.90007997,  0.90033251]), 
#  'params': ({'n_estimators': 500}, {'n_estimators': 750}, {'n_estimators': 1000}), 
#  'std_fit_time': array([ 2.05050004,  0.36849999,  0.01399994]), 
#  'std_score_time': array([ 0.76100004,  0.20749998,  0.12749994]), 
#  'param_n_estimators': masked_array(data = [500 750 1000],
#              mask = [False False False],
#        fill_value = ?)
# , 'mean_train_score': array([ 0.89894786,  0.89898994,  0.89932664]), 
#  'mean_fit_time': array([ 58.68850005,  70.90350008,  81.10700011]), 
#  'split1_test_score': array([ 0.78837493,  0.78875374,  0.78858538])}

In [96]:
# Read the submission template and keep only its first column (renamed 'idd'),
# which supplies the row identifiers for the prediction file.
submission_format = pd.read_csv("SubmissionFormat.csv")
submission_format.columns = ['idd', 'status_group']
test_id = submission_format['idd']

In [None]:
def model_for_submission_rf(features, target, test):
    """Fit the tuned random forest on all training rows and write the submission CSV.

    Parameters
    ----------
    features : training feature table passed to ``fit``.
    target : training labels; the label mapping below assumes they are encoded
        as the integers 0, 1 and 2 -- TODO confirm against the preprocessing step.
    test : feature table to predict on.

    Returns
    -------
    None. Writes ``pump_predictions_rf.csv`` to the current working directory.

    Notes
    -----
    Reads the module-level ``test_id`` for the identifier column, so the cell
    that defines ``test_id`` must have been run first.
    """
    # Preserved from the original: the body only runs when this code executes as
    # the main script / notebook kernel; otherwise the call is a no-op.
    if __name__ != '__main__':
        return

    # Fit the forest directly with the tuned settings. The previous one-point
    # GridSearchCV produced this same refitted model, but only after running
    # extra cross-validation fits whose scores were never used.
    # 'sqrt' is what the former 'auto' alias meant for classifiers; 'auto' is
    # no longer accepted by current scikit-learn releases.
    forest = RandomForestClassifier(criterion='gini',
                                    max_features='sqrt',
                                    min_samples_split=6,
                                    random_state=1,
                                    n_estimators=750,
                                    oob_score=True,
                                    n_jobs=-1)
    forest.fit(features, target)

    # Train and test are one-hot encoded independently, so their columns can
    # differ in membership and order. Align test to the fitted feature columns:
    # missing columns are added as 0, unseen ones are dropped.
    if isinstance(features, pd.DataFrame) and isinstance(test, pd.DataFrame):
        test = test.reindex(columns=features.columns, fill_value=0)

    predictions = forest.predict(test)

    # NOTE(review): the header is written as 'ID'; confirm whether the submission
    # checker expects the template's own header spelling instead.
    submit = pd.DataFrame(data={'ID': test_id, 'status_group': predictions},
                          columns=['ID', 'status_group'])

    # Map the integer class codes back to the competition's label strings.
    # All three labels are lowercase ('Functional' was previously capitalised).
    vals_to_replace = {2: 'functional', 1: 'functional needs repair', 0: 'non functional'}

    submit.status_group = submit.status_group.replace(vals_to_replace)
    submit.to_csv('pump_predictions_rf.csv', index=False)

In [None]:
# Train on the complete training set and write pump_predictions_rf.csv.
model_for_submission_rf(features=features, target=target, test=test)