In [None]:
# from torchvision import transforms, models
# from torch.utils.data.sampler import SubsetRandomSampler
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, r2_score
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import Ridge
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Loading & Cleaning Datasets

In [21]:
train = pd.read_csv('train_features.csv')
train_labels = pd.read_csv('train_labels.csv')
test = pd.read_csv('test_features.csv')
sample = pd.read_csv('sample_submission.csv')

In [22]:
train.shape

(59400, 40)

In [23]:
df = train
df_test = test
labels = train_labels

In [24]:
# dropping duplicate rows
df = df.drop_duplicates()

In [25]:
df.shape

(59400, 40)

In [26]:
df = train.drop(['scheme_name', 'num_private',
                     'date_recorded', 'ward', 'recorded_by',
                     'latitude', 'longitude', 'id'], axis=1)

In [27]:
# Filling NAs with mode for each respective column
fill_columns = ['public_meeting', 'permit', 'scheme_management',
                'funder', 'subvillage', 'installer']

for col in fill_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [28]:
# Filling 0s in 'construction_year' with ffill method
df['construction_year'] = df['construction_year'].replace(0, np.nan)
df['construction_year'] = df['construction_year'].fillna(method='ffill')

In [29]:
df.shape

(59400, 32)

In [30]:
df_clean = df

In [416]:
# df_clean.to_csv('df_clean1.csv')

In [31]:
df.shape, df_clean.shape

((59400, 32), (59400, 32))

## New Label Encoded Train Dataset

In [32]:
from sklearn.preprocessing import LabelEncoder

#Auto encodes any dataframe column of type category or object.
def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category','object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except:
                print('Error encoding '+feature)
        return df

In [33]:
# Creating encoded dataframe (df_en)
# Using scikitlearn's LabelEncoder for categorical & object features
df_en = dummyEncode(df_clean)
df_en.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,6000.0,1369,1390,1518,37399,1,11807,3,11,5,51,109,True,7,False,1999.0,3,1,0,7,4,2,0,6,2,1,1,8,6,0,1,1
1,0.0,469,1399,545,37195,4,15838,9,20,2,103,280,True,2,True,2010.0,3,1,0,11,4,0,2,6,2,2,2,5,3,1,1,1
2,25.0,825,686,2048,14572,5,9074,8,21,4,108,250,True,7,True,2009.0,3,1,0,7,4,4,5,6,2,1,1,0,1,1,2,1
3,0.0,1741,263,1852,37285,7,8982,12,90,63,87,58,True,7,True,1986.0,14,10,5,7,4,0,2,6,2,0,0,3,0,0,2,1
4,0.0,20,0,119,35529,4,7698,4,18,1,26,0,True,7,True,1986.0,3,1,0,1,1,0,2,6,2,3,3,5,3,1,1,1


In [34]:
df_en.shape

(59400, 32)

## New Label Encoded Test Dataset (target)

In [35]:
df_test.shape

(14358, 40)

In [36]:
df_test1 = df_test.drop(['scheme_name', 'num_private',
                     'date_recorded', 'ward', 'recorded_by',
                     'latitude', 'longitude', 'id'], axis=1)

In [37]:
for col in fill_columns:
    df_test1[col] = df_test1[col].fillna(df_test1[col].mode()[0])

In [38]:
# Filling 0s in 'construction_year' with ffill method
df_test1['construction_year'] = df_test1['construction_year'].replace(0, np.nan)
df_test1['construction_year'] = df_test1['construction_year'].fillna(method='ffill')

In [39]:
df_test1c = df_test1

In [437]:
# df_test1c.to_csv('df_test_clean.csv')

In [40]:
df_test1.shape, df_test1c.shape

((14358, 32), (14358, 32))

In [41]:
# Using scikitlearn's LabelEncoder for categorical & object features
df_test1c = dummyEncode(df_test1c)
df_test1c.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.0,174,1996,214,633,0,3807,8,21,3,62,321,True,2,True,2012.0,9,6,3,3,2,0,2,6,2,3,3,5,3,1,6,5
1,0.0,247,1569,219,1727,5,2634,0,2,2,0,300,True,6,True,2000.0,3,1,0,7,4,0,2,6,2,2,2,8,6,0,1,1
2,0.0,247,1567,219,9483,0,5271,18,13,2,108,500,True,6,True,2010.0,9,6,3,7,4,0,2,6,2,2,2,5,3,1,6,5
3,0.0,220,267,259,5467,7,2710,7,80,43,48,250,True,6,True,1987.0,9,6,3,7,4,6,6,6,2,0,0,7,5,0,6,5
4,500.0,72,1260,75,5573,7,3442,16,10,3,60,60,True,9,True,2000.0,3,1,0,9,4,3,1,6,2,1,1,8,6,0,1,1


In [42]:
df_test1c.shape

(14358, 32)

### Spliting into Train & Validation

In [43]:
def split(x, y):
    # Hold out an "out-of-time" test set, from the last 100 days of data
    
    X_train = x[:-14358]
    X_val = x[-14358:]   # same shape as test data

    y_train  = y[:-14358]
    y_val  = y[-14358:]
    
    return X_train, X_val, y_train, y_val

In [44]:
X_train, X_val, y_train, y_val = split(df_en, labels.status_group)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((45042, 32), (14358, 32), (45042,), (14358,))

### Encoding Target Dataset (labels)

In [45]:
# encoding target features (y_train & y_val)
y_train_encoded = y_train.map({'non functional': 0,
                              'functional needs repair': 1, 'functional': 2})
y_val_encoded= y_val.map({'non functional': 0,
                              'functional needs repair': 1, 'functional': 2})

### Trying Out a Model with RandomForest

In [46]:
pipe = make_pipeline(
    RandomForestClassifier(n_estimators=4000, criterion ='entropy', n_jobs=3))

In [47]:
param_grid = {
    #'randomforestclassifier__n_estimators': [100],
    'randomforestclassifier__max_depth': [None]
}

In [48]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=4)

In [49]:
grid.fit(X_train, y_train_encoded)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] randomforestclassifier__max_depth=None ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  randomforestclassifier__max_depth=None, score=0.7984681984681985, total= 4.2min
[CV] randomforestclassifier__max_depth=None ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.5min remaining:    0.0s


[CV]  randomforestclassifier__max_depth=None, score=0.7953909684294659, total=23.3min
[CV] randomforestclassifier__max_depth=None ..........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 28.2min remaining:    0.0s


[CV]  randomforestclassifier__max_depth=None, score=0.7932458535935523, total= 3.6min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 32.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 32.3min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4000, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestclassifier__max_depth': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [50]:
grid.best_score_

0.7957017894409663

In [51]:
grid.score(X_val, y_val_encoded)

0.7980220086363004

In [None]:
grid.best_estimator_

In [526]:
y_pred = grid.predict(df_test1c)
y_pred

array([0, 2, 2, ..., 2, 2, 0], dtype=int64)

In [527]:
model_cv3s2000_dict = {'id': test.id.values, 
                              'status_group': y_pred}
model_cv3s2000 = pd.DataFrame(data=model_cv3s2000_dict).set_index('id')

In [528]:
model_cv3s2000.status_group = model_cv3s2000.status_group.map({0:'non functional', 
                     1:'functional needs repair', 2:'functional'})

In [529]:
model_cv3s2000.shape

(14358, 1)

In [530]:
#model_cv3s2000.to_csv('model_cv3s2000.csv')

### MLPClassifier Model

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
pipe = make_pipeline(
    StandardScaler(),
    MLPClassifier([128, 32], max_iter=50, learning_rate='adaptive'))

In [None]:
param_grid = {
    'mlpclassifier__alpha': [0.05, .07, .1]
}

In [None]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=3)

In [None]:
grid.fit(X_train, y_train_encoded)

In [None]:
grid.best_score_

In [None]:
grid.score(X_val, y_val_encoded)

In [None]:
grid.best_estimator_

In [None]:
df_clean.head()

## Dataset: One-Hot Encoded & Label Encoded 

In [444]:
df_oh = df

In [445]:
df_oh.amount_tsh.nunique()

98

In [None]:
(df_oh).dtypes

In [447]:
df_oh = pd.get_dummies(df_oh, columns=['amount_tsh', 'public_meeting', 'permit'], drop_first=True)

In [None]:
(df_oh).dtypes

In [449]:
# Creating encoded dataframe
# Using scikitlearn's LabelEncoder for categorical & object features
df_oh = dummyEncode(df_oh)
df_oh.head()

Unnamed: 0,funder,gps_height,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,population,scheme_management,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,amount_tsh_0.2,amount_tsh_0.25,amount_tsh_1.0,amount_tsh_2.0,amount_tsh_5.0,amount_tsh_6.0,amount_tsh_7.0,amount_tsh_9.0,amount_tsh_10.0,amount_tsh_12.0,amount_tsh_15.0,amount_tsh_20.0,amount_tsh_25.0,amount_tsh_26.0,amount_tsh_30.0,amount_tsh_33.0,amount_tsh_35.0,amount_tsh_40.0,amount_tsh_50.0,amount_tsh_53.0,amount_tsh_59.0,amount_tsh_60.0,amount_tsh_70.0,amount_tsh_100.0,amount_tsh_150.0,amount_tsh_200.0,amount_tsh_220.0,amount_tsh_250.0,amount_tsh_300.0,amount_tsh_306.0,amount_tsh_350.0,amount_tsh_400.0,amount_tsh_450.0,amount_tsh_500.0,amount_tsh_520.0,amount_tsh_550.0,amount_tsh_590.0,amount_tsh_600.0,amount_tsh_700.0,amount_tsh_750.0,amount_tsh_800.0,amount_tsh_900.0,amount_tsh_1000.0,amount_tsh_1200.0,amount_tsh_1300.0,amount_tsh_1400.0,amount_tsh_1500.0,amount_tsh_2000.0,amount_tsh_2200.0,amount_tsh_2400.0,amount_tsh_2500.0,amount_tsh_2800.0,amount_tsh_3000.0,amount_tsh_3500.0,amount_tsh_3600.0,amount_tsh_4000.0,amount_tsh_4500.0,amount_tsh_4700.0,amount_tsh_5000.0,amount_tsh_5400.0,amount_tsh_5500.0,amount_tsh_6000.0,amount_tsh_6300.0,amount_tsh_6500.0,amount_tsh_7000.0,amount_tsh_7200.0,amount_tsh_7500.0,amount_tsh_8000.0,amount_tsh_8500.0,amount_tsh_9000.0,amount_tsh_10000.0,amount_tsh_11000.0,amount_tsh_12000.0,amount_tsh_13000.0,amount_tsh_14000.0,amount_tsh_15000.0,amount_tsh_16000.0,amount_tsh_16300.0,amount_tsh_18000.0,amount_tsh_20000.0,amount_tsh_25000.0,amount_tsh_26000.0,amount_tsh_30000.0,amount_tsh_38000.0,amount_tsh_40000.0,amount_tsh_45000.0,amount_tsh_50000.0,amount_tsh_60000.0,amount_tsh_70000.0,amount_tsh_100000.0,amount_tsh_117000.0,amount_tsh_120000.0,amount_tsh_138000.0,amount_tsh_170000.0,amount_tsh_200000.0,amount_tsh_250000.0,amount_tsh_350000.0,public_meeting_True,permit_True
0,1369,1390,1518,37399,1,11807,3,11,5,51,109,7,1999.0,3,1,0,7,4,2,0,6,2,1,1,8,6,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,469,1399,545,37195,4,15838,9,20,2,103,280,2,2010.0,3,1,0,11,4,0,2,6,2,2,2,5,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,825,686,2048,14572,5,9074,8,21,4,108,250,7,2009.0,3,1,0,7,4,4,5,6,2,1,1,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,1741,263,1852,37285,7,8982,12,90,63,87,58,7,1986.0,14,10,5,7,4,0,2,6,2,0,0,3,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,20,0,119,35529,4,7698,4,18,1,26,0,7,1986.0,3,1,0,1,1,0,2,6,2,3,3,5,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [450]:
# df_oh.to_csv('df_onehot.csv', index=False)

In [451]:
df_oh.shape

(59400, 128)

## Test Dataset (target): One-Hot Encoded & Label Encoded 

In [498]:
df_test2 = df_test

In [499]:
df_test2.shape

(14358, 40)

In [500]:
df_test2 = df_test2.drop(['scheme_name', 'num_private',
                     'date_recorded', 'ward', 'recorded_by',
                     'latitude', 'longitude', 'id'], axis=1)

In [501]:
for col in fill_columns:
    df_test2[col] = df_test2[col].fillna(df_test2[col].mode()[0])

In [502]:
# Filling 0s in 'construction_year' with ffill method
df_test2['construction_year'] = df_test2['construction_year'].replace(0, np.nan)
df_test2['construction_year'] = df_test2['construction_year'].fillna(method='ffill')

In [503]:
df_test2 = pd.get_dummies(df_test2, columns=['amount_tsh', 'public_meeting', 'permit'], drop_first=True)

In [504]:
# Using scikitlearn's LabelEncoder for categorical & object features
df_test2 = dummyEncode(df_test2)
df_test2.head()

Unnamed: 0,funder,gps_height,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,population,scheme_management,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,amount_tsh_0.2,amount_tsh_0.5,amount_tsh_3.0,amount_tsh_5.0,amount_tsh_6.0,amount_tsh_7.0,amount_tsh_10.0,amount_tsh_15.0,amount_tsh_20.0,amount_tsh_25.0,amount_tsh_30.0,amount_tsh_33.0,amount_tsh_35.0,amount_tsh_40.0,amount_tsh_50.0,amount_tsh_60.0,amount_tsh_70.0,amount_tsh_100.0,amount_tsh_150.0,amount_tsh_200.0,amount_tsh_250.0,amount_tsh_300.0,amount_tsh_350.0,amount_tsh_400.0,amount_tsh_450.0,amount_tsh_500.0,amount_tsh_550.0,amount_tsh_600.0,amount_tsh_700.0,amount_tsh_750.0,amount_tsh_1000.0,amount_tsh_1200.0,amount_tsh_1500.0,amount_tsh_2000.0,amount_tsh_2200.0,amount_tsh_2400.0,amount_tsh_2500.0,amount_tsh_2550.0,amount_tsh_2800.0,amount_tsh_3000.0,amount_tsh_3500.0,amount_tsh_3600.0,amount_tsh_4000.0,amount_tsh_4700.0,amount_tsh_5000.0,amount_tsh_6000.0,amount_tsh_6500.0,amount_tsh_7000.0,amount_tsh_7200.0,amount_tsh_7500.0,amount_tsh_8000.0,amount_tsh_10000.0,amount_tsh_12000.0,amount_tsh_14000.0,amount_tsh_15000.0,amount_tsh_18000.0,amount_tsh_20000.0,amount_tsh_25000.0,amount_tsh_30000.0,amount_tsh_35000.0,amount_tsh_40000.0,amount_tsh_50000.0,amount_tsh_60000.0,amount_tsh_70000.0,amount_tsh_100000.0,amount_tsh_200000.0,public_meeting_True,permit_True
0,174,1996,214,633,0,3807,8,21,3,62,321,2,2012.0,9,6,3,3,2,0,2,6,2,3,3,5,3,1,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,247,1569,219,1727,5,2634,0,2,2,0,300,6,2000.0,3,1,0,7,4,0,2,6,2,2,2,8,6,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,247,1567,219,9483,0,5271,18,13,2,108,500,6,2010.0,9,6,3,7,4,0,2,6,2,2,2,5,3,1,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,220,267,259,5467,7,2710,7,80,43,48,250,6,1987.0,9,6,3,7,4,6,6,6,2,0,0,7,5,0,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,72,1260,75,5573,7,3442,16,10,3,60,60,9,2000.0,3,1,0,9,4,3,1,6,2,1,1,8,6,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [505]:
df_test2.shape

(14358, 97)

In [506]:
#df_test2.to_csv('df_test_oh_clean.csv')

### Spliting into Train & Validation

In [452]:
X_train2, X_val2, y_train2, y_val2 = split(df_oh, labels.status_group)

X_train2.shape, X_val2.shape, y_train2.shape, y_val2.shape

((45042, 128), (14358, 128), (45042,), (14358,))

In [453]:
# encoding target features (y_train & y_val)
y_train_encoded2 = y_train2.map({'non functional': 0,
                              'functional needs repair': 1, 'functional': 2})
y_val_encoded2 = y_val2.map({'non functional': 0,
                              'functional needs repair': 1, 'functional': 2})

### Trying a Model with Random Forest Classifier

In [458]:
pipe2 = make_pipeline(
    RandomForestClassifier(n_estimators=1200, criterion ='entropy', n_jobs=3))

In [459]:
param_grid2 = {
    #'randomforestclassifier__n_estimators': [100],
    'randomforestclassifier__max_depth': [None]
}

In [460]:
grid2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=8, verbose=4)

In [461]:
grid2.fit(X_train2, y_train_encoded2)

Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.8094815340909091, total= 1.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.8058959332267803, total= 1.1min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s


[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.800923459421062, total= 1.2min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.9min remaining:    0.0s


[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.7911561001598295, total= 1.2min
[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.8103019538188277, total= 1.2min
[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.7976550008882572, total= 1.2min
[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.800497424054006, total= 1.1min
[CV] randomforestclassifier__max_depth=None ..........................
[CV]  randomforestclassifier__max_depth=None, score=0.7955231835139457, total= 1.6min


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 10.9min finished


GridSearchCV(cv=8, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestclassifier__max_depth': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [466]:
grid2.best_score_

0.8014297766529017

In [467]:
grid2.score(X_val2, y_val_encoded2)

0.7978827134698426

In [468]:
grid2.best_estimator_

Pipeline(memory=None,
     steps=[('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [507]:
y_pred2 = grid2.predict(df_test2)
y_pred2

ValueError: Number of features of the model must match the input. Model n_features is 128 and input n_features is 97 

In [None]:
model_oh1200_dict = {'id': test.id.values, 
                              'status_group': y_pred2}
model_oh1200 = pd.DataFrame(data=model_oh1200_dict).set_index('id')

In [None]:
model_oh1200.status_group = model_oh1200.status_group.map({0:'non functional', 
                     1:'functional needs repair', 2:'functional'})

In [None]:
model_oh1200.shape

In [None]:
model_oh1200.to_csv('model_oh1200.csv')

## Gradient Boosting Classifier

In [None]:
pipe = make_pipeline(
    GradientBoostingClassifier(n_estimators=300))

In [None]:
param_grid = {
    #'gradientboostingclassifier__n_estimators': [100],
    'gradientboostingclassifier__learning_rate': [1]
}

In [None]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=4, verbose=4)

In [None]:
grid.fit(X_train, y_train_encoded)

In [None]:
grid.best_score_

In [None]:
grid.score(X_val, y_val_encoded)

In [None]:
grid.best_estimator_

### MLPClassifier Model

In [None]:
pipe = make_pipeline(
    StandardScaler(),
    MLPClassifier([128, 32], max_iter=50, learning_rate='adaptive'))

In [None]:
param_grid = {
    'mlpclassifier__alpha': [0.05, .07, .1]
}

In [None]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=3)

In [None]:
grid.fit(X_train, y_train_encoded)

In [None]:
grid.best_score_

In [None]:
grid.score(X_val, y_val_encoded)

In [None]:
grid.best_estimator_