In [10]:
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = 'data/'

In [11]:
import pandas as pd

# Merge train_features.csv & train_labels.csv
train = pd.merge(pd.read_csv(DATA_PATH+'waterpumps/train_features.csv'), 
                 pd.read_csv(DATA_PATH+'waterpumps/train_labels.csv'))

# Read test_features.csv & sample_submission.csv
test = pd.read_csv(DATA_PATH+'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split

train.shape, test.shape

((59400, 41), (14358, 40))

In [13]:

def wrangle(X):
      
    # Prevent errors with propogation of changes to master matrix
    X = X.copy()
    
    # Treat near zero as zero
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # When columns have zeros and shouldn't, they are like null values.
    # So we will replace the zeros with nulls, and impute missing values later.
    # Also create a "missing indicator" column, because the fact that
    # values are missing may be a predictive signal.
    cols_with_zeros = ['longitude', 'latitude', 'construction_year', 
                       'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        X[col+'_MISSING'] = X[col].isnull()
            
    # Drop duplicate columns
    duplicates = ['quantity_group', 'payment_type']
    X = X.drop(columns=duplicates)
    
    # Drop unusable features
    unusable_variance = ['recorded_by', 'id']
    X = X.drop(columns=unusable_variance)
    
#     # Convert date_recorded to datetime
#     X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
# #     Extract components from date_recorded, then drop the original column
#     X['year_recorded'] = X['date_recorded'].dt.year
#     X['month_recorded'] = X['date_recorded'].dt.month
#     X['day_recorded'] = X['date_recorded'].dt.day
#     X = X.drop(columns='date_recorded')
    
# #     Engineer feature: how many years from construction_year to date_recorded
#     X['years'] = X['year_recorded'] - X['construction_year']
#     X['years_MISSING'] = X['years'].isnull()
    
    # return the wrangled dataframe
    return X

In [14]:

def split_wrangle(train,test):
  """Creates a 3-way train,val,test split and wrangles data. 
  Returns train, test, val sets"""
  train, val = train_test_split(train, train_size=0.80, test_size=0.20,
                              stratify=train['status_group'], random_state=20)
  # Use wrangle function on split data sets
  a = wrangle(train)  
  b = wrangle(val)
  c = wrangle(test)

  return a,b,c

train, val, test = split_wrangle(train,test)
train.shape, val.shape, test.shape

((47520, 42), (11880, 42), (14358, 41))

In [15]:
# Organize into target vector and feature matrices
target = 'status_group'
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

In [16]:
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import f_regression,SelectKBest
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import accuracy_score

In [17]:
# Create modeling pipeline for Hyperperameter optimization
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(n_jobs=-1,random_state=20)
)
param_distributions = {
    'simpleimputer__strategy': ['median','mean','most_frequent'],
    'randomforestclassifier__n_estimators': randint(50, 500),
    'randomforestclassifier__max_depth': (20, 25, 1),
    'randomforestclassifier__max_features': uniform(0, 1),
    'randomforestclassifier__min_samples_leaf': randint(1,10),
    'randomforestclassifier__bootstrap': [0,1]
}
search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=5, 
    scoring='accuracy', 
    verbose=50, 
    return_train_score=True, 
    n_jobs=-1,
    random_state=20
)


In [18]:
search.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   2

In [19]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation MAE', -search.best_score_)

Best hyperparameters {'randomforestclassifier__bootstrap': 0, 'randomforestclassifier__max_depth': 20, 'randomforestclassifier__max_features': 0.7702519331384002, 'randomforestclassifier__min_samples_leaf': 7, 'randomforestclassifier__n_estimators': 381, 'simpleimputer__strategy': 'most_frequent'}
Cross-validation MAE -0.8008627946127946


In [20]:
pipeline = search.best_estimator_

In [21]:
pipeline.fit(X_train, y_train)
print ('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8085016835016835


In [22]:
y_pred = pipeline.predict(X_test)
y_pred

array(['non functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [23]:
# sample_submission = pd.read_csv('https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/waterpumps/sample_submission.csv')
# submission = sample_submission.copy()
# submission['status_group'] = y_pred
# submission.to_csv('tyler-etheridge-sub4.csv', index=False)