<a href="https://colab.research.google.com/github/bundickm/Predictive_Preventative_Maintenance/blob/master/notebooks/Simplified_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports, Installs, and Initialization

In [4]:
pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
[K     |███▎                            | 10kB 17.9MB/s eta 0:00:01[K     |██████▌                         | 20kB 3.1MB/s eta 0:00:01[K     |█████████▉                      | 30kB 4.4MB/s eta 0:00:01[K     |█████████████                   | 40kB 2.9MB/s eta 0:00:01[K     |████████████████▍               | 51kB 3.6MB/s eta 0:00:01[K     |███████████████████▋            | 61kB 4.2MB/s eta 0:00:01[K     |██████████████████████▉         | 71kB 4.9MB/s eta 0:00:01[K     |██████████████████████████▏     | 81kB 5.5MB/s eta 0:00:01[K     |█████████████████████████████▍  | 92kB 6.2MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 3.7MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.1.0


In [0]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [0]:
# Make sure we can see all columns
pd.set_option('display.max_columns', 100)

def reset():
    #Create a function to quickly and easily reset our data
    
    # Read in all CSV's
    X_train = pd.read_csv('https://raw.githubusercontent.com/bundickm/Kaggle_Water_Pump_Competition/master/train_features.csv')
    X_test = pd.read_csv('https://raw.githubusercontent.com/bundickm/Kaggle_Water_Pump_Competition/master/test_features.csv')
    y_train = pd.read_csv('https://raw.githubusercontent.com/bundickm/Kaggle_Water_Pump_Competition/master/train_labels.csv')

    # Split X_train to training and validation
    return train_test_split(X_train, y_train, random_state=42, test_size=.2)


X_train, X_val, y_train, y_val = reset()

## Cleaning and Feature Engineering

In [0]:
def correct_distribution_issues(df):
    df = df.copy()

    # amount_tsh
    mask = (df['amount_tsh'] > 1500)
    tsh_mean = df[df['amount_tsh'] > 0]['amount_tsh'].mean()
    df.loc[mask, 'amount_tsh'] = tsh_mean

    # population
    df.loc[df['population'] == 0, 'population'] = df.groupby(
                                 'region_code')['population'].transform('mean')

    return df

In [0]:
def to_lowercase(df):
    df = df.copy()
    
    cols = df.select_dtypes(include='object').columns
    for col in cols:
      try:
        df[col] = df[col].str.lower()
      except:
        pass
    return df

In [0]:
def lat_long_correction(df):
    df = df.copy()
    
    # latitude has a floating point precision error so we are selecting via the `>` comparison operator
    df.loc[df['longitude'] == 0, 'longitude'] = df.groupby(
                                 'region_code')['longitude'].transform('mean')
    df.loc[df['latitude'] > -.001, 'latitude'] = df.groupby(
                                 'region_code')['latitude'].transform('mean')
    
    return df

In [0]:
def cleaning(df):
    df = df.copy()
    df = to_lowercase(df)
    df = lat_long_correction(df)
    df = correct_distribution_issues(df)
    
    return df

In [0]:
def feature_engineer(df):
    df = df.copy()
        
    df = cleaning(df)

    df = df[['quantity', 'extraction_type_class', 'waterpoint_type',
             'amount_tsh', 'population', 'latitude', 'longitude']]
    return df

In [0]:
X_train, X_val, y_train, y_val = reset()
X_train = feature_engineer(X_train)

In [0]:
# Encode our categoricals
encoder = ce.OrdinalEncoder()
X_train = encoder.fit_transform(X_train)

In [29]:


forest = RandomForestClassifier(n_jobs=-1, random_state=42)

param_distributions = {'n_estimators':[200],
                       'max_depth':[20]}

search = RandomizedSearchCV(forest, param_distributions=param_distributions,
                            scoring='accuracy', n_iter=1, n_jobs=-1, cv=5,
                            verbose=10, return_train_score=True, 
                            random_state=42)
search.fit(X_train, y_train['status_group'])

print('Training Accuracy Score:', search.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   38.2s remaining:   25.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   47.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   47.6s finished


Training Accuracy Score: 0.7880892255892256


In [30]:
best = search.best_estimator_
X_val = feature_engineer(X_val)
X_val = encoder.transform(X_val)
y_pred = best.predict(X_val)
print('Validation Set Accuracy Score:', 
      accuracy_score(y_val['status_group'], y_pred))

Validation Set Accuracy Score: 0.7956228956228957


In [34]:
dump(best, 'model.joblib', compress=True)

['model.joblib']