In [5]:
import pandas as pd
import category_encoders as ce
import numpy as np

train_feat_url = 'https://raw.githubusercontent.com/will-cotton4/A-LSDS-prediction-kaggle/master/train_features.csv'
train_label_url = 'https://raw.githubusercontent.com/will-cotton4/A-LSDS-prediction-kaggle/master/train_labels.csv'
test_feat_url = 'https://raw.githubusercontent.com/will-cotton4/A-LSDS-prediction-kaggle/master/test_features.csv'

X = pd.read_csv(train_feat_url)
y = pd.read_csv(train_label_url)
X_test = pd.read_csv(test_feat_url)

def wrangle_labels(y):
  y = y.copy()
  y = y.set_index('id')
  label_dict = {'functional':2, 'functional needs repair': 1, 'non functional': 0}
  y = y.replace(label_dict)
  return y

y = wrangle_labels(y)

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, 
                                                  random_state=42, stratify=y)

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer 

def wrangle(X):
  X = X.copy()
  X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
  X['date_recorded_month'] = X['date_recorded'].dt.month.astype('object')
  X['date_recorded_year'] = X['date_recorded'].dt.year.astype('object')
  X['date_recorded_day'] = X['date_recorded'].dt.day.astype('object')
  X = X.drop(columns='date_recorded')
  
  imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
  X =  pd.DataFrame(imp_mode.fit_transform(X), columns=X.columns)
  drop_cols = ['longitude', 'latitude']

  #   X = X.drop(columns=drop_cols)
#   cat_cols = X.select_dtypes(['object']).columns.tolist()

#   binary_ce = ce.BinaryEncoder(cols=cat_cols, verbose=10, return_df=True)

#   X = binary_ce.fit_transform(X)
#   # Remove unnecessary/sparse features
  
#   drop_cols = ['longitude', 'latitude', 'region', 'recorded_by', 'wpt_name', 'num_private',
#                'scheme_management', 'scheme_name', 'extraction_type', # Might add scheme_management back
#               'extraction_type_group', 'quantity_group', 'quality_group'] # These group features might be useful depending on how the cat. encoding goes
  
  
#   # One-hot encoding:
#   to_one_hot = ['extraction_type_class', 'waterpoint_type_group', 
#                 'management_group', 'payment_type', 'source_class',
#                 'basin']
#   X = pd.get_dummies(X, prefix_sep='_', columns=to_one_hot)

#   # Ordinal encoding:
  
#   # Binary encoding:
#   to_binary = ['public_meeting']
  
#   # Future cleaning:
#   to_be_cleaned = ['funder', 'installer', 'date_recorded', 'subvillage',
#                   'public_meeting', 'permit', 'lga', 'ward', 'management', 
#                    'payment', 'water_quality',
#                    'quantity', 'source', 'source_type', 'waterpoint_type']
#   X = X.drop(columns=to_be_cleaned)
  
  
  return X


X_train = wrangle(X_train)
X_val = wrangle(X_val)
X_test = wrangle(X_test)

cat_cols = X_train.select_dtypes(['object']).columns.tolist()

binary_ce = ce.BinaryEncoder(cols=cat_cols, verbose=10, return_df=True)

X_train = binary_ce.fit_transform(X_train)
X_val = binary_ce.transform(X_val)
X_test = binary_ce.transform(X_test)

## Implementing Kaggle RSCV code on AWS

In [4]:
!pip install xgboost
!pip install category-encoders

[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
%time
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

param_distributions = {
    'n_estimators': randint(100,1000), 
    'max_depth': randint(30,100)
}

gridsearch = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42), 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=3, 
    scoring='accuracy', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=2
)

gridsearch.fit(X_train, y_train)

print(gridsearch.best_score_)

estimator = gridsearch.best_estimator_

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.68 µs
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   19.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   37.2s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   58.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:  3.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


0.7944654882154882


In [8]:
%time
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

param_distributions = {
    'n_estimators': randint(100,1000), 
    'max_depth': randint(30,100)
}

gridsearch = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=16, random_state=42), 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=3, 
    scoring='accuracy', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

gridsearch.fit(X_train, y_train)

print(gridsearch.best_score_)

estimator = gridsearch.best_estimator_

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  30 | elapsed:   35.3s remaining:  5.3min
[Parallel(n_jobs=-1)]: Done   7 out of  30 | elapsed:  1.2min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:  1.5min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  1.7min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:  2.2min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  2.6min remaining:   48.2s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  3.0min remaining:   19.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


0.7945917508417508
