In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost

In [2]:
# Load the prepared training data, indexed by the `id` column.
df = pd.read_csv(
    './train_test_data/prepared_training_data.csv',
    index_col='id',
)

# Load the prepared test data (the rows to predict for submission), indexed the same way.
test_df = pd.read_csv(
    './train_test_data/prepared_test_data.csv',
    index_col='id',
)


In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBClassifier, plot_importance, plot_tree

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score,f1_score,cohen_kappa_score,plot_confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFECV

In [7]:
# Separate the target column from the predictors
Y = df['status_group']
X = df.drop(columns='status_group')

# Names of the object-dtype (categorical) columns that need one-hot encoding,
# collected separately for the training predictors and for the test frame
object_predictors = X.select_dtypes(include=['object']).columns.tolist()
test_object_predictors = test_df.select_dtypes(include=['object']).columns.tolist()

In [8]:
# Display the training predictor columns so they can be compared with the test frame's columns
X.columns

Index(['date_recorded', 'gps_height', 'longitude', 'latitude', 'region',
       'district_code', 'population', 'scheme_management', 'permit',
       'extraction_type', 'management', 'quality_group', 'quantity',
       'source_type', 'waterpoint_type', 'tsh_bins', 'top_funders',
       'top_installers', 'population_size', 'construction_decade'],
      dtype='object')

In [9]:
# Display the test frame's columns; they should match the training predictors shown for X
test_df.columns

Index(['date_recorded', 'gps_height', 'longitude', 'latitude', 'region',
       'district_code', 'scheme_management', 'permit', 'extraction_type',
       'management', 'quality_group', 'quantity', 'source_type',
       'waterpoint_type', 'tsh_bins', 'top_funders', 'top_installers',
       'population_size', 'construction_decade'],
      dtype='object')

In [10]:
 # Make Column transformer for object columns and ignore the rest
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), object_predictors),
    remainder='passthrough')

 # Make Column transformer for object columns and ignore the rest
test_column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), test_object_predictors),
    remainder='passthrough')

# Fit column transformers
X = column_trans.fit_transform(X)
test_data = test_column_trans.fit_transform(test_df)

In [11]:
# Report the dimensions of the encoded training matrix, then the encoded test matrix
for encoded in (X, test_data):
    print(encoded.shape)

(54532, 123)
(14850, 122)


### Train Test Split

In [12]:
# Hold out 20% of the encoded training rows; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, Y, test_size=.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [14]:
# Candidate values for each hyper-parameter explored by the grid search
n_estimators_1 = [300]
max_depth_1 = [20, 30, 40]
learning_rate_1 = [0.1]
min_child_weight_1 = [1]
reg_lamba_1 = [10, 20, 25]
subsample_1 = [0.5, 0.75, 1]

# Map each estimator parameter name to its list of candidate values
param_grid = {
    'max_depth': max_depth_1,
    'n_estimators': n_estimators_1,
    'learning_rate': learning_rate_1,
    'min_child_weight': min_child_weight_1,
    'reg_lambda': reg_lamba_1,
    'subsample': subsample_1,
}

In [15]:
# Cross-validate and grid-search the XGBoost hyper-parameters.
#
# early_stopping_rounds is intentionally NOT set on the estimator: GridSearchCV.fit below is
# called without an eval_set, so XGBoost has no validation data to monitor. Depending on the
# installed xgboost version the setting is then either silently unused or makes every fit
# fail, so it must not be configured unless an eval_set is also supplied.
clf = XGBClassifier(objective='multi:softmax')

# 3-fold stratified CV, shuffled with a fixed seed so fold assignment is repeatable
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# Score every parameter combination by accuracy, running up to 8 fits in parallel
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', n_jobs=8, cv=kfold, verbose=1)

# Search over the full encoded training set
grid_result = grid_search.fit(X, Y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Summarize the search: best score/parameters first, then one line per candidate
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for idx in range(len(params)):
    # mean CV score, its standard deviation across folds, and the parameter combination
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

In [None]:
# Plot the confusion matrix of the best estimator.
# The figure/axes are created BEFORE plotting so that figsize actually applies to the
# confusion matrix; calling plt.figure() afterwards only opens a new, empty figure.
# Note: X, Y are the same data the grid search was fitted on, so this is an in-sample view.
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(grid_result.best_estimator_, X, Y, ax=ax, xticks_rotation=45)
plt.show()

In [None]:
# Predict labels for the encoded test set with the best estimator found by the grid search;
# these predictions are what gets submitted
best_model = grid_result.best_estimator_
test_preds = best_model.predict(test_data)

#### Write test predictions to CSV for submission

In [None]:
# Percentage share of each predicted class, to sanity-check the class ratios
pd.Series(test_preds).value_counts(normalize=True).mul(100)

In [None]:
# Build the submission frame: one row per test `id` with its predicted `status_group`.
# reset_index() returns a copy, so test_df itself is left untouched. Mutating it in place
# (reset_index(inplace=True) plus adding a column) made this cell non-idempotent and changed
# the columns seen by any earlier cell that is re-run afterwards.
submit_df = test_df.reset_index()[['id']].assign(status_group=test_preds)

In [None]:
submit_df.head()

In [None]:
# Write the submission file without the positional index.
# The file name previously ended in "_csv" (no extension); it now ends in ".csv".
submit_df.to_csv('./Predictions/iteration_7.csv', index=False)

In [None]:
import pickle

# Pickle the fitted model.
# `clf` is only the unfitted template handed to GridSearchCV, which fits clones of it; the
# trained model lives in grid_result.best_estimator_, so that is the object to persist.
# The file name embeds the best cross-validation score (4 significant digits).
with open(f'./{grid_result.best_score_:.4}_iteration_5.sav','wb') as f:
    pickle.dump(grid_result.best_estimator_, f)



In [None]:
import pickle

# Load a pickled model back in to check that it deserializes and can be used.
# NOTE(review): this file name differs from the one the save cell writes
# (f'./{best_score:.4}_iteration_5.sav') -- confirm which artifact is meant to be loaded.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
saved_model_path = '0.8005330203672462_iteration_3.sav'
with open(saved_model_path, 'rb') as model_file:
    clf = pickle.load(model_file)

In [None]:
# Sanity check: the model loaded from disk should be able to predict on the encoded test matrix
clf.predict(test_data)