In [32]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree
from xgboost import XGBClassifier

In [2]:
# Number of cross-validation folds; passed to create_folds() and train() below.
FOLDS = 5
# Random seed; used as random_state for the RandomizedSearchCV hyper-parameter search.
SEED = 72

In [3]:
def create_folds(df, k=5, seed=None):
    """Shuffle ``df`` and label every row with a stratified fold index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Potability`` column; it is used as the
        stratification target.
    k : int, default 5
        Number of folds (``n_splits`` of ``StratifiedKFold``).
    seed : int or None, default None
        Forwarded to ``DataFrame.sample(random_state=...)``. Pass an integer
        to get the same shuffle (and therefore the same folds) on every call;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with a fresh ``0..n-1`` index and an integer
        ``kfold`` column holding the validation-fold number of each row.
        The frame passed in is not modified.
    """
    # sample() + reset_index() both return new frames, so adding the column
    # afterwards never leaks a 'kfold' column into the caller's DataFrame.
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    shuffled['kfold'] = -1

    y = shuffled.Potability.values
    kf = model_selection.StratifiedKFold(n_splits=k)
    # Only the validation indices matter: each row belongs to exactly one
    # validation split, and that split's number becomes its fold label.
    for f, (_, v_) in enumerate(kf.split(X=shuffled, y=y)):
        shuffled.loc[v_, 'kfold'] = f
    return shuffled

In [97]:
# Seed NumPy's global RNG first: the shuffle inside create_folds() draws from it
# when no explicit random_state is given, so the fold assignment (and every
# score computed from it) is the same after each kernel restart.
np.random.seed(SEED)
df = create_folds(pd.read_csv("datasets/proc_water_potability.csv"), FOLDS)

In [98]:
# Sanity check: number of rows assigned to each fold.
df.kfold.value_counts()

0    656
2    655
4    655
1    655
3    655
Name: kfold, dtype: int64

In [99]:
def run(fold, model, model_name):
    """Fit ``model`` on every fold except ``fold`` and score it on ``fold``.

    Reads the notebook-level ``df`` (which must carry ``Potability`` and
    ``kfold`` columns), prints train and validation accuracy, and saves the
    fitted model to ``models/{model_name}_{fold}.bin`` with joblib.

    Parameters
    ----------
    fold : int
        Fold label to hold out for validation.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    model_name : str
        Prefix of the saved model file.

    Returns
    -------
    float
        Accuracy on the held-out fold.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # The target and the fold label are bookkeeping columns, not features.
    non_features = ['Potability', 'kfold']
    x_train = df_train.drop(non_features, axis=1).values
    y_train = df_train.Potability.values

    x_valid = df_valid.drop(non_features, axis=1).values
    y_valid = df_valid.Potability.values

    model.fit(x_train, y_train)

    train_preds = model.predict(x_train)
    valid_preds = model.predict(x_valid)

    train_accuracy = metrics.accuracy_score(y_train, train_preds)
    valid_accuracy = metrics.accuracy_score(y_valid, valid_preds)
    print(f"=============Fold:{fold}=============")
    print(f"Train Accuracy={train_accuracy}")
    print(f"Valid Accuracy={valid_accuracy}")

    model_path = f"models/{model_name}_{fold}.bin"
    # joblib.dump does not create missing directories; without this the first
    # run on a fresh checkout fails with FileNotFoundError.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model, model_path)

    return valid_accuracy

In [38]:
# IPython help lookup (exploratory): shows the DecisionTreeClassifier init signature.
tree.DecisionTreeClassifier?

[1;31mInit signature:[0m
[0mtree[0m[1;33m.[0m[0mDecisionTreeClassifier[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0msplitter[0m[1;33m=[0m[1;34m'best'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_split[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [

In [100]:
def train(model, model_name, folds=FOLDS):
    """Cross-validate ``model`` over folds ``0 .. folds-1`` via ``run``.

    Before each fold the model's ``random_state`` attribute is set to the
    fold number. After the last fold the mean validation accuracy is printed;
    nothing is returned.
    """
    scores = []
    for fold_idx in range(folds):
        # A different, but fixed, seed for every fold.
        setattr(model, "random_state", fold_idx)
        scores.append(run(fold_idx, model, model_name))

    mean_score = sum(scores) / len(scores)
    print(f"\nAverage Accuracy: {mean_score}")

In [151]:
# Decision-tree baseline.
# max_features="auto" meant sqrt(n_features) for classifiers; the "auto" alias
# was deprecated in scikit-learn 1.1 and removed in 1.3, so spell it "sqrt".
model = tree.DecisionTreeClassifier(
    max_depth=50,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features="sqrt",
)
train(model, "dt", FOLDS)

Train Accuracy=0.7725190839694657
Valid Accuracy=0.5335365853658537
Train Accuracy=0.7699351392598245
Valid Accuracy=0.6076335877862595
Train Accuracy=0.7748950782144219
Valid Accuracy=0.6137404580152672
Train Accuracy=0.7729874093857306
Valid Accuracy=0.5954198473282443
Train Accuracy=0.7642121327737504
Valid Accuracy=0.5954198473282443

Average Accuracy: 0.5891500651647739


In [41]:
# IPython help lookup (exploratory): shows the RandomForestClassifier init signature.
ensemble.RandomForestClassifier?

[1;31mInit signature:[0m
[0mensemble[0m[1;33m.[0m[0mRandomForestClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_split[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m  

In [152]:
# Random-forest baseline.
# max_features="auto" meant sqrt(n_features) for classifiers; the "auto" alias
# was deprecated in scikit-learn 1.1 and removed in 1.3, so spell it "sqrt".
model = ensemble.RandomForestClassifier(
    n_estimators=50,
    max_depth=50,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features="sqrt",
    n_jobs=-1,
)
train(model, "rf", FOLDS)

Train Accuracy=0.8465648854961833
Valid Accuracy=0.6646341463414634
Train Accuracy=0.8359404807325448
Valid Accuracy=0.6854961832061068
Train Accuracy=0.8431896222815719
Valid Accuracy=0.648854961832061
Train Accuracy=0.8431896222815719
Valid Accuracy=0.6381679389312978
Train Accuracy=0.8409004196871424
Valid Accuracy=0.6793893129770993

Average Accuracy: 0.6633085086576057


In [55]:
# Tune on every fold except fold 0.
# drop=True: keeping the old index (drop=False) would add an 'index' column
# that then ends up in X as a bogus feature.
df_0 = df[df['kfold'] != 0].reset_index(drop=True)
# Same feature set as run(): neither the target nor the fold label is a feature.
X = df_0.drop(['Potability', 'kfold'], axis=1).values
y = df_0.Potability.values

# Seed the forest so that candidate scores are comparable between runs.
clf = ensemble.RandomForestClassifier(n_jobs=-1, random_state=SEED)
param_grid = {
    "n_estimators": np.arange(100, 1500, 100),
    "max_depth": np.arange(1, 100),
    "criterion": ["gini", "entropy"],
    "min_samples_split": np.arange(2, 25),
    "min_samples_leaf": np.arange(5, 30),
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced", "balanced_subsample"],
}

In [56]:
# IPython help lookup (exploratory): shows the RandomizedSearchCV init signature.
model_selection.RandomizedSearchCV?

[1;31mInit signature:[0m
[0mmodel_selection[0m[1;33m.[0m[0mRandomizedSearchCV[0m[1;33m([0m[1;33m
[0m    [0mestimator[0m[1;33m,[0m[1;33m
[0m    [0mparam_distributions[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mn_iter[0m[1;33m=[0m[1;36m10[0m[1;33m,[0m[1;33m
[0m    [0mscoring[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrefit[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mcv[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mpre_dispatch[0m[1;33m=[0m[1;34m'2*n_jobs'[0m[1;33m,[0m[1;33m
[0m    [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0merror_score[0m[1;33m=[0m[0mnan[0m[1;33m,[0m[1;33m
[0m    [0mreturn_train_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m

In [153]:
# Randomised search: 25 sampled parameter sets, each scored by 5-fold CV accuracy.
search_settings = dict(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=25,
    scoring="accuracy",
    verbose=10,
    n_jobs=-1,
    cv=5,
    random_state=SEED,
)
model = model_selection.RandomizedSearchCV(**search_settings)
model.fit(X, y)

print(f"Best Score: {model.best_score_}")
print("Best Parameter Set:")
# Report only the parameters that were actually searched, in alphabetical order.
best_params = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print(f"\t{param_name}: {best_params[param_name]}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits


KeyboardInterrupt: 

In [154]:
# Random forest with a fixed hyper-parameter set, cross-validated on all folds.
# NOTE: saved under the same "rf" prefix as the earlier random-forest run, so
# those model files are overwritten.
tuned_params = {
    "class_weight": "balanced",
    "criterion": "gini",
    "max_depth": 60,
    "max_features": "log2",
    "min_samples_leaf": 7,
    "min_samples_split": 18,
    "n_estimators": 700,
    "n_jobs": -1,
}
clf = ensemble.RandomForestClassifier(**tuned_params)
train(clf, "rf", FOLDS)

Train Accuracy=0.9545801526717558
Valid Accuracy=0.6722560975609756
Train Accuracy=0.9538344143456696
Valid Accuracy=0.683969465648855
Train Accuracy=0.9523082792827166
Valid Accuracy=0.6381679389312978
Train Accuracy=0.9496375429225486
Valid Accuracy=0.6427480916030535
Train Accuracy=0.9519267455169782
Valid Accuracy=0.6732824427480916

Average Accuracy: 0.6620848072984546


In [155]:
model_name = "rf"
# Reload the per-fold models that run() saved with joblib.dump.
models = [
    joblib.load(f"models/{model_name}_{fold}.bin")
    for fold in range(FOLDS)
]
print(f"Total Models: {len(models)}")

Total Models: 5
