In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load

In [82]:
# Load the training set, using the `id` column as the row index.
data = pd.read_csv('data/train.csv', index_col='id')

# Separate the target column from the feature columns.
y = data['Exited']
X = data.drop(columns='Exited')

# Only the last expression of a cell is rendered, so a single preview is kept here.
X.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83


In [83]:
# Derived ratio features, appended after the existing columns:
#   BalanceSalaryRatio - account balance relative to estimated salary
#   TenureByAge        - tenure relative to customer age
X = X.assign(
    BalanceSalaryRatio=X['Balance'] / X['EstimatedSalary'],
    TenureByAge=X['Tenure'] / X['Age'],
)


In [84]:
from sklearn.model_selection import train_test_split

# Leave the identifier columns out of the feature matrix.
# Rebinding X (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError on the
# already-removed columns.
X = X.drop(columns=['CustomerId', 'Surname'], errors='ignore')

# Two-stage split with a fixed seed: first hold out a test set, then carve a
# validation set out of the remaining training data. Both calls rely on
# train_test_split's default split proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42)

In [85]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups routed to each preprocessing branch.
num_attribs = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'EstimatedSalary', 'BalanceSalaryRatio', 'TenureByAge',
]
cat_attribs = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

# Numeric columns are standardised; the columns in cat_attribs are one-hot encoded.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
cat_pipeline = Pipeline(steps=[('one_hot', OneHotEncoder())])

# Columns not named in either group are dropped (ColumnTransformer's default remainder).
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

In [86]:
# Learn the scaling/encoding from the training split only, then apply the
# already-fitted transformer to the validation and test splits.
X_train = preprocessing_pipeline.fit_transform(X_train)
X_val, X_test = (
    preprocessing_pipeline.transform(split) for split in (X_val, X_test)
)

# Training Various Models

In [71]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [88]:
from sklearn.ensemble import RandomForestClassifier

# Candidate values for the random-forest hyperparameter search.
param_grid = {
    'n_estimators': np.arange(30, 76),
    'max_features': np.arange(1, 11),
    'max_depth': np.arange(1, 11),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
}

# Both the estimator and the parameter sampler are seeded so that the selected
# hyperparameters and the reported scores are reproducible across re-runs.
forest_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)
forest_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(forest_search.best_params_)
print(forest_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = forest_search.predict(X_val)
print(accuracy_score(y_val, y_pred))

{'n_estimators': 52, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 8, 'max_depth': 9}
0.8644202811736708
0.8647880041365047


In [77]:
# Reuse the model the search already refit on X_train with the best parameters,
# so the importances describe the same forest that was scored on the validation
# split (a fresh, unseeded refit would be a different random forest).
forest = forest_search.best_estimator_

# Label each importance with the transformed column name produced by the fitted
# preprocessing pipeline; a bare integer index is not interpretable after
# one-hot encoding.
feature_scores = pd.Series(
    forest.feature_importances_,
    index=preprocessing_pipeline.get_feature_names_out(),
).sort_values(ascending=False)

# view the feature scores
feature_scores

In [73]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate values for the AdaBoost search. The base learner itself is a
# searched parameter (decision trees of depth 1-3), and the `estimator__*`
# entries tune whichever base tree is selected.
param_grid = {
    'n_estimators': np.arange(30, 76),
    'learning_rate': np.arange(0.1, 1.1, 0.1),
    'estimator__min_samples_split': np.arange(2, 11),
    'estimator__min_samples_leaf': np.arange(1, 11),
    'estimator__max_features': np.arange(1, 11),
    'estimator': [
        DecisionTreeClassifier(max_depth=1),
        DecisionTreeClassifier(max_depth=2),
        DecisionTreeClassifier(max_depth=3),
    ],
}

# Both the booster and the parameter sampler are seeded so that the selected
# hyperparameters and the reported scores are reproducible across re-runs.
ada_search = RandomizedSearchCV(
    AdaBoostClassifier(random_state=42),
    param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)
ada_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(ada_search.best_params_)
print(ada_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = ada_search.predict(X_val)
print(accuracy_score(y_val, y_pred))

{'n_estimators': 55, 'learning_rate': 0.2, 'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 3, 'estimator__max_features': 8, 'estimator': DecisionTreeClassifier(max_depth=3)}
0.864215603640158
0.8616533092037229


In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Candidate values for the gradient-boosting hyperparameter search.
param_grid = {
    'n_estimators': np.arange(30, 76),
    'learning_rate': np.arange(0.1, 1.1, 0.1),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'max_features': np.arange(1, 11),
    'max_depth': np.arange(1, 11),
}

# Both the estimator and the parameter sampler are seeded so that the selected
# hyperparameters and the reported scores are reproducible across re-runs.
gb_search = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)
gb_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(gb_search.best_params_)
print(gb_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = gb_search.predict(X_val)
print(accuracy_score(y_val, y_pred))

{'n_estimators': 32, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 10, 'max_depth': 7, 'learning_rate': 0.1}
0.8642976991794903
0.8646910548086867


In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate values for the k-nearest-neighbours hyperparameter search.
param_grid = {
    'n_neighbors': np.arange(10, 30),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': np.arange(1, 51),
}

# The parameter sampler is seeded so the same 20 candidates are drawn on every
# run, which makes the selected configuration reproducible.
knn_search = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_grid,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)
knn_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(knn_search.best_params_)
print(knn_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = knn_search.predict(X_val)
print(accuracy_score(y_val, y_pred))

{'weights': 'distance', 'p': 2, 'n_neighbors': 20, 'leaf_size': 20}
0.8575757786434386
0.8569674250258531


In [14]:
from sklearn.svm import SVC

# Support-vector classifier with fixed hyperparameters (no search for this model).
# NOTE(review): `degree` is documented as applying to the 'poly' kernel only, so
# it presumably has no effect with kernel='rbf' - confirm before relying on it.
svc = SVC(
    kernel='rbf',
    C=1,
    gamma='scale',
    degree=2,
    probability=True,
)
svc.fit(X_train, y_train)

# Training accuracy first, then accuracy on the validation split.
print(svc.score(X_train, y_train))
y_pred = svc.predict(X_val)
print(accuracy_score(y_val, y_pred))



{'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 1.0}
0.8625597557811913
0.8608130816959669


In [28]:
from sklearn.ensemble import ExtraTreesClassifier

# Candidate values for the extra-trees hyperparameter search.
param_grid = {
    'n_estimators': np.arange(30, 76),
    'max_features': np.arange(1, 11),
    'max_depth': np.arange(1, 11),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
}

# Both the estimator and the parameter sampler are seeded so that the selected
# hyperparameters and the reported scores are reproducible across re-runs.
extra_search = RandomizedSearchCV(
    ExtraTreesClassifier(random_state=42),
    param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)
extra_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(extra_search.best_params_)
print(extra_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = extra_search.predict(X_val)
print(accuracy_score(y_val, y_pred))

{'n_estimators': 33, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': 10, 'max_depth': 9}
0.862430474331805
0.8626228024819028


In [43]:
from sklearn.neural_network import MLPClassifier

# Two-hidden-layer perceptron with every constructor argument spelled out.
mlp = MLPClassifier(
    # architecture
    hidden_layer_sizes=(100, 100),
    activation='relu',
    # optimiser
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=200,
    shuffle=True,
    random_state=None,
    tol=0.0001,
    verbose=False,
    warm_start=False,
    momentum=0.9,
    nesterovs_momentum=True,
    # early stopping
    early_stopping=False,
    validation_fraction=0.1,
    # adam-specific settings
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
)
mlp.fit(X_train, y_train)

# Training accuracy first, then accuracy on the validation split.
print(mlp.score(X_train, y_train))
y_pred = mlp.predict(X_val)
print(accuracy_score(y_val, y_pred))

0.8903666891688091
0.8416817476732161




In [35]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Only the meta-learner settings and the stacking method are searched; the
# line-up of base estimators is fixed below.
param_grid = {
    'final_estimator__penalty': ['l1', 'l2'],
    'final_estimator__C': np.arange(0.1, 1.1, 0.1),
    'final_estimator__solver': ['liblinear', 'saga'],
    'stack_method': ['auto', 'predict_proba'],
}

# Stack the best model from each earlier search under a logistic-regression
# meta-learner. The parameter sampler is seeded so the same 10 candidates are
# drawn on every run.
stacking_search = RandomizedSearchCV(
    StackingClassifier(
        estimators=[
            ('gb', gb_search.best_estimator_),
            ('forest', forest_search.best_estimator_),
            ('knn', knn_search.best_estimator_),
            ('ada', ada_search.best_estimator_),
        ],
        final_estimator=LogisticRegression(),
    ),
    param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
stacking_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(stacking_search.best_params_)
print(stacking_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = stacking_search.predict(X_val)
print(accuracy_score(y_val, y_pred))


Fitting 5 folds for each of 10 candidates, totalling 50 fits




[CV] END final_estimator__C=0.6, final_estimator__penalty=l2, final_estimator__solver=saga, stack_method=predict_proba; total time= 1.5min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l2, final_estimator__solver=saga, stack_method=predict_proba; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l2, final_estimator__solver=saga, stack_method=predict_proba; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l2, final_estimator__solver=saga, stack_method=predict_proba; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l2, final_estimator__solver=saga, stack_method=predict_proba; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l1, final_estimator__solver=liblinear, stack_method=predict_proba; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l1, final_estimator__solver=liblinear, stack_method=predict_proba; total time= 1.7min
[CV] END final_es



[CV] END final_estimator__C=0.8, final_estimator__penalty=l2, final_estimator__solver=liblinear, stack_method=auto; total time= 1.5min
[CV] END final_estimator__C=0.8, final_estimator__penalty=l2, final_estimator__solver=liblinear, stack_method=auto; total time= 1.6min
[CV] END final_estimator__C=0.8, final_estimator__penalty=l2, final_estimator__solver=liblinear, stack_method=auto; total time= 1.6min
[CV] END final_estimator__C=0.8, final_estimator__penalty=l2, final_estimator__solver=liblinear, stack_method=auto; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l1, final_estimator__solver=liblinear, stack_method=predict_proba; total time= 1.7min
[CV] END final_estimator__C=0.8, final_estimator__penalty=l2, final_estimator__solver=liblinear, stack_method=auto; total time= 1.6min
[CV] END final_estimator__C=0.6, final_estimator__penalty=l1, final_estimator__solver=liblinear, stack_method=predict_proba; total time= 1.7min
[CV] END final_estimator__C=0.4, fina

In [36]:
from sklearn.neural_network import MLPClassifier

# Candidate values for the multi-layer-perceptron hyperparameter search.
param_grid = {
    'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': np.linspace(0.0001, 0.001, 10),
    'learning_rate': ['constant', 'adaptive'],
}

# Both the network and the parameter sampler are seeded so that the selected
# hyperparameters and the reported scores are reproducible across re-runs.
mlp_search = RandomizedSearchCV(
    MLPClassifier(random_state=42),
    param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
mlp_search.fit(X_train, y_train)

# Best sampled configuration and its mean cross-validated accuracy.
print(mlp_search.best_params_)
print(mlp_search.best_score_)

# Accuracy of the refit best model on the validation split.
y_pred = mlp_search.predict(X_val)
print(accuracy_score(y_val, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [37]:
# Baseline: a perceptron with library-default settings, scored on the validation split.
mlp = MLPClassifier().fit(X_train, y_train)
y_pred = mlp.predict(X_val)
print(accuracy_score(y_val, y_pred))

0.8633337642192348


In [42]:
# Perceptron with hand-picked settings: two 50-unit tanh layers trained with adam.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='tanh',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
)
mlp.fit(X_train, y_train)

# Training accuracy first, then accuracy on the validation split.
print(mlp.score(X_train, y_train))
y_pred = mlp.predict(X_val)
print(accuracy_score(y_val, y_pred))

0.8749551154072649
0.858906411582213


