Each team will develop a machine learning model using those available in `scikit-learn` to predict whether or not a particular will elect to stop services provided by a financial institution based on known attributes. The first five rows of the training data are shown below.

In [1]:
import json
import pathlib
import warnings
import joblib
warnings.filterwarnings("ignore")
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,597,Germany,Female,35,8,131101.04,1,1,1,192852.67,0
1,523,France,Female,40,2,102967.41,1,1,0,128702.1,1
2,706,Spain,Female,42,8,95386.82,1,1,1,75732.25,0
3,788,France,Male,32,4,112079.58,1,0,0,89368.59,0
4,706,Germany,Male,38,5,163034.82,2,1,1,135662.17,0


Each team will deploy there models to Azure. Specifically, each team should use `Flask` to deploy a web service that can accept JSON payloads via a `post` request. An example of a typical JSON payload that should be expected is shown below.

In [2]:
target = 'Exited'
features = [col for col in data.columns if col != target]

data[features].loc[0].to_dict()

{'CreditScore': 597,
 'Geography': 'Germany',
 'Gender': 'Female',
 'Age': 35,
 'Tenure': 8,
 'Balance': 131101.04,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 192852.67}

**This will be a graded assignment!** Points will be allocated as follows:
- 70 points for a working deployment
- 20 points for a model that achieves a predictive accuracy greater than 80%.
- 10 points based on competition

The model competition will take place on Monday, 2/21. Each model will be asked to make predictions for > 30 unseen customers. The winning model will be the one that achieves the **highest predictive accuracy**. Teams with the **best and worst** performing models will give a overview of their modeling pipeline (all teams should be prepared to speak to this). This overview should include discussions on:
- data preparation steps,
- model selection, and
- model tuning.

In [3]:
data.shape

(9970, 11)

In [4]:
data.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
dummy_column_mapper = {}
for col in data.columns:
    if data[col].dtype == 'object':
        temp = pd.get_dummies(data[col], prefix=col, drop_first=True)
        data = data.drop(columns=[col])
        data[temp.columns] = temp
        dummy_column_mapper[col] = temp.columns.tolist()
        
with open('dummy_column_mapper.json', 'w') as fout:
    json.dump(dummy_column_mapper, fout)

with open('col_order.json', 'w') as fout:
    json.dump(data.columns.tolist(), fout)
    
for col in data.columns:
    data[col] = data[col].astype("float")

data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,597.0,35.0,8.0,131101.04,1.0,1.0,1.0,192852.67,0.0,1.0,0.0,0.0
1,523.0,40.0,2.0,102967.41,1.0,1.0,0.0,128702.1,1.0,0.0,0.0,0.0
2,706.0,42.0,8.0,95386.82,1.0,1.0,1.0,75732.25,0.0,0.0,1.0,0.0
3,788.0,32.0,4.0,112079.58,1.0,0.0,0.0,89368.59,0.0,0.0,0.0,1.0
4,706.0,38.0,5.0,163034.82,2.0,1.0,1.0,135662.17,0.0,1.0,0.0,1.0


In [6]:
dummy_column_mapper

{'Geography': ['Geography_Germany', 'Geography_Spain'],
 'Gender': ['Gender_Male']}

In [None]:
data.shape

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

sns.heatmap(
    data.corr(),
    cmap='coolwarm',
    linewidths=0.1,
    linecolor='k',
    annot=True,
    fmt='.0%',
)

plt.show()

Everything looks to be within managable correlation values. 

In [None]:
data.describe().round(2)

In [None]:
target = 'Exited'
features = [col for col in data.columns if col != target]
binary_columns = [col for col in features if sorted(data[col].unique().tolist()) == [0, 1]]

X = data[features].copy()
y = data[target]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.25, 
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)

In [None]:
scaler.mean_

In [None]:
scaler.scale_

In [None]:
X_train['Age'].mean()

In [None]:
X_train['Age'].std()

In [None]:
scaler_filepath = pathlib.Path('scaler_info.json')

scaler_dict  = {}
for feature, mean, scale in zip(features, scaler.mean_, scaler.scale_):
    if feature in binary_columns:
        scaler_dict[feature] = {
            'mean': 0,
            'std': 1,
        }
    else:
        scaler_dict[feature] = {
            'mean': mean,
            'std': scale,
        }
        
with open(scaler_filepath, 'w') as fout:
    json.dump(scaler_dict, fout)

In [None]:
# apply scaling to all rows and columns in train and test data 
for col, col_params in scaler_dict.items():
    X_train.loc[:, col] = (X_train.loc[:, col] - col_params['mean'])/col_params['std']
    X_test.loc[:, col] = (X_test.loc[:, col] - col_params['mean'])/col_params['std']

In [None]:
round(X_train['Age'].mean(), 2)

In [None]:
round(X_train['Age'].std(), 2)

In [None]:
round(X_test['Age'].mean(), 2)

In [None]:
round(X_test['Age'].std(), 2)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression 

clf = LogisticRegression(random_state = 0).fit(X_train, y_train)
logistic_score = clf.score(X_train, y_train)
logistic_score

In [None]:
clf.predict(X_test)[:5]

In [None]:
clf.predict_proba(X_test)[:5]

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
predictions = (clf.predict_proba(X_test)[:, 1] >= .5).astype(int)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=clf.classes_,
)


disp.plot()
plt.show()

In [None]:
model_scores = {}
model_scores['logistic regression']= {'score': logistic_score}
model_scores
joblib.dump(clf, 'log_model.joblib')

# Decision Tree 

In [None]:
from sklearn import tree 
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
decision_tree = clf.score(X_test, y_test)

In [None]:
model_scores['Decision Tree'] = {'score': decision_tree}

In [None]:
from sklearn.model_selection import GridSearchCV

### Decision Tree Parameters
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
# Find the parameters that provide the best score for a decision tree
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    # randomly choose to iterate using odd numbers 
    'max_depth': [3,5,7,9,11],
}

clf = GridSearchCV(tree.DecisionTreeClassifier(random_state = 0), params, error_score = 0)
search = clf.fit(X_train, y_train)
decision_tree_best_params = search.best_params_
decision_tree_best_params

In [None]:
clf = tree.DecisionTreeClassifier(random_state = 0, **decision_tree_best_params)
clf = clf.fit(X_train, y_train)
decision_tree_gridsearch = clf.score(X_test, y_test)
predictions = (clf.predict_proba(X_test)[:, 1] >= .25).astype(int)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=clf.classes_,
)
disp.plot()
plt.show()

In [None]:
model_scores['Decision Tree GridSearch'] = {'score': decision_tree_gridsearch, 'best parameters': decision_tree_best_params}

In [None]:
joblib.dump(clf, 'tree_model.joblib')
model_scores

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
with open('model_scores.json') as json_file:
    model_scores = json.load(json_file)

### Random Foest Parameters 
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
model_scores

In [None]:
# params = {
#     'n_estimators' : [10, 50, 100, 150],
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [3,5,7,9,11],
#     'max_features': ['auto', 'sqrt', 'log2'],
# }

# clf = GridSearchCV(RandomForestClassifier(random_state = 0), params, error_score = 0)
# search = clf.fit(X_train, y_train)
# best_params = search.best_params_
# best_params

In [None]:
random_forest_best_params = model_scores['Random Forest GridSearch']['best parameters']

In [None]:
rf_clf = RandomForestClassifier(random_state = 0, **random_forest_best_params)
rf_clf = rf_clf.fit(X_train, y_train)
random_forest_gridsearch = rf_clf.score(X_test, y_test)
predictions = (clf.predict_proba(X_test)[:, 1] >= .2).astype(int)
cm = confusion_matrix(y_test, predictions, labels=rf_clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=rf_clf.classes_,
)
disp.plot()
plt.show()

In [None]:
display(random_forest_gridsearch)
display(random_forest_best_params)
model_scores['Random Forest GridSearch'] = {'score': random_forest_gridsearch, 'best parameters': random_forest_best_params}
joblib.dump(rf_clf, 'rf_model.joblib')

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier 

params = {
    'n_estimators': [10, 50, 100, 150, 200],
    'learning_rate': [.1, .15, .20, 0.25, 0.5, .75, 1.0, 2.0],
}    


clf = GridSearchCV(AdaBoostClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
print(type(search))
best_params = search.best_params_ 

In [None]:
adaboost_params = best_params

In [None]:
clf = AdaBoostClassifier(random_state = 0, **adaboost_params)
clf = clf.fit(X_train, y_train)
adaboost_gridsearch = clf.score(X_test, y_test)

display(adaboost_gridsearch)
display(adaboost_params)
model_scores['Ada Boost GridSearch'] = {'score': adaboost_gridsearch, 'best parameters': adaboost_params}
joblib.dump(clf, 'ada_model.joblib')

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Fit Random Forest 
rf_clf = RandomForestClassifier(random_state = 0, **random_forest_best_params)
rf_clf = rf_clf.fit(X_train, y_train)
random_forest_gridsearch = rf_clf.score(X_test, y_test)
print(f'Fitting Random Forest Classifier:', {random_forest_gridsearch})

# Fit Ada Boost 
ada_clf = AdaBoostClassifier(random_state = 0, **adaboost_params)
ada_clf = ada_clf.fit(X_train, y_train)
adaboost_gridsearch = ada_clf.score(X_test, y_test)
print(f'Fitting AdaBoost Classifier:', {adaboost_gridsearch})

params = {
    'voting': ['hard', 'soft'],
    'weights': [[.2,.8], [0.25, 0.75], [0.5, 0.5], [0.75, 0.25], [1,2],[2,1]],
}    

eclf = VotingClassifier(
    estimators=[('rf', rf_clf), ('ada', ada_clf)], 
)

vc_clf = GridSearchCV(eclf, params, error_score=0)
search = vc_clf.fit(X_train, y_train)
vc_params = search.best_params_ 



eclf = VotingClassifier(
    estimators=[('rf', rf_clf), ('ada', ada_clf)],
    **vc_params,
)


eclf = eclf.fit(X_train, y_train)
vc_gridsearch = eclf.score(X_test, y_test)
model_scores['Voting Classifier'] = {'score': vc_gridsearch, 'best parameters' : vc_params}
joblib.dump(vc_clf, 'vote_model.joblib')

In [None]:
print(vc_params)
print(f'Fitting Voting Classifier:', {vc_gridsearch})

### Save model scores dictionary to json file for ease of access 

In [None]:
model_scores_filepath = pathlib.Path('model_scores.json')
        
with open(model_scores_filepath, 'w') as fout:
    json.dump(model_scores, fout)

In [None]:
model_scores

In [7]:
new_data = pd.read_csv('train.csv')

ckey = 0
raw_payload = new_data.loc[ckey].to_dict()
target = raw_payload.pop('Exited')

raw_payload

{'CreditScore': 597,
 'Geography': 'Germany',
 'Gender': 'Female',
 'Age': 35,
 'Tenure': 8,
 'Balance': 131101.04,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 192852.67}

In [8]:
target

0

In [9]:
with open('dummy_column_mapper.json') as fin:
    dummy_column_mapper = json.load(fin)
    
with open('scaler_info.json') as fin:
    scaler_info = json.load(fin)
    
with open('col_order.json') as fin:
    col_order = json.load(fin)
    
payload = dict(raw_payload)

In [10]:
payload

{'CreditScore': 597,
 'Geography': 'Germany',
 'Gender': 'Female',
 'Age': 35,
 'Tenure': 8,
 'Balance': 131101.04,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 192852.67}

In [11]:
import requests

In [14]:
base_endpoint = 'https://bcbazuredeploydemo.azurewebsites.net'

In [17]:
r = requests.get(base_endpoint)
r.text

'App is Healthy'

In [12]:
predict_endpoint = 'https://bcbazuredeploydemo.azurewebsites.net/predict'

In [18]:
r = requests.post(predict_endpoint, json=payload)
int(r.text)

TypeError: Object of type int64 is not JSON serializable

In [None]:
raw_payload