## First, testing our Azure endpoint

In [1]:
import requests

In [2]:
test_data = {
    'age': 45.0,
    'sex': 1.0,
    'chest_pain': 4.0,
    'blood_pressure': 115.0,
    'serum_cholestoral': 260.0,
    'fasting_blood_sugar': 0.0,
    'electrocardiographic': 2.0,
    'max_heart_rate': 185.0,
    'induced_angina': 0.0,
    'ST_depression': 0.0,
    'slope': 1.0,
    'vessels': 0.0,
    'thal': 3.0,
}

In [3]:
base_url = 'https://mytestmlappnkf.azurewebsites.net'
r = requests.get(base_url)
r.text

'App is Healthy'

In [4]:
neural_url = 'https://mytestmlappnkf.azurewebsites.net/neural'
r = requests.post(neural_url, json = test_data)
r.json()

0

## Data for Mini-Project

## Step 1:

In [5]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
pd.options.display.float_format = '{:,.4f}'.format
import seaborn as sns
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)

from bokeh.layouts import gridplot, column
from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource, 
                          HoverTool, LabelSet, LinearColorMapper, NumeralTickFormatter)
from bokeh.palettes import brewer, RdBu, Reds
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import transform

%config Completer.use_jedi = False
output_notebook()

https://www.kaggle.com/adammaus/predicting-churn-for-bank-customers?select=Churn_Modelling.csv

In [6]:
data_path = pathlib.Path('data', 'train.csv')
data = pd.read_csv(data_path)

In [7]:
for col in data.columns:
        if data[col].nunique() < 10:
                print(f' {col}: {data[col].unique()}')

 Geography: ['Germany' 'France' 'Spain']
 Gender: ['Female' 'Male']
 NumOfProducts: [1 2 3 4]
 HasCrCard: [1 0]
 IsActiveMember: [1 0]
 Exited: [0 1]


In [8]:
cols_to_consider = ['Geography', 'Gender']
for col in cols_to_consider:
    data= pd.concat([data,pd.get_dummies(data[col])], axis = 1)
    data =  data.drop(columns= [col])
    

Does it matter where Exited is in the database?

In [9]:
target = 'Exited'
feature = [col for col in data.columns if col != target]
data.loc[0, feature].to_dict()

{'CreditScore': 597.0,
 'Age': 35.0,
 'Tenure': 8.0,
 'Balance': 131101.04,
 'NumOfProducts': 1.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 1.0,
 'EstimatedSalary': 192852.67,
 'France': 0.0,
 'Germany': 1.0,
 'Spain': 0.0,
 'Female': 1.0,
 'Male': 0.0}

## Step 2:

In [10]:
import json
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaled_data = scaler.fit(data[feature])
scaled_data = scaler.fit_transform(data[feature])
scaled_data = pd.DataFrame(scaled_data, columns = feature)
scaled_data[target] = data[target]

scaler_means = {key: val for key, val in zip(feature, scaler.mean_)}
scaler_sigmas = {key: val for key, val in zip(feature, scaler.scale_)}

with open('scaler_means.json', 'w') as fout: 
    json.dump(scaler_means, fout)
    
with open('scaler_sigmas.json', 'w') as fout: 
    json.dump(scaler_sigmas, fout)

In [11]:
def plot_confusion_matrix(y_true, y_predicted):
    
    from sklearn import metrics
    
    accuracy = np.round(100*(y_true == y_predicted).astype(int).sum()/len(y_predicted), 2)
    
    confusion = pd.DataFrame(metrics.confusion_matrix(y_true, y_predicted))
    confusion.index.name = "True"
    confusion.columns.name = "Predicted"
    confusion = confusion.stack().rename("value").reset_index()
    confusion['True'] = confusion['True'].astype(str)
    confusion['Predicted'] = confusion['Predicted'].astype(str)

    source = ColumnDataSource(confusion)

    values = sorted(list(confusion['True'].unique()))

    palette = brewer['RdBu'][10]
    color_mapper = LinearColorMapper(
        palette = palette, 
    )

    p = figure(
        plot_width = 300, 
        plot_height = 300, 
        title = f'Confusion Matrix: Overall accuracy = {accuracy}%',
        x_range = ['0', '1'], 
        y_range = ['0', '1'],
        x_axis_label = 'Predicted',
        y_axis_label = 'True',
        tools = 'hover', 
        x_axis_location="below",
    )

    p.rect(
        x = 'Predicted', 
        y = 'True', 
        width = 1, 
        height = 1, 
        source = source,
        line_color = 'grey', 
        fill_color = transform('value', color_mapper),
    )

    hover = p.hover.tooltips = [
        ("True", "@{True}"),
        ("Predicted", "@{Predicted}"),
        ("Count", "@value"),
    ]

    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "14px"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0
    
    labels = LabelSet(x='Predicted', y='True', text='value',
                      render_mode='canvas', text_color = 'white',
                      x_offset = 50, y_offset = 50, source=source,)

    p.add_layout(labels)

    show(p)

In [12]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = 0)
x_train, y_train = train[feature], train[target]
x_test, y_test = test[feature], test[target]

# Possible Models:

## Linear Regression:

In [13]:
model_dic= {}

In [14]:
import statsmodels.formula.api as smf

formula = f"{target} ~ {' + '.join(feature)}"

model = smf.ols(
    formula = formula, 
    data = train)

fit_model = model.fit()

#fit_model.summary()


predictions = fit_model.predict(test[feature])

predictions =(predictions > 0.5).astype(int)
 
plot_confusion_matrix(y_test.values, predictions.values)

In [15]:
score = fit_model.rsquared

model_dic['Linear Regression']= score

## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver = 'newton-cg')

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)


In [17]:
predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

In [18]:
model_dic['Logistic Regression']= score

## Decision Tree

In [19]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)

In [20]:
predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

In [21]:
model_dic['Decision Tree']= score

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)

In [23]:
y_hat = clf.predict(x_test)
plot_confusion_matrix(y_test, y_hat)

In [24]:
model_dic['Random Forest']= score

## Gradient Forest

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 0)

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)

In [26]:
predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

In [27]:
model_dic['Gradient Forest']= score

## AdaBoost

In [28]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(random_state = 0)

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)

In [29]:
predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

In [30]:
model_dic['Adaboost']= score

## Choosing the best model:


In [31]:
model_dic

{'Linear Regression': 0.15009944359133554,
 'Logistic Regression': 0.8110709987966306,
 'Decision Tree': 0.7922182109907742,
 'Random Forest': 0.8584035298836743,
 'Gradient Forest': 0.8592057761732852,
 'Adaboost': 0.8479743281187324}

In [32]:
accuracy = 0
model = ""
for key, value in model_dic.items():
    if value > accuracy:
        model = key
        accuracy = value
        
print(model, accuracy)     

Gradient Forest 0.8592057761732852


## Step 3:

# How to fit model with other than adaboost?
# Do i Need to change the app file?


In [33]:
import pickle
from sklearn.ensemble import AdaBoostClassifier

adaboost_params = {
    'learning_rate': 0.01, 
    'n_estimators': 100,
}

clf = AdaBoostClassifier(random_state = 0, **adaboost_params)
clf = clf.fit(x_train, y_train)
with open('adaboost.pkl', 'wb') as f:
    pickle.dump(clf, f)
    
    


# Requests to Endpoints

In [35]:
import requests

adaboost_url = 'http://127.0.0.1:5000/adaboost'

In [36]:
index = 100
data_dict = data.loc[index, feature].to_dict()
data_dict

{'CreditScore': 639.0,
 'Age': 22.0,
 'Tenure': 4.0,
 'Balance': 0.0,
 'NumOfProducts': 2.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 0.0,
 'EstimatedSalary': 28188.96,
 'France': 1.0,
 'Germany': 0.0,
 'Spain': 0.0,
 'Female': 0.0,
 'Male': 1.0}

In [37]:
response = requests.post(neural_url, json = data_dict)
nn_prediction = response.json()

response = requests.post(adaboost_url, json = data_dict)
ab_prediction = response.json()

print(f'{index}: NN -> {nn_prediction}, Ada -> {ab_prediction}')  

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /neural (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001E581CAF910>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))