## Data for Mini-Project

## Step 1:

In [1]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
pd.options.display.float_format = '{:,.4f}'.format
import seaborn as sns
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)

from bokeh.layouts import gridplot, column
from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource, 
                          HoverTool, LabelSet, LinearColorMapper, NumeralTickFormatter)
from bokeh.palettes import brewer, RdBu, Reds
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import transform

%config Completer.use_jedi = False
output_notebook()

Data source (Kaggle): https://www.kaggle.com/adammaus/predicting-churn-for-bank-customers?select=Churn_Modelling.csv

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
data_path = pathlib.Path('data') / 'train.csv'
# Read the raw table into a DataFrame.
data = pd.read_csv(data_path)

In [3]:
# Print the distinct values of every column that has fewer than 10 unique values.
for col in data.columns:
    if data[col].nunique() >= 10:
        continue
    print(f' {col}: {data[col].unique()}')

 Geography: ['Germany' 'France' 'Spain']
 Gender: ['Female' 'Male']
 NumOfProducts: [1 2 3 4]
 HasCrCard: [1 0]
 IsActiveMember: [1 0]
 Exited: [0 1]


In [4]:
# One-hot encode the categorical columns: each category value becomes its own
# indicator column (named after the value) and the source column is removed.
cols_to_consider = ['Geography', 'Gender']
for col in cols_to_consider:
    # Skip columns that were already encoded, so re-running this cell does not
    # raise a KeyError on the (now missing) source column.
    if col not in data.columns:
        continue
    data = pd.concat([data.drop(columns=col), pd.get_dummies(data[col])], axis=1)
    

Does it matter where the `Exited` column is positioned in the DataFrame?

In [5]:
# Name of the label column; every other column is treated as a model input.
target = 'Exited'
feature = [name for name in data.columns if name != target]
# Show the feature names as they appear when the first row is turned into a dict.
data.loc[0, feature].to_dict().keys()

dict_keys(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'France', 'Germany', 'Spain', 'Female', 'Male'])

In [6]:
# Column labels of the encoded frame (what DataFrame.keys() returns).
data.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'France', 'Germany',
       'Spain', 'Female', 'Male'],
      dtype='object')

## Step 2:

In [7]:
import json
from sklearn import preprocessing

# Standardise every feature column. fit_transform() both learns the per-column
# statistics and applies them, so no separate fit() call is needed beforehand.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in this notebook — confirm this is intended.
scaler = preprocessing.StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(data[feature]),
    columns=feature,
    index=data.index,  # keep the original row labels so the target aligns row-for-row
)
scaled_data[target] = data[target]

# Per-feature centre and scale learned by the scaler, keyed by feature name.
scaler_means = dict(zip(feature, scaler.mean_))
scaler_sigmas = dict(zip(feature, scaler.scale_))

# Write both mappings to JSON files in the working directory.
with open('scaler_means.json', 'w') as fout:
    json.dump(scaler_means, fout)

with open('scaler_sigmas.json', 'w') as fout:
    json.dump(scaler_sigmas, fout)

In [8]:
def plot_confusion_matrix(y_true, y_predicted):
    """Draw a confusion-matrix heat map with Bokeh and display it.

    Each cell is coloured and annotated with the number of samples that have
    the given (true, predicted) label pair; the plot title reports the overall
    accuracy as a percentage rounded to two decimals.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_predicted : array-like
        Predicted class labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The figure is displayed via ``show``; nothing is returned.
    """
    from sklearn import metrics

    # Compare by position: converting to arrays sidesteps pandas index
    # alignment and also lets plain Python lists be passed in.
    y_true = np.asarray(y_true)
    y_predicted = np.asarray(y_predicted)

    accuracy = np.round(100*(y_true == y_predicted).astype(int).sum()/len(y_predicted), 2)

    # Use the actual class labels (sorted union of both inputs) for the matrix
    # rows/columns and the plot axes, instead of assuming the classes are 0 and 1.
    labels = np.unique(np.concatenate([y_true, y_predicted]))
    values = [str(label) for label in labels]

    confusion = pd.DataFrame(
        metrics.confusion_matrix(y_true, y_predicted, labels=labels),
        index=values,
        columns=values,
    )
    confusion.index.name = "True"
    confusion.columns.name = "Predicted"
    # Long format: one row per (True, Predicted) pair with its count in "value".
    confusion = confusion.stack().rename("value").reset_index()
    confusion['True'] = confusion['True'].astype(str)
    confusion['Predicted'] = confusion['Predicted'].astype(str)

    source = ColumnDataSource(confusion)

    palette = brewer['RdBu'][10]
    color_mapper = LinearColorMapper(
        palette = palette, 
    )

    # NOTE(review): `plot_width`/`plot_height` here and `render_mode` on the
    # LabelSet below belong to the pre-3.0 Bokeh API — confirm the installed
    # Bokeh version before upgrading.
    p = figure(
        plot_width = 300, 
        plot_height = 300, 
        title = f'Confusion Matrix: Overall accuracy = {accuracy}%',
        x_range = list(values), 
        y_range = list(values),
        x_axis_label = 'Predicted',
        y_axis_label = 'True',
        tools = 'hover', 
        x_axis_location="below",
    )

    # One unit square per matrix cell, filled according to its count.
    p.rect(
        x = 'Predicted', 
        y = 'True', 
        width = 1, 
        height = 1, 
        source = source,
        line_color = 'grey', 
        fill_color = transform('value', color_mapper),
    )

    p.hover.tooltips = [
        ("True", "@{True}"),
        ("Predicted", "@{Predicted}"),
        ("Count", "@value"),
    ]

    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "14px"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0
    
    # Overlay the count on each cell.
    labels = LabelSet(x='Predicted', y='True', text='value',
                      render_mode='canvas', text_color = 'white',
                      x_offset = 50, y_offset = 50, source=source,)

    p.add_layout(labels)

    show(p)

In [9]:
from sklearn.model_selection import train_test_split
# Stratified split (class proportions of the target preserved in both parts),
# seeded for reproducibility.
train, test = train_test_split(
    scaled_data,
    stratify=scaled_data[target],
    random_state=0,
)
# Separate model inputs from the label for each part.
x_train, x_test = train[feature], test[feature]
y_train, y_test = train[target], test[target]

In [52]:
# Request body: one row of the standardised test features as a {feature: value} dict.
payload = x_test.iloc[10].to_dict()

In [53]:
import requests

In [54]:
# Root URL of the service running on localhost port 5000 (queried in the next cell).
url = 'http://127.0.0.1:5000/'

In [55]:
# GET the root URL and display the raw response body.
requests.get(url).content

b'App is Healthy'

In [56]:
# Endpoint that receives the feature payload.
# NOTE(review): presumably served by the model pickled to 'gradient.pkl' later in
# this notebook — confirm against the app code.
url = 'http://127.0.0.1:5000/gradient'

In [57]:
# POST the payload as a JSON request body.
r = requests.post(url, json = payload)

In [58]:
# Raw bytes of the response body.
r.content

b'0\n'

# Possible Models:

## Linear Regression:

In [None]:
# Collects one score per model, keyed by a human-readable model name.
model_dic = dict()

In [None]:
import statsmodels.formula.api as smf

# Regress the target on all features with ordinary least squares, using a
# "target ~ f1 + f2 + ..." formula built from the feature names.
formula = target + " ~ " + " + ".join(feature)

model = smf.ols(formula=formula, data=train)
fit_model = model.fit()

# OLS yields continuous values; threshold at 0.5 to obtain 0/1 class labels.
predictions = fit_model.predict(test[feature])
predictions = (predictions > 0.5).astype(int)

plot_confusion_matrix(y_true=y_test.values, y_predicted=predictions.values)

In [None]:
# Score the linear model by the accuracy of its thresholded predictions on the
# test split. fit_model.rsquared is an R² computed on the training data, which
# is a different quantity from the `clf.score(x_test, y_test)` values stored
# for the other models, so it cannot be ranked against them.
score = float(np.mean(np.asarray(predictions) == np.asarray(y_test)))

model_dic['Linear Regression'] = score

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training split, then score it on
# the test split.
clf = LogisticRegression(solver='newton-cg', random_state=0).fit(x_train, y_train)
score = clf.score(x_test, y_test)


In [None]:
# Predict the test split and visualise the resulting confusion matrix.
predictions = clf.predict(x_test)
plot_confusion_matrix(y_true=y_test, y_predicted=predictions)

In [None]:
# Record the score computed above under this model's name.
model_dic['Logistic Regression']= score

## Decision Tree

In [None]:
from sklearn import tree
# Fit a decision tree on the training split and score it on the test split.
# random_state is fixed, as for the other models in this notebook, so the
# fitted tree and its score are reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=0)

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)

In [None]:
# Predict the test split and visualise the resulting confusion matrix.
predictions = clf.predict(x_test)
plot_confusion_matrix(y_true=y_test, y_predicted=predictions)

In [None]:
# Record the score computed above under this model's name.
model_dic['Decision Tree']= score

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the training split, then score it on the test split.
clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
score = clf.score(x_test, y_test)

In [None]:
# Predict the test split and visualise the resulting confusion matrix.
y_hat = clf.predict(x_test)
plot_confusion_matrix(y_true=y_test, y_predicted=y_hat)

In [None]:
# Record the score computed above under this model's name.
model_dic['Random Forest']= score

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a seeded gradient-boosting classifier (default hyper-parameters) on the
# training split, then score it on the test split.
clf = GradientBoostingClassifier(random_state=0).fit(x_train, y_train)
score = clf.score(x_test, y_test)

In [None]:
# Predict the test split and visualise the resulting confusion matrix.
predictions = clf.predict(x_test)
plot_confusion_matrix(y_true=y_test, y_predicted=predictions)

In [None]:
# Record the score computed above under this model's name.
model_dic['Gradient Forest']= score

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fit a seeded AdaBoost classifier on the training split, then score it on the
# test split.
clf = AdaBoostClassifier(random_state=0).fit(x_train, y_train)
score = clf.score(x_test, y_test)

In [None]:
# Predict the test split and visualise the resulting confusion matrix.
predictions = clf.predict(x_test)
plot_confusion_matrix(y_true=y_test, y_predicted=predictions)

In [None]:
# Record the score computed above under this model's name.
model_dic['Adaboost']= score

## Choosing the best model:


In [None]:
# Display every recorded model name with its score.
model_dic

In [None]:
# Pick the entry with the highest score. max() is used instead of a running
# best initialised to 0, so a model is still selected when every recorded
# score is zero or negative; an empty dict falls back to ("", 0).
model, accuracy = max(model_dic.items(), key=lambda item: item[1], default=("", 0))

print(model, accuracy)

## Step 3:

# How do I fit a model other than AdaBoost?
# Do I need to change the app file?


In [None]:
import pickle
from sklearn.ensemble import GradientBoostingClassifier


# Hyper-parameters for the gradient-boosting model that gets serialised.
# NOTE(review): learning_rate=0.01 differs from the default-parameter model
# scored in the model-comparison section above — confirm which one should ship.
gradient_params = dict(learning_rate=0.01, n_estimators=100)

# Train on the training split, then pickle the fitted estimator to disk.
clf = GradientBoostingClassifier(random_state=0, **gradient_params).fit(x_train, y_train)
with open('gradient.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Requests to Endpoints

In [None]:
import requests

# URL of the '/gradient' endpoint on the service running on localhost port 5000.
gradient_url = 'http://127.0.0.1:5000/gradient'

In [None]:
# Row label of the example to send.
index = 100
# Feature values of that row taken from `data` (the one-hot-encoded frame, not
# the standardised `scaled_data`), as a {feature: value} dict.
data_dict = data.loc[index, feature].to_dict()
data_dict

In [None]:
# POST the example row as JSON to the gradient endpoint.
response = requests.post(gradient_url, json=data_dict)
# Surface a failed request as an HTTP error here, rather than as a confusing
# JSON decoding error on the next line.
response.raise_for_status()
g_prediction = response.json()

print(f'{index}: Gradient -> {g_prediction}')

In [None]:
# Display the Response object from the request above.
response