# Análise comparativa de modelos

In [90]:
# import libraries

from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Obtenção de dados

In [91]:
df = pd.read_csv("../data/raw/Churn_Modelling.csv")
dictionary = pd.read_csv("../data/external/dictionary.csv")
df, dictionary

(      RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
 0             1    15634602   Hargrave          619    France  Female   42   
 1             2    15647311       Hill          608     Spain  Female   41   
 2             3    15619304       Onio          502    France  Female   42   
 3             4    15701354       Boni          699    France  Female   39   
 4             5    15737888   Mitchell          850     Spain  Female   43   
 ...         ...         ...        ...          ...       ...     ...  ...   
 9995       9996    15606229   Obijiaku          771    France    Male   39   
 9996       9997    15569892  Johnstone          516    France    Male   35   
 9997       9998    15584532        Liu          709    France  Female   36   
 9998       9999    15682355  Sabbatini          772   Germany    Male   42   
 9999      10000    15628319     Walker          792    France  Female   28   
 
       Tenure    Balance  NumOfProducts  HasCrCard

## 2. Preparação de dados

In [92]:
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
dictionary = dictionary.drop(index=0)
df.isnull().sum().sum()
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [93]:
target_column = 'Exited'

nominal_columns = (
    dictionary
    .query("variavel in ['Geography', 'Gender'] and variavel != @target_column")
    .variavel
    .to_list()
)

rest_columns = (
    dictionary
    .query("variavel != ['Geography', 'Gender', @target_column]")
    .variavel
    .to_list()
)

X = df.drop(columns=[target_column], axis=1)
y = df[target_column]

In [100]:
x = df[rest_columns].values
x_mean = x.mean()
x_std = x.std()
z_score = np.abs((x - x_mean) / x_std)
pd.DataFrame(z_score)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.440369,0.452166,0.452983,0.453024,0.453004,0.453004,0.453004,1.618963
1,0.440594,0.452186,0.453004,1.260352,0.453004,0.453024,0.453004,1.847808
2,0.442761,0.452166,0.452861,2.811098,0.452963,0.453004,0.453024,1.876205
3,0.438734,0.452227,0.453004,0.453024,0.452983,0.453024,0.453024,1.465177
4,0.435647,0.452145,0.452983,2.112932,0.453004,0.453004,0.453004,1.163779
...,...,...,...,...,...,...,...,...
9995,0.437262,0.452227,0.452922,0.453024,0.452983,0.453004,0.453024,1.515143
9996,0.442475,0.452309,0.452820,0.719846,0.453004,0.453004,0.453004,1.626136
9997,0.438529,0.452288,0.452881,0.453024,0.453004,0.453024,0.453004,0.407378
9998,0.437241,0.452166,0.452963,1.081823,0.452983,0.453004,0.453024,1.445998


In [94]:
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), 
    ('encoding', OneHotEncoder(sparse_output=False)), 
    ('normalization', StandardScaler()) 
])
rest_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')), 
    ('normalization', StandardScaler()) 
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('rest', rest_preprocessor, rest_columns)
])

model = LogisticRegression()

In [95]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.00,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,699,France,Female,39,1,0.00,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


In [96]:
X = preprocessor.fit_transform(X)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.0,1.0,0.0,619.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88
1,0.0,0.0,1.0,1.0,0.0,608.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58
2,1.0,0.0,0.0,1.0,0.0,502.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57
3,1.0,0.0,0.0,1.0,0.0,699.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63
4,0.0,0.0,1.0,1.0,0.0,850.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,0.0,0.0,0.0,1.0,771.0,39.0,5.0,0.00,2.0,1.0,0.0,96270.64
9996,1.0,0.0,0.0,0.0,1.0,516.0,35.0,10.0,57369.61,1.0,1.0,1.0,101699.77
9997,1.0,0.0,0.0,1.0,0.0,709.0,36.0,7.0,0.00,1.0,0.0,1.0,42085.58
9998,0.0,1.0,0.0,0.0,1.0,772.0,42.0,3.0,75075.31,2.0,1.0,0.0,92888.52


## 3. Seleção de modelos

### 3.1 Resultados gerais

### 3.2 Persistência do modelo