In [17]:
from pathlib import Path
from IPython.display import Markdown
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [2]:
data_path = Path('../data/raw/data.csv')
dict_path = Path('../data/external/dicionario.csv')

In [3]:
# Ler conjunto de dados
df = (
    pd
    .read_csv(data_path)
    .rename(columns={
        'FATIGUE ': 'FATIGUE',
        'ALLERGY ': 'ALLERGY'}
    ) #Para manter a consistência do nome das colunas no dicionário de dados
)
# Ler dicionário de dados
df_dict = pd.read_csv(dict_path, sep=';')

### Preparação de dados

In [4]:
target_column = 'LUNG_CANCER'
discrete_columns = (
    df_dict
    .query('Tipo == "Discreta"')
    .Variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query('Tipo == "Nominal" and Variavel != @target_column')
    .Variavel
    .to_list()
)

In [5]:
discrete_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='mean')), # Tratamento de dados faltantes
    # Seleção de variáveis
    ('normalization', MinMaxScaler()) # Normalização
])


nominal_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoder', OneHotEncoder(sparse=False)), # Codificação de variáveis
    # Seleção de variáveis
    ('normalization', MinMaxScaler()), # Normalização    
])

In [6]:
preprocessing = ColumnTransformer([
    ("discrete", discrete_preprocessor, discrete_columns),
    ("nominal", nominal_preprocessor, nominal_columns)
])

models = [
    (
        "LR",
        LogisticRegression(solver='saga', max_iter=1000),
        {"penalty": ['none', 'l1', 'l2', 'elasticnet']}
    ),
    (
        "KNN",
        KNeighborsClassifier(metric='euclidean'),
        {"n_neighbors": [3, 5, 11, 15]}
    ),
    (
        "SVM",
        SVC(),
        {"kernel": ['linear', 'rbf']}
    ),
    (
        "NB",
        BernoulliNB(),
        {"alpha": [1e-3, 0.5, 1]}
    ),
]

In [7]:
X = df.drop(columns=[target_column], axis=1)
y = (
    df[[target_column]]
    .replace({"YES": 1, "NO": 0})
    .to_numpy()
    .ravel()
)

cv = ShuffleSplit(n_splits=30, train_size=0.8, random_state=42)

In [8]:
results = {}
for model_name, model, model_params in models:
    print(f'{model_name} run...')
    
    model_gs = GridSearchCV(model, model_params, scoring='accuracy')
    approach = Pipeline([
        ("preprocessing", preprocessing),
        ("model", model_gs)
    ])
    model_results = cross_validate(
        approach,
        X=X,
        y=y,
        scoring=['accuracy', 'f1', 'precision', 'recall'],
        cv=cv,
        n_jobs=-1,
        return_train_score=False
    )
    model_results['name'] = [model_name] * len(model_results['score_time'])
    if results:
        for key, value in model_results.items():
            results[key] = np.append(results[key], value)
    else:
        results = model_results

LR run...
KNN run...
SVM run...
NB run...


In [9]:
df_results = pd.DataFrame(results)
df_results.groupby('name').agg([np.mean, np.std])

Unnamed: 0_level_0,fit_time,fit_time,score_time,score_time,test_accuracy,test_accuracy,test_f1,test_f1,test_precision,test_precision,test_recall,test_recall
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
KNN,0.274828,0.114859,0.034625,0.016775,0.89086,0.034066,0.938755,0.019895,0.922402,0.038235,0.95726,0.026473
LR,0.754532,0.110262,0.018531,0.006576,0.92957,0.035052,0.96028,0.020059,0.948093,0.034621,0.973604,0.018576
NB,0.122782,0.023795,0.027958,0.008491,0.911828,0.033549,0.948971,0.020079,0.961677,0.033859,0.937496,0.023699
SVM,0.11643,0.027835,0.023365,0.006199,0.930108,0.032901,0.960376,0.018939,0.952913,0.033806,0.968783,0.018195


In [10]:
def highlight_max(s, props=''):
    values = [float(value.split()[0]) for value in s.values[1:]]
    result = [''] * len(s.values)
    if s.values[0].endswith('time'):
        result[np.argmin(values)+1] = props
    else:
        result[np.argmax(values)+1] = props
    return result

def get_winner(s):
    metric = s.values[0]
    values = [float(value.split()[0]) for value in s.values[1:]]
    models = results.columns[1:]
    
    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    else:
        return models[np.argmax(values)]

results = (
    pd
    .DataFrame(df_results)
    .groupby(['name'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
    # .set_index('score')
)
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

score,KNN,LR,NB,SVM
fit_time,0.275 ± 0.113,0.755 ± 0.108,0.123 ± 0.023,0.116 ± 0.027
score_time,0.035 ± 0.016,0.019 ± 0.006,0.028 ± 0.008,0.023 ± 0.006
test_accuracy,0.891 ± 0.033,0.930 ± 0.034,0.912 ± 0.033,0.930 ± 0.032
test_f1,0.939 ± 0.020,0.960 ± 0.020,0.949 ± 0.020,0.960 ± 0.019
test_precision,0.922 ± 0.038,0.948 ± 0.034,0.962 ± 0.033,0.953 ± 0.033
test_recall,0.957 ± 0.026,0.974 ± 0.018,0.937 ± 0.023,0.969 ± 0.018


O melhor modelo é o : **LR**

In [16]:
model_name, model, model_params  = [foo for foo in models if foo[0] == winner][0]
model_gs = GridSearchCV(model, model_params, scoring='accuracy')
approach = Pipeline([
    ("preprocessing", preprocessing),
    ("model", model_gs)
])
approach.fit(X, y)

In [19]:
joblib.dump(approach, '../models/model.joblib') 

['../models/model.joblib']