In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [2]:
data_path = Path('../data/raw/data.csv')
dict_path = Path('../data/external/dicionario.csv')

In [3]:
# Ler conjunto de dados
df = (
    pd
    .read_csv(data_path)
    .rename(columns={
        'FATIGUE ': 'FATIGUE',
        'ALLERGY ': 'ALLERGY'}
    ) #Para manter a consistência do nome das colunas no dicionário de dados
)
# Ler dicionário de dados
df_dict = pd.read_csv(dict_path, sep=';')

### Preparação de dados

In [4]:
target_column = 'LUNG_CANCER'
discrete_columns = (
    df_dict
    .query('Tipo == "Discreta"')
    .Variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query('Tipo == "Nominal" and Variavel != @target_column')
    .Variavel
    .to_list()
)

In [5]:
discrete_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='mean')), # Tratamento de dados faltantes
    # Seleção de variáveis
    ('normalization', StandardScaler()) # Normalização
])


nominal_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoder', OneHotEncoder()), # Codificação de variáveis
    # Seleção de variáveis
    ('normalization', StandardScaler(with_mean=False)), # Normalização    
])

In [6]:
preprocessing = ColumnTransformer([
    ("discrete", discrete_preprocessor, discrete_columns),
    ("nominal", nominal_preprocessor, nominal_columns)
])

models = [
    (
        "logistic regression",
        LogisticRegression(solver='saga'),
        {"penalty": ['l1', 'l2', 'elasticnet']}
    ),
    (
        "k nearest neighbors",
        KNeighborsClassifier(),
        {"n_neighbors": [1, 3, 5, 11, 15]}
    ),
    (
        "support vector machines",
        SVC(),
        {"kernel": ['linear', 'rbf']}
    ),
    (
        "naive bayes",
        BernoulliNB(),
        {"alpha": [0, 0.5, 1]}
    ),
]

In [7]:
X = df.drop(columns=[target_column], axis=1)
y = df[[target_column]].replace({"YES": 1, "NO": 0})

cv = ShuffleSplit(n_splits=30, train_size=0.8, random_state=42)

In [8]:
results = {}
for model_name, model, model_params in models:
    print(f'{model_name} run...')
    
    model_gs = GridSearchCV(model, model_params, scoring='accuracy')
    approach = Pipeline([
        ("preprocessing", preprocessing),
        ("model", model_gs)
    ])
    model_results = cross_validate(
        approach,
        X=X,
        y=y,
        scoring=['accuracy', 'f1', 'precision', 'recall'],
        cv=cv,
        n_jobs=-1,
        return_train_score=False
    )
    model_results['name'] = [model_name] * len(model_results['score_time'])
    if results:
        for key, value in model_results.items():
            results[key] = np.append(results[key], value)
    else:
        results = model_results

logistic regression run...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/madson/Workspace/atlantico/projects/lung-cancer/.venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/madson/Workspace/atlantico/pr

k nearest neighbors run...


5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/madson/Workspace/atlantico/projects/lung-cancer/.venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/madson/Workspace/atlantico/projects/lung-cancer/.venv/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1101, in fit
    raise ValueError(
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  

support vector machines run...


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return self._fit(X, y)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  ret

naive bayes run...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [9]:
df_results = pd.DataFrame(results)
df_results.groupby('name').agg([np.mean, np.std])

Unnamed: 0_level_0,fit_time,fit_time,score_time,score_time,test_accuracy,test_accuracy,test_f1,test_f1,test_precision,test_precision,test_recall,test_recall
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
k nearest neighbors,0.51599,0.139107,0.049454,0.020764,0.893011,0.034537,0.939183,0.020471,0.933933,0.038565,0.946236,0.031232
logistic regression,0.37152,0.077785,0.029781,0.014099,0.93172,0.029554,0.961368,0.017161,0.951669,0.029022,0.971883,0.017775
naive bayes,0.240818,0.114676,0.049423,0.031134,0.915054,0.032791,0.950751,0.019633,0.965321,0.033518,0.937496,0.023231
support vector machines,0.191691,0.063565,0.037278,0.019078,0.93172,0.032723,0.961293,0.019056,0.952825,0.03137,0.970625,0.019866
