In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Input files, given as relative paths.
data_path, dict_path = (
    Path('../data/raw/data.csv'),             # dataset
    Path('../data/external/dicionario.csv'),  # data dictionary
)

In [3]:
# Load the dataset. The 'FATIGUE ' and 'ALLERGY ' headers carry a trailing
# space, so they are renamed to stay consistent with the column names used in
# the data dictionary.
df = pd.read_csv(data_path).rename(
    columns={'FATIGUE ': 'FATIGUE', 'ALLERGY ': 'ALLERGY'}
)

# Load the data dictionary (semicolon-separated file).
df_dict = pd.read_csv(dict_path, sep=';')

### Preparação de dados

In [4]:
# Name of the column to predict.
target_column = 'LUNG_CANCER'

# Feature lists come from the data dictionary: the `Variavel` values of the
# rows whose `Tipo` is "Discreta" or "Nominal". The target is excluded from
# the nominal list.
discrete_columns = df_dict.query('Tipo == "Discreta"')['Variavel'].tolist()
nominal_columns = df_dict.query(
    'Tipo == "Nominal" and Variavel != @target_column'
)['Variavel'].tolist()

In [5]:
# Preprocessing for the discrete columns: impute, then standardize.
discrete_preprocessor = Pipeline([
    # Outlier treatment: placeholder, no step implemented
    ('missing', SimpleImputer(strategy='mean')),  # Missing values -> column mean
    # Feature selection: placeholder, no step implemented
    ('normalization', StandardScaler())  # Standardization
])


# Preprocessing for the nominal columns: impute, one-hot encode, then scale.
nominal_preprocessor = Pipeline([
    # Outlier treatment: placeholder, no step implemented
    ('missing', SimpleImputer(strategy='most_frequent')),  # Missing values -> most frequent value
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as all zeros instead of raising an error at transform/predict time.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),  # Categorical encoding
    # Feature selection: placeholder, no step implemented
    # with_mean=False scales without centering; centering is not supported on
    # the sparse matrix that OneHotEncoder returns by default.
    ('normalization', StandardScaler(with_mean=False)),  # Scaling
])

In [6]:
# Route each group of columns through its own preprocessor. Columns that are
# in neither list are dropped (ColumnTransformer's default remainder='drop').
preprocessing = ColumnTransformer([
    ("discrete", discrete_preprocessor, discrete_columns),
    ("nominal", nominal_preprocessor, nominal_columns)
])
# Fixed seed: LinearSVC's dual solver shuffles the data with a pseudo-random
# generator, so without it the fitted model can differ between runs.
model = LinearSVC(random_state=42)
# Full approach: preprocessing followed by the classifier.
approach = Pipeline([
    ("preprocessing", preprocessing),
    ("model", model)
])

In [7]:
# Features: every column except the target. `columns=` already identifies the
# axis, so no separate `axis` argument is passed.
X = df.drop(columns=[target_column])
# Target kept as a single-column DataFrame; later cells select the target
# column from it.
y = df[[target_column]]

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=42)

In [8]:
# Fit on the target as a 1-D Series: y_train is a single-column DataFrame, and
# passing a column vector where scikit-learn expects a 1-D y triggers a
# DataConversionWarning.
approach.fit(X_train, y_train[target_column])

In [9]:
# Predict labels for the held-out test features with the fitted pipeline.
y_test_hat = approach.predict(X_test)

In [10]:
# Accuracy on the test split: share of samples whose predicted label matches
# the true one. The target is selected through `target_column` rather than a
# hard-coded attribute name, so it follows the variable defined earlier.
acc = accuracy_score(y_test[target_column], y_test_hat)
print(f"Acurácia: {acc:.4f}")

Acurácia: 0.9839
