In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import classification_report 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from cancer_estimator_model import datasets

# Carregar o dataset 
df = datasets.get_integrated_dataset()
display(df.head())

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,...,NONE_EXPERIENCING,GENDER_TRANSGENDER,SEVERITY_MILD,SEVERITY_MODERATE,SEVERITY_NONE,SEVERITY_SEVERE,CONTACT_DONT_KNOW,CONTACT_NO,CONTACT_YES,COUNTRY
0,69.0,0,1.0,1.0,0.0,0.0,1,0.0,1.0,1.0,...,,,,,,,,,,
1,74.0,1,0.0,0.0,0.0,1.0,1,1.0,0.0,0.0,...,,,,,,,,,,
2,59.0,0,0.0,0.0,1.0,0.0,1,0.0,1.0,0.0,...,,,,,,,,,,
3,63.0,1,1.0,1.0,0.0,0.0,0,0.0,0.0,1.0,...,,,,,,,,,,
4,63.0,0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0,...,,,,,,,,,,


In [2]:
# Dividir o conjunto de dados em features (X) e variável alvo (y)
X = df.drop(columns=['LUNG_CANCER','COUNTRY'])
y = df['LUNG_CANCER']

# Dividir o conjunto de dados em train and test
X_train = X[X.source != 3]  # dropa os dados que tem a source 3 (dataset covid)
X_test = X[X.source == 3]   # coloca somente os dados com a source 3 no teste (dataset covid)
y_train = y[X.source != 3]  # dropa os dados que tem a source 3 (dataset covid) na target
y_test = y[X.source == 3]   # coloca somente os dados com a source 3 no teste (dataset covid) na target

# Definir colunas categóricas
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Criar o pré-processador para codificar variáveis categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Aplicar pré-processamento aos dados
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Criar o modelo CatBoostClassifier
class_weights = [50, 50]
model = CatBoostClassifier(iterations=5, depth=3, learning_rate=0.1, loss_function='Logloss', class_weights=class_weights)

# Definir o número de dobras para a validação cruzada
n_folds = 4

# Criar o objeto de validação cruzada
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Realizar a validação cruzada
scores = cross_val_score(model, X_train_encoded, y_train, cv=kf, scoring="f1")

# Exibir as pontuações de validação cruzada
print("Pontuações de validação cruzada:", scores)

# Exibir a média das pontuações de validação cruzada
print("Média das pontuações de validação cruzada:", np.mean(scores))

# Ajustar o modelo aos dados de treino completos
model.fit(X_train_encoded, y_train)

# Fazer a predição no conjunto de dados de teste
y_pred = model.predict(X_test_encoded)

# Exibir as métricas de validação do modelo (Teria que colocar o P@k aqui)
#print(classification_report(y_test, y_pred))

# Salvar as predições em um arquivo CSV
output = pd.DataFrame({'Tem Cancer': y_pred})
output.to_csv('Output_cancer.csv', index=False)
print("Your output file was successfully saved!")

0:	learn: 0.6550161	total: 140ms	remaining: 558ms
1:	learn: 0.5511803	total: 142ms	remaining: 213ms
2:	learn: 0.3866317	total: 145ms	remaining: 96.4ms
3:	learn: 0.3357846	total: 147ms	remaining: 36.8ms
4:	learn: 0.3082935	total: 149ms	remaining: 0us
0:	learn: 0.5773423	total: 2.84ms	remaining: 11.4ms
1:	learn: 0.3908915	total: 5.28ms	remaining: 7.92ms
2:	learn: 0.3556332	total: 7.93ms	remaining: 5.29ms
3:	learn: 0.3452148	total: 10.4ms	remaining: 2.59ms
4:	learn: 0.2472841	total: 13.1ms	remaining: 0us
0:	learn: 0.5823380	total: 3.06ms	remaining: 12.2ms
1:	learn: 0.3983062	total: 5.45ms	remaining: 8.17ms
2:	learn: 0.3672573	total: 7.83ms	remaining: 5.22ms
3:	learn: 0.3588117	total: 10.5ms	remaining: 2.62ms
4:	learn: 0.3367007	total: 13.3ms	remaining: 0us
0:	learn: 0.6641338	total: 2.92ms	remaining: 11.7ms
1:	learn: 0.4668535	total: 5.88ms	remaining: 8.81ms
2:	learn: 0.3834639	total: 8.28ms	remaining: 5.52ms
3:	learn: 0.3248529	total: 10.4ms	remaining: 2.61ms
4:	learn: 0.2402242	total: 1