In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import classification_report 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from cancer_estimator_model import datasets

# Carregar o dataset 
df = datasets.get_integrated_dataset()
display(df.head())

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,...,NONE_EXPERIENCING,GENDER_TRANSGENDER,SEVERITY_MILD,SEVERITY_MODERATE,SEVERITY_NONE,SEVERITY_SEVERE,CONTACT_DONT_KNOW,CONTACT_NO,CONTACT_YES,COUNTRY
0,69.0,0,1.0,1.0,0.0,0.0,1,0.0,1.0,1.0,...,,,,,,,,,,
1,74.0,1,0.0,0.0,0.0,1.0,1,1.0,0.0,0.0,...,,,,,,,,,,
2,59.0,0,0.0,0.0,1.0,0.0,1,0.0,1.0,0.0,...,,,,,,,,,,
3,63.0,1,1.0,1.0,0.0,0.0,0,0.0,0.0,1.0,...,,,,,,,,,,
4,63.0,0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0,...,,,,,,,,,,


In [2]:
# Dividir o conjunto de dados em features (X) e variável alvo (y)
X = df.drop(columns=['LUNG_CANCER','COUNTRY'])
y = df['LUNG_CANCER']

# Dividir o conjunto de dados em train and test
X_train = X[X.source != 3]  # dropa os dados que tem a source 3 (dataset covid)
X_test = X[X.source == 3]   # coloca somente os dados com a source 3 no teste (dataset covid)
y_train = y[X.source != 3]  # dropa os dados que tem a source 3 (dataset covid) na target
y_test = y[X.source == 3]   # coloca somente os dados com a source 3 no teste (dataset covid) na target

# Definir colunas categóricas
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Criar o pré-processador para codificar variáveis categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Aplicar pré-processamento aos dados
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Criar o modelo CatBoostClassifier
class_weights = [50, 50]
model = CatBoostClassifier(iterations=5, depth=3, learning_rate=0.1, loss_function='Logloss', class_weights=class_weights)

# Definir o número de dobras para a validação cruzada
n_folds = 4

# Criar o objeto de validação cruzada
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Realizar a validação cruzada
scores = cross_val_score(model, X_train_encoded, y_train, cv=kf, scoring="f1")

# Exibir as pontuações de validação cruzada
print("Pontuações de validação cruzada:", scores)

# Exibir a média das pontuações de validação cruzada
print("Média das pontuações de validação cruzada:", np.mean(scores))

# Ajustar o modelo aos dados de treino completos
model.fit(X_train_encoded, y_train)

# Fazer a predição no conjunto de dados de teste
y_pred = model.predict(X_test_encoded)

# Exibir as métricas de validação do modelo (Teria que colocar o P@k aqui)
print(classification_report(y_test, y_pred))

# Matriz de confusão
cf_matrix = confusion_matrix(y_test, y_pred)
cf_matrix

# Criando o heatmap
sns.heatmap(cf_matrix, cmap='coolwarm', annot=True, linewidth=1, fmt='d')
plt.show()

# Salvar as predições em um arquivo CSV
output = pd.DataFrame({'Tem Cancer': y_pred})
output.to_csv('Output_cancer.csv', index=False)
print("Your output file was successfully saved!")

ValueError: 
All the 4 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2383, in _catboost.get_float_feature
  File "_catboost.pyx", line 1188, in _catboost._FloatOrNan
  File "_catboost.pyx", line 983, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b'P1'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 5220, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2385, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2265, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1503, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 848, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1481, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "_catboost.pyx", line 4159, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4209, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4025, in _catboost._PoolBase._init_features_order_layout_pool
  File "_catboost.pyx", line 2963, in _catboost._set_features_order_data_pd_data_frame
  File "_catboost.pyx", line 2427, in _catboost.create_num_factor_data
  File "_catboost.pyx", line 2385, in _catboost.get_float_feature
_catboost.CatBoostError: Bad value for num_feature[non_default_doc_idx=231,feature_idx=28]="P1": Cannot convert 'b'P1'' to float

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2383, in _catboost.get_float_feature
  File "_catboost.pyx", line 1188, in _catboost._FloatOrNan
  File "_catboost.pyx", line 983, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b'P10'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 5220, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2385, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2265, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1503, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 848, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1481, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "_catboost.pyx", line 4159, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4209, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4025, in _catboost._PoolBase._init_features_order_layout_pool
  File "_catboost.pyx", line 2963, in _catboost._set_features_order_data_pd_data_frame
  File "_catboost.pyx", line 2427, in _catboost.create_num_factor_data
  File "_catboost.pyx", line 2385, in _catboost.get_float_feature
_catboost.CatBoostError: Bad value for num_feature[non_default_doc_idx=226,feature_idx=28]="P10": Cannot convert 'b'P10'' to float

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2383, in _catboost.get_float_feature
  File "_catboost.pyx", line 1188, in _catboost._FloatOrNan
  File "_catboost.pyx", line 983, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b'P1'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 5220, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2385, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2265, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1503, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 848, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1481, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "_catboost.pyx", line 4159, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4209, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4025, in _catboost._PoolBase._init_features_order_layout_pool
  File "_catboost.pyx", line 2963, in _catboost._set_features_order_data_pd_data_frame
  File "_catboost.pyx", line 2427, in _catboost.create_num_factor_data
  File "_catboost.pyx", line 2385, in _catboost.get_float_feature
_catboost.CatBoostError: Bad value for num_feature[non_default_doc_idx=249,feature_idx=28]="P1": Cannot convert 'b'P1'' to float

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2383, in _catboost.get_float_feature
  File "_catboost.pyx", line 1188, in _catboost._FloatOrNan
  File "_catboost.pyx", line 983, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b'P1'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 5220, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2385, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 2265, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1503, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 848, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "C:\Users\oscar\PycharmProjects\PO235\model\.venv\Lib\site-packages\catboost\core.py", line 1481, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "_catboost.pyx", line 4159, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4209, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4025, in _catboost._PoolBase._init_features_order_layout_pool
  File "_catboost.pyx", line 2963, in _catboost._set_features_order_data_pd_data_frame
  File "_catboost.pyx", line 2427, in _catboost.create_num_factor_data
  File "_catboost.pyx", line 2385, in _catboost.get_float_feature
_catboost.CatBoostError: Bad value for num_feature[non_default_doc_idx=221,feature_idx=28]="P1": Cannot convert 'b'P1'' to float
