In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:

%pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
X = data.drop(['Status','id'], axis=1)
y = data['Status']

In [5]:
X['Drug'] = X['Drug'].fillna('NA')

X['Ascites'] = X['Ascites'].replace(np.nan, X['Ascites'].mode()[0])  # Replace 'NaN' string

X['Hepatomegaly']= X['Hepatomegaly'].replace( np.nan, X['Hepatomegaly'].mode()[0])  # Replace 'NaN' string

X['Spiders'] =X['Spiders'].replace(np.nan, X['Spiders'].mode()[0])  # Replace 'NaN' string

In [6]:
# prompt: please train a catboost model

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostClassifier(iterations=1000,
                          learning_rate=0.05,
                          depth=6,
                          loss_function='MultiClass',
                          eval_metric='AUC',
                          random_seed=42,
                          verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=["Sex","Ascites","Hepatomegaly","Spiders","Drug","Edema"])


0:	test: 0.8005146	best: 0.8005146 (0)	total: 349ms	remaining: 5m 48s
100:	test: 0.8992156	best: 0.8992156 (100)	total: 4.31s	remaining: 38.4s
200:	test: 0.9061516	best: 0.9066927 (196)	total: 8.38s	remaining: 33.3s
300:	test: 0.9096323	best: 0.9099209 (289)	total: 12.3s	remaining: 28.6s
400:	test: 0.9117462	best: 0.9121951 (379)	total: 16.3s	remaining: 24.4s
500:	test: 0.9138427	best: 0.9146432 (489)	total: 20.3s	remaining: 20.2s
600:	test: 0.9140063	best: 0.9146967 (523)	total: 24.3s	remaining: 16.1s
700:	test: 0.9143562	best: 0.9146967 (523)	total: 28.4s	remaining: 12.1s
800:	test: 0.9140100	best: 0.9149651 (758)	total: 32.4s	remaining: 8.04s
900:	test: 0.9134417	best: 0.9149651 (758)	total: 36.4s	remaining: 4s
999:	test: 0.9132473	best: 0.9149651 (758)	total: 40.4s	remaining: 0us

bestTest = 0.9149650945
bestIteration = 758

Shrink model to first 759 iterations.


<catboost.core.CatBoostClassifier at 0x1c41d890b90>

In [7]:
# prompt: find logloss of the model using whole data X

from sklearn.metrics import log_loss

y_pred_proba = model.predict_proba(X)
logloss_score = log_loss(y, y_pred_proba)
print(f"Log Loss on the whole dataset: {logloss_score}")


Log Loss on the whole dataset: 0.3078127305058152


In [8]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.12 (from alembic>=1.5.0->optuna)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
   ---------------------------------------- 0.0/386.6 kB ? eta -:--:--
   ------------------------------------- -- 358.4/386.6 kB 7.4 MB/s eta 0:00:01
   ---------------------------------------- 386.6/386.6 kB 6.0 MB/s eta 0:00:00
Downloading alembic-1.15.2-py3-none-any.whl (231 kB)
   ---------------------------------------- 0.0/231.9 kB ? eta -:--:--
   ---------------------------------------- 231.9/231.9 kB 7.2 MB/s 

In [9]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cat_features=["Sex","Ascites","Hepatomegaly","Spiders","Drug","Edema"]
# Split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'task_type': 'CPU',
        'loss_function': 'MultiClass',
        'eval_metric': 'Accuracy',
        'verbose': 0,
        'early_stopping_rounds': 50
    }

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool)

    preds = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds)
    return acc  # or use AUC, F1, etc.

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Best hyperparameters
print("Best trial:")
print(study.best_trial.params)

[I 2025-04-20 03:06:01,527] A new study created in memory with name: no-name-04f7286f-79ac-48e7-99cd-0e32c2f40a87
[I 2025-04-20 03:06:10,019] Trial 0 finished with value: 0.8613333333333333 and parameters: {'iterations': 1437, 'depth': 9, 'learning_rate': 0.18821426963176907, 'l2_leaf_reg': 3.735228639144792, 'random_strength': 0.38012446280654055, 'bagging_temperature': 0.16395043859877412, 'border_count': 88}. Best is trial 0 with value: 0.8613333333333333.
[I 2025-04-20 03:06:22,572] Trial 1 finished with value: 0.8633333333333333 and parameters: {'iterations': 527, 'depth': 9, 'learning_rate': 0.11745996933910825, 'l2_leaf_reg': 9.399389466511835, 'random_strength': 0.3180882774222834, 'bagging_temperature': 0.38153748772573803, 'border_count': 78}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-04-20 03:06:30,096] Trial 2 finished with value: 0.8633333333333333 and parameters: {'iterations': 972, 'depth': 8, 'learning_rate': 0.27939538259849866, 'l2_leaf_reg': 9.717049214

Best trial:
{'iterations': 854, 'depth': 4, 'learning_rate': 0.10631986112815074, 'l2_leaf_reg': 4.52158851680351, 'random_strength': 0.3522853081812742, 'bagging_temperature': 0.35343469064062744, 'border_count': 46}


In [10]:
best_params = study.best_trial.params
# Remove 'iterations' from best_params to avoid conflict
del best_params['iterations']
best_model = CatBoostClassifier(**best_params, iterations=study.best_trial.params['iterations'])
best_model.fit(X, y, cat_features=cat_features)

0:	learn: 0.9739826	total: 29.1ms	remaining: 24.8s
1:	learn: 0.8803769	total: 59.3ms	remaining: 25.3s
2:	learn: 0.8079240	total: 88.4ms	remaining: 25.1s
3:	learn: 0.7501219	total: 120ms	remaining: 25.4s
4:	learn: 0.7026166	total: 149ms	remaining: 25.3s
5:	learn: 0.6626142	total: 178ms	remaining: 25.1s
6:	learn: 0.6292072	total: 208ms	remaining: 25.1s
7:	learn: 0.6013158	total: 244ms	remaining: 25.8s
8:	learn: 0.5777724	total: 276ms	remaining: 25.9s
9:	learn: 0.5570693	total: 306ms	remaining: 25.8s
10:	learn: 0.5396477	total: 335ms	remaining: 25.7s
11:	learn: 0.5244262	total: 365ms	remaining: 25.6s
12:	learn: 0.5107379	total: 393ms	remaining: 25.4s
13:	learn: 0.4990588	total: 418ms	remaining: 25.1s
14:	learn: 0.4888483	total: 440ms	remaining: 24.6s
15:	learn: 0.4799259	total: 465ms	remaining: 24.4s
16:	learn: 0.4719775	total: 491ms	remaining: 24.1s
17:	learn: 0.4654652	total: 513ms	remaining: 23.8s
18:	learn: 0.4593977	total: 538ms	remaining: 23.7s
19:	learn: 0.4536327	total: 562ms	rema

<catboost.core.CatBoostClassifier at 0x1c41f641400>

In [11]:
# prompt: find logloss of the model using whole data X

from sklearn.metrics import log_loss

y_pred_proba_best = best_model.predict_proba(X)
logloss_score = log_loss(y, y_pred_proba)
print(f"Log Loss on the whole dataset: {logloss_score}")

Log Loss on the whole dataset: 0.3078127305058152


In [12]:
X_test = pd.read_csv('test.csv')

In [13]:
X_test['Drug'] = X_test['Drug'].fillna('NA')

X_test['Ascites'] = X_test['Ascites'].replace(np.nan, X['Ascites'].mode()[0])  # Replace 'NaN' string

X_test['Hepatomegaly']= X_test['Hepatomegaly'].replace( np.nan, X['Hepatomegaly'].mode()[0])  # Replace 'NaN' string

X_test['Spiders'] =X_test['Spiders'].replace(np.nan, X['Spiders'].mode()[0])  # Replace 'NaN' string

In [14]:
y_pred_catboost_best = best_model.predict_proba(X_test.drop(['id'],axis=1))

In [15]:
catboost_model = []

In [None]:
# prompt: concat X_test id in y_pred_test

import pandas as pd
import os
# Assuming y_pred_test and X_test are defined as in your provided code.

# Create a DataFrame with the predictions
submission_df = pd.DataFrame(y_pred_catboost_best, columns=['Status_C', 'Status_CL','Status_D'])

# Concatenate the 'id' column from X_test to the predictions DataFrame
submission_df['id'] = X_test['id'].values

# Reorder columns to have 'id' as the first column
submission_df = submission_df[['id', 'Status_C', 'Status_CL','Status_D']]

# Display the first few rows of the submission DataFrame
print(submission_df.head())

# Save the submission file
if os.path.exists('submission_catboost_def.csv'):
    os.remove('submission_catboost_def.csv')

submission_df.to_csv('submission_catboost_def.csv', index=False)


      id  Status_C  Status_CL  Status_D
0  15000  0.825501   0.013726  0.160772
1  15001  0.664887   0.003862  0.331251
2  15002  0.966961   0.020486  0.012553
3  15003  0.242029   0.638844  0.119128
4  15004  0.209235   0.000811  0.789954
