In [1]:
%pip install lightgbm pytorch-tabnet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd 
import numpy as np 

import lightgbm as lgb

from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV




In [3]:
trd = pd.read_csv('train.csv')
ted = pd.read_csv('test.csv')

In [4]:
X = trd.drop(['Status'],axis=1)
y = trd['Status']


In [8]:
model = lgb.LGBMClassifier()

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Create a list of categorical columns to be encoded
categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

# Create an OrdinalEncoder instance
encoder = OrdinalEncoder()

# Fit the encoder on the training data for the specified columns
encoder.fit(X[categorical_cols])

# Transform both training and test data using the fitted encoder
X[categorical_cols] = encoder.transform(X[categorical_cols])
ted[categorical_cols] = encoder.transform(ted[categorical_cols])

In [14]:
param_grid = {
    'num_leaves': [30, 50, 100],
    'learning_rate': [0.01, 0.1,0.2, 0.5],
    'n_estimators': [100, 500, 1000],
}

# hyper parameter tuning 
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)  
grid_search.fit(X, y)  

print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2228
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 19
[LightGBM] [Info] Start training from score -0.395416
[LightGBM] [Info] Start training from score -3.724161
[LightGBM] [Info] Start training from score -1.195784
Best parameters found:  {'learning_rate': 0.01, 'n_estimators': 100, 'num_leaves': 100}
Best score:  0.8470666666666666


In [16]:
best_model_lgbm = lgb.LGBMClassifier(**grid_search.best_params_)
best_model_lgbm.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2228
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 19
[LightGBM] [Info] Start training from score -0.395416
[LightGBM] [Info] Start training from score -3.724161
[LightGBM] [Info] Start training from score -1.195784


In [17]:
y_pred_proba_LGBM = best_model_lgbm.predict_proba(ted)

In [18]:
def create_submission_file(y_pred_proba,X_test,model):
  submission_df = pd.DataFrame(y_pred_proba, columns=['Status_C', 'Status_CL','Status_D'])

  # Concatenate the 'id' column from X_test to the predictions DataFrame
  submission_df['id'] = X_test['id'].values

  # Reorder columns to have 'id' as the first column
  submission_df = submission_df[['id', 'Status_C', 'Status_CL','Status_D']]

  # Display the first few rows of the submission DataFrame
  print(submission_df.head())

  # Save the submission file
  submission_df.to_csv(f'submission_{model}.csv', index=False)

In [19]:
create_submission_file(y_pred_proba=y_pred_proba_LGBM,X_test=ted,model='LightGBM')

      id  Status_C  Status_CL  Status_D
0  15000  0.844955   0.021259  0.133786
1  15001  0.716081   0.013248  0.270672
2  15002  0.887399   0.012985  0.099616
3  15003  0.427267   0.355488  0.217245
4  15004  0.230409   0.011657  0.757934


In [8]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import LabelEncoder

# Install pytorch_tabnet if needed (uncomment if not already installed)
# %pip install pytorch-tabnet

y_train = y
X_train = X
X_test = ted
# Encode the target labels for TabNet training
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

# Convert DataFrames to numpy arrays and ensure the data type is float32 for TabNet
X_train_np = X_train.values.astype(np.float32)
X_test_np = X_test.values.astype(np.float32)

# Define and train the TabNet classifier
tabnet_model = TabNetClassifier()

tabnet_model.fit(
    X_train_np, y_train_enc,
    eval_set=[(X_train_np, y_train_enc)],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Predict probabilities on the test set
y_pred_proba_tabnet = tabnet_model.predict_proba(X_test_np)

# Map the predicted probabilities to columns.
# If the number of classes is two, we create two status columns; otherwise, we use three.
if len(le.classes_) == 2:
    columns = [f'Status_{cls}' for cls in le.classes_]
else:
    columns = ['Status_C', 'Status_CL', 'Status_D']

# Create a submission DataFrame using the same format as before
submission_df = pd.DataFrame(y_pred_proba_tabnet, columns=columns)
submission_df['id'] = X_test['id'].values
submission_df = submission_df[['id'] + columns]

print(submission_df.head())
submission_df.to_csv('submission_TabNet.csv', index=False)



ValueError: Input contains NaN.