In [1]:
from trainer import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw training and test tables from the working directory.
# The "efs_time" column is removed from the training frame right away, so only
# "efs" remains as a target-like column for the cells below.
train_df = pd.read_csv("train.csv").drop(columns=["efs_time"])
test_df = pd.read_csv("test.csv")

# Show the training frame as the cell output.
train_df

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,28795,Intermediate - TED AML case <missing cytogenetics,,Favorable,No,2.0,8.0,No TBI,No,6.0,...,,Bi-directional non-permissive,,"N/A, Mel not given",8.0,,2.0,No,10.0,0.0
28796,28796,High,No,Poor,Yes,1.0,4.0,No TBI,No,5.0,...,No,GvH non-permissive,Related,"N/A, Mel not given",6.0,Yes,1.0,Yes,8.0,1.0
28797,28797,TBD cytogenetics,,Poor,,2.0,8.0,No TBI,,6.0,...,,GvH non-permissive,Unrelated,"N/A, Mel not given",8.0,,2.0,No,10.0,0.0
28798,28798,N/A - non-malignant indication,No,Poor,No,1.0,4.0,No TBI,No,3.0,...,No,Permissive mismatched,Related,MEL,4.0,No,1.0,No,5.0,0.0


In [3]:
# Hold out 10% of the rows as an external evaluation set; the fixed
# random_state makes the split repeatable from a freshly loaded frame.
# NOTE(review): this rebinds train_df, so re-running this cell without first
# re-running the load cell splits the already-reduced frame a second time.
holdout_split = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
)
train_df, eval_df = holdout_split

# Show the remaining (training) portion as the cell output.
train_df

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs
14856,14856,High,No,Favorable,No,2.0,7.0,TBI + Cy +- Other,,5.0,...,No,HvG non-permissive,Related,"N/A, Mel not given",7.0,No,2.0,No,7.0,1.0
15925,15925,N/A - pediatric,No,Favorable,,2.0,8.0,No TBI,No,6.0,...,,Permissive mismatched,Multiple donor (non-UCB),"N/A, Mel not given",8.0,No,2.0,No,10.0,1.0
8528,8528,Low,No,,No,,,TBI + Cy +- Other,No,3.0,...,No,,Related,MEL,5.0,No,,No,6.0,1.0
21845,21845,N/A - non-malignant indication,No,Poor,No,1.0,5.0,"TBI +- Other, <=cGy",No,4.0,...,No,HvG non-permissive,Related,MEL,5.0,No,1.0,No,6.0,1.0
23607,23607,High - TED AML case <missing cytogenetics,No,Intermediate,No,2.0,,No TBI,No,6.0,...,No,,Related,"N/A, Mel not given",8.0,No,2.0,Not done,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,21575,Intermediate,No,Poor,No,,,"TBI +- Other, >cGy",No,,...,Yes,,Related,"N/A, Mel not given",,,,No,,0.0
5390,5390,Low,No,Intermediate,No,2.0,6.0,No TBI,No,5.0,...,Yes,Permissive mismatched,Related,"N/A, Mel not given",7.0,No,1.0,Yes,9.0,0.0
860,860,Intermediate,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,No,Permissive mismatched,Unrelated,MEL,8.0,No,2.0,No,10.0,0.0
15795,15795,TBD cytogenetics,No,Poor,No,,,No TBI,No,,...,No,,Related,"N/A, Mel not given",,No,,No,,0.0


In [4]:
# Values that are repeated across the preprocessor, the datasets and the trainer.
label_column = "efs"
random_seed = 42

# A single preprocessing object is shared by the train, eval and inference datasets.
preprocessor = PreprocessingTool(
    val_folds=True,
    n_folds=5,
    seed=random_seed,
    prob_type="regression",
    drop_non_categorical_text=False,
)

train_dataset = TabularDataset(
    train_df,
    label=label_column,
    preprocessor=preprocessor,
    type="train",
)
eval_dataset = TabularDataset(
    eval_df,
    label=label_column,
    preprocessor=preprocessor,
    type="eval",
)
# No label is passed for the inference dataset.
test_dataset = TabularDataset(
    test_df,
    preprocessor=preprocessor,
    type="infer",
)

# Only the SGD linear model is searched in this run. The original cell also
# listed "LGB", "XGB", "CAT" and "RF" as commented-out entries.
candidate_models = ["SGD_LINEAR"]

trainer = Trainer(
    train_dataset,
    eval_dataset=eval_dataset,
    eval_metric="rmse",
    models=candidate_models,
    early_stopping_rounds=0,
    of_mitigation_level=0.2,
    use_gpu=True,
    use_cuda=True,
    n_trials=1000,
    timeout=200,
    meta_timeout=100,
    seed=random_seed,
    select_top=3,
    train_meta=True,
)

# Run the search / training; progress and leaderboards are printed as cell output.
trainer.train()

LABEL: efs
Problem type: regression. 
-----------------------------------------------------------------------------------------------------------------------------------------
IMPUTER:

Done. 
-----------------------------------------------------------------------------------------------------------------------------------------
DTYPES DETECTOR:

Found 3 unique raw np.dtype(s): [dtype('int64') dtype('O') dtype('float64')].
Dropped 1 column(s) with index ID / non-categorical text features.
Converted 0 column(s) to numeric types.
Remaining features: 1 int feat(s), 21 float feat(s), 35 categorical feat(s).
-----------------------------------------------------------------------------------------------------------------------------------------
5-FOLD STRATIFIEDKFOLDREG TRAIN-VAL SPLITTER:

Done.
-----------------------------------------------------------------------------------------------------------------------------------------
SCALER:

Done.
---------------------------------------------

  0%|          | 0/1000 [00:00<?, ?it/s]

Trial 0 finished with values: {'mean_train_rmse': 0.6001455637934189, 'mean_val_rmse': 0.5997810906333515, 'optimized_metric': 0.5998539852653649},
and parameters: {'alpha': 0.05302575578123925, 'penalty': 'elasticnet', 'l1_ratio': 0.28266516279209064, 'learning_rate': 'invscaling', 'eta0': 0.05872819412991352, 'power_t': 0.4809412746716797, 'max_iter': 1575, 'tol': 4.841088404394437e-05, 'shuffle': False, 'early_stopping': False, 'n_iter_no_change': 6}.

Trial 1 finished with values: {'mean_train_rmse': 0.5598011031708087, 'mean_val_rmse': 0.5667542359256589, 'optimized_metric': 0.5681448624766289},
and parameters: {'alpha': 1.929475747073748e-06, 'penalty': 'l1', 'l1_ratio': 0.7187692040615182, 'learning_rate': 'adaptive', 'eta0': 0.011811476722751488, 'power_t': 0.30523241276642055, 'max_iter': 945, 'tol': 2.7091302711408002e-05, 'shuffle': True, 'early_stopping': False, 'n_iter_no_change': 10}.

Trial 2 finished with values: {'mean_train_rmse': 32662428.590611167, 'mean_val_rmse': 

Unnamed: 0,id,mean_train_rmse,mean_val_rmse,optimized_metric,params_alpha,params_early_stopping,params_eta0,params_l1_ratio,params_learning_rate,params_max_iter,params_n_iter_no_change,params_penalty,params_power_t,params_shuffle,params_tol,datetime_start,datetime_complete,duration,state,model
0,SGD_LINEAR_6,0.564126,0.566724,0.567244,2e-06,False,0.002236,0.660004,invscaling,883,4,elasticnet,0.353894,True,5.1e-05,2024-12-14 22:46:26.489056,2024-12-14 22:46:29.861363,0 days 00:00:03.372307,COMPLETE,"SGDRegressor(alpha=2.4017580789420376e-06, eta..."
1,SGD_LINEAR_1,0.559801,0.566754,0.568145,2e-06,False,0.011811,0.718769,adaptive,945,10,l1,0.305232,True,2.7e-05,2024-12-14 22:43:44.418894,2024-12-14 22:44:04.496646,0 days 00:00:20.077752,COMPLETE,"SGDRegressor(alpha=1.929475747073748e-06, eta0..."
2,SGD_LINEAR_3,0.577023,0.577314,0.577372,0.007207,True,0.000138,0.846961,constant,1844,5,elasticnet,0.438216,False,2.1e-05,2024-12-14 22:45:56.216881,2024-12-14 22:46:02.796366,0 days 00:00:06.579485,COMPLETE,"SGDRegressor(alpha=0.007207180438482916, early..."


------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 


TRAINING ELASTICNET META-LEARNER ...

Retraining best found models on each fold and generating oof preds ...
Training meta-learner using Optuna...
Done:
Number of models tried by Optuna: 1622.
Best ElasticNet parameters: {'alpha': 0.007691907700707261, 'l1_ratio': 0.20253576171765642, 'max_iter': 6634}
Best rmse score: -0.5643361608359007
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Unnamed: 0,id,mean_train_rmse,mean_val_rmse,optimized_metric,model
0,SGD_LINEAR_6,0.564126,0.566724,0.567244,"SGDRegressor(alpha=2.4017580789420376e-06, eta..."
1,SGD_LINEAR_1,0.559801,0.566754,0.568145,"SGDRegressor(alpha=1.929475747073748e-06, eta0..."
2,SGD_LINEAR_3,0.577023,0.577314,0.577372,"SGDRegressor(alpha=0.007207180438482916, early..."


------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 


Generated ensemble model. To use it, call .predict() / .predict_proba() method on trainer object.


------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 


External Evaluation LEADERBOARD:


Unnamed: 0,id,model,custom_rmse,mae,mse,rmse,r2
0,VotingEnsemble,"VotingRegressor(estimators=[('SGD_LINEAR_6',\n...",0.571001,0.420213,0.206721,0.454666,0.170016
1,SGD_LINEAR_1,"SGDRegressor(alpha=1.929475747073748e-06, eta0...",0.572822,0.40952,0.206123,0.454007,0.172419
2,MetaLearner,"ElasticNet(alpha=0.007691907700707261, l1_rati...",0.572822,0.41813,0.20618,0.454071,0.172187
3,SGD_LINEAR_6,"SGDRegressor(alpha=2.4017580789420376e-06, eta...",0.574638,0.41466,0.206753,0.454701,0.169887
4,SGD_LINEAR_3,"SGDRegressor(alpha=0.007207180438482916, early...",0.57735,0.438999,0.214039,0.462643,0.140636


------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 


Retraining best found models on whole available data...
Done. Trainer is ready for inference and saved at path: None
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 




In [5]:
# Run the inference dataset's processing step on the test data.
# NOTE(review): presumably this reuses the preprocessor state fitted during
# training (imputer / scaler / encoding) — confirm in the trainer module.
processed_test_data = test_dataset.process()
# Show the processed frame as the cell output.
processed_test_data

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,...,donor_related_Related,donor_related_Unrelated,melphalan_dose_MEL,melphalan_dose_N_A__Mel_not_given,cardiac_No,cardiac_Not_done,cardiac_Yes,pulm_moderate_No,pulm_moderate_Not_done,pulm_moderate_Yes
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Predict on the processed test data with mode="meta".
# NOTE(review): presumably "meta" selects the meta-learner fitted during
# trainer.train() rather than the voting ensemble — confirm against Trainer.predict.
preds = trainer.predict(processed_test_data, mode="meta")
# Show the predictions as the cell output.
preds

array([0.17022137, 0.55481032, 0.1286355 ])

In [8]:
# Pair each test-row ID with its prediction. The frame takes its index from
# the "ID" Series, so preds must have one value per test row, in the same order.
submission_columns = {"ID": test_df["ID"], "prediction": preds}
submission = pd.DataFrame(submission_columns)

# Show the submission table as the cell output.
submission

Unnamed: 0,ID,prediction
0,28800,0.170221
1,28801,0.55481
2,28802,0.128636


In [9]:
# Write the submission to disk; index=False keeps the DataFrame's row index
# out of the file so only the frame's own columns are written.
submission.to_csv("submission_meta.csv", index=False)