## Train

In [1]:
import pandas as pd
from pycaret.classification import setup, ClassificationExperiment, tune_model, create_model, compare_models, predict_model, dashboard

In [2]:
def prepare_data(
    df,
    flux_column='flux',
    drop_columns=('source_id', 'spectraltype_esphs', 'teff_gspphot', 'logg_gspphot', 'mh_gspphot'),
):
    """Flatten an array-valued flux column into one feature column per element.

    Each row's ``flux_column`` entry is expanded into separate columns labelled
    by position (0, 1, 2, ...). The columns named in ``drop_columns`` and the
    original flux column are removed, and the expanded columns are appended to
    the right of whatever columns remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain ``flux_column`` and every name listed in
        ``drop_columns``. The frame itself is not modified.
    flux_column : str, default 'flux'
        Name of the column holding one sequence of flux values per row.
    drop_columns : iterable of str
        Columns to discard in addition to ``flux_column``. The default is the
        set of metadata columns this notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by the expanded flux columns, on the
        same index as ``df``.

    Raises
    ------
    KeyError
        If ``flux_column`` or any entry of ``drop_columns`` is missing.
    """
    # Reuse the caller's index so the column-wise concat below aligns row by row.
    flux_wide = pd.DataFrame(df[flux_column].tolist(), index=df.index)
    unwanted = [*drop_columns, flux_column]
    remaining = df.drop(columns=unwanted)
    return pd.concat([remaining, flux_wide], axis=1)

In [3]:
# Training split: read the parquet file, then flatten its flux column into features.
train_data = pd.read_parquet('../../../data/Gaia DR3/train.parquet')
train_df = train_data.pipe(prepare_data)

# Test split: identical treatment, kept separate from the training frame.
test_data = pd.read_parquet('../../../data/Gaia DR3/test.parquet')
test_df = test_data.pipe(prepare_data)

In [4]:
# Summarise the prepared training frame (row count, column count, dtypes, memory).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14101 entries, 0 to 14100
Columns: 344 entries, Cat to 342
dtypes: float32(343), object(1)
memory usage: 18.6+ MB


In [5]:
# Same summary for the prepared test frame; its column layout should match train_df.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3526 entries, 0 to 3525
Columns: 344 entries, Cat to 342
dtypes: float32(343), object(1)
memory usage: 4.6+ MB


In [6]:
# Initialise a PyCaret classification experiment through the functional API,
# with 'Cat' as the target and session_id fixed to 123.
# No train_size is passed; the summary recorded below shows the 14101 input rows
# split into 9870 training rows and 4231 hold-out rows.
# NOTE(review): `s` is not referenced again in the visible cells.
s = setup(train_df, target='Cat', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Cat
2,Target type,Binary
3,Target mapping,"LM: 0, M: 1"
4,Original data shape,"(14101, 344)"
5,Transformed data shape,"(14101, 344)"
6,Transformed train set shape,"(9870, 344)"
7,Transformed test set shape,"(4231, 344)"
8,Numeric features,343
9,Preprocess,True


In [7]:
# Build a second experiment through the object-oriented API, this time with
# train_size=0.99 (13959 / 142 row split in the summary recorded below).
# NOTE(review): `exp` is not referenced again in the visible cells. The later
# compare_models / create_model / tune_model / predict_model / dashboard calls
# are the module-level functions, not `exp.<method>(...)`. Presumably they
# operate on the experiment created by the functional setup() call rather than
# on this one: the per-fold accuracies recorded for create_model are multiples
# of 1/987 (e.g. 942/987 = 0.9544), i.e. 10 folds of the 9870-row split.
# TODO confirm which split is intended and drop the unused experiment.
exp = ClassificationExperiment()
exp.setup(train_df, target='Cat', session_id=123, train_size=0.99)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Cat
2,Target type,Binary
3,Target mapping,"LM: 0, M: 1"
4,Original data shape,"(14101, 344)"
5,Transformed data shape,"(14101, 344)"
6,Transformed train set shape,"(13959, 344)"
7,Transformed test set shape,"(142, 344)"
8,Numeric features,343
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x2c6de0f0610>

In [8]:
# Cross-validate the candidate classifiers and keep the result as `best`.
# In the recorded run the leaderboard is headed by LightGBM, and `best` is
# reported as "Light Gradient Boosting Machine" when it is scored later on.
# NOTE(review): in the leaderboard below lr, dt, svm, ridge and rf all show the
# same Accuracy 0.6255 with AUC 0.5 and Kappa 0.0, i.e. no better than chance.
# The flux features previewed by pred.head() are of order 1e-15 to 1e-17;
# presumably these estimators cannot work with unscaled values this small.
# Consider rescaling the flux columns or enabling normalisation in setup — confirm.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.958,0.9877,0.958,0.9591,0.9581,0.9112,0.9119,1.31
xgboost,Extreme Gradient Boosting,0.9568,0.9882,0.9568,0.9579,0.957,0.9088,0.9095,1.333
knn,K Neighbors Classifier,0.9519,0.9795,0.9519,0.9527,0.952,0.8981,0.8986,0.611
lda,Linear Discriminant Analysis,0.7376,0.9516,0.7376,0.792,0.6954,0.3555,0.4443,0.177
nb,Naive Bayes,0.6526,0.9342,0.6526,0.7053,0.552,0.0992,0.1913,0.123
lr,Logistic Regression,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.902
dt,Decision Tree Classifier,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.151
svm,SVM - Linear Kernel,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.155
ridge,Ridge Classifier,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.126
rf,Random Forest Classifier,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,2.415


## Tune model

In [9]:
# Fit a LightGBM classifier (first 10-fold table below), then run a
# hyperparameter search on it using the Optuna search library (second table).
# In the recorded run PyCaret reports that the original model was better than
# the tuned one and was returned instead, so `tuned_lgbc` is the untuned model
# here and the second table shows metrics of the discarded tuned candidate.
lgbc = create_model('lightgbm')
tuned_lgbc = tune_model(lgbc, search_library='optuna')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9544,0.9857,0.9544,0.9556,0.9546,0.9036,0.9044
1,0.9595,0.9892,0.9595,0.9614,0.9597,0.9146,0.9161
2,0.9554,0.9843,0.9554,0.9571,0.9557,0.906,0.9072
3,0.9402,0.9824,0.9402,0.9417,0.9405,0.8738,0.8747
4,0.9645,0.9916,0.9645,0.9654,0.9647,0.925,0.9256
5,0.9585,0.9881,0.9585,0.9598,0.9587,0.9124,0.9133
6,0.9645,0.9898,0.9645,0.9651,0.9647,0.9249,0.9252
7,0.9615,0.989,0.9615,0.9623,0.9617,0.9186,0.9191
8,0.9625,0.9888,0.9625,0.9628,0.9626,0.9203,0.9204
9,0.9585,0.9883,0.9585,0.9598,0.9587,0.9124,0.9133


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9504,0.985,0.9504,0.9514,0.9506,0.8949,0.8955
1,0.9615,0.9905,0.9615,0.9631,0.9617,0.9188,0.92
2,0.9595,0.9862,0.9595,0.9611,0.9597,0.9146,0.9158
3,0.9463,0.9836,0.9463,0.9487,0.9467,0.8871,0.8888
4,0.9635,0.9909,0.9635,0.9643,0.9637,0.9228,0.9234
5,0.9554,0.9892,0.9554,0.9573,0.9557,0.9062,0.9076
6,0.9645,0.9909,0.9645,0.965,0.9646,0.9248,0.925
7,0.9574,0.9884,0.9574,0.9587,0.9577,0.9102,0.911
8,0.9564,0.9877,0.9564,0.9565,0.9565,0.9072,0.9072
9,0.9605,0.9891,0.9605,0.962,0.9607,0.9167,0.9178


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [10]:
# Score the model returned by tune_model on the separately loaded test frame.
# NOTE(review): `preds` is not used in the visible cells; the "Predict" section
# repeats this scoring with `best` and reports identical metrics.
preds = predict_model(tuned_lgbc, data=test_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9541,0.9849,0.9541,0.9556,0.9543,0.9031,0.9042


## Predict

In [11]:
# Score the compare_models winner on the test frame. The returned frame carries
# the input columns plus prediction_label and prediction_score (previewed below).
pred = predict_model(best, data=test_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9541,0.9849,0.9541,0.9556,0.9543,0.9031,0.9042


In [12]:
# Preview the first rows: flux features, true `Cat`, predicted label and score.
pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,336,337,338,339,340,341,342,Cat,prediction_label,prediction_score
0,2.1406080000000003e-17,1.3652760000000001e-17,8.998306e-18,9.277905e-18,1.1337640000000001e-17,1.2368340000000001e-17,1.2105120000000001e-17,1.10484e-17,1.0058490000000001e-17,1.006457e-17,...,1.888428e-17,1.936134e-17,2.002518e-17,2.0374340000000003e-17,2.1277490000000003e-17,2.1454530000000003e-17,2.2106740000000002e-17,LM,LM,0.9998
1,5.743483e-16,5.637122e-16,5.316359e-16,5.080654e-16,5.123901e-16,5.20331e-16,5.11685e-16,4.899089e-16,4.942565e-16,5.246064e-16,...,1.732059e-16,1.707564e-16,1.708296e-16,1.701658e-16,1.769846e-16,1.812655e-16,1.934178e-16,LM,M,0.6774
2,9.540365e-16,9.370028e-16,8.942922e-16,8.675982e-16,8.753908e-16,8.860598e-16,8.863876e-16,8.799443e-16,9.082285e-16,9.587669e-16,...,8.855333e-16,9.017898e-16,9.214689e-16,9.221377e-16,9.441346e-16,9.314883e-16,9.383808e-16,M,M,0.9349
3,4.32273e-17,3.5990280000000005e-17,3.116138e-17,3.1473550000000005e-17,3.3802420000000004e-17,3.458265e-17,3.3154800000000005e-17,3.06056e-17,3.091396e-17,3.6155210000000004e-17,...,1.638334e-17,1.512725e-17,1.40843e-17,1.3167530000000001e-17,1.3205540000000002e-17,1.3594840000000002e-17,1.5231970000000003e-17,LM,LM,0.9998
4,1.134171e-15,1.104958e-15,1.022256e-15,9.641852e-16,9.621754e-16,9.662636e-16,9.512777e-16,9.316988e-16,9.557092e-16,9.853166e-16,...,1.455515e-16,1.424176e-16,1.418621e-16,1.413263e-16,1.477437e-16,1.527918e-16,1.65161e-16,M,M,0.9506


## Dashboard

In [13]:
# Open the interactive explainer dashboard for the LightGBM model created
# earlier, rendered inline in the notebook output.
dashboard(
    lgbc,
    display_format='inline',
)

Note: model_output=='probability'. For LGBMClassifier shap values normally get calculated against X_background, but paramater X_background=None, so using X instead
Generating self.shap_explainer = shap.TreeExplainer(model, X, model_output='probability', feature_perturbation='interventional')...
Note: Shap interaction values will not be available. If shap values in probability space are not necessary you can pass model_output='logodds' to get shap values in logodds without the need for a background dataset and also working shap interaction values...
Building ExplainerDashboard..
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...




Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating pred_percentiles...
Calculating predictions...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8050))
