## Train

In [1]:
import pandas as pd
from pycaret.classification import setup, ClassificationExperiment, tune_model, create_model, compare_models, predict_model, evaluate_model

In [2]:
def prepare_data(df):
    """Turn a raw catalogue frame into a flat feature table.

    The ``flux`` column (one sequence of values per row) is unpacked into one
    column per element, labelled with integer positions ``0, 1, 2, ...``.
    The identifier / stellar-parameter columns listed below, together with the
    original ``flux`` column, are removed; every other column is kept as-is and
    placed before the unpacked flux columns.

    Args:
        df: DataFrame containing at least the columns ``source_id``,
            ``spectraltype_esphs``, ``teff_gspphot``, ``logg_gspphot``,
            ``mh_gspphot`` and ``flux``.

    Returns:
        A new DataFrame sharing ``df``'s index; ``df`` itself is not modified.

    Raises:
        KeyError: if ``flux`` or any of the columns to drop is missing.
    """
    columns_to_remove = [
        'source_id',
        'spectraltype_esphs',
        'teff_gspphot',
        'logg_gspphot',
        'mh_gspphot',
        'flux',
    ]

    # One row per source, one column per flux element; keep the original index
    # so the column-wise concat below aligns row-for-row.
    flux_wide = pd.DataFrame(df['flux'].tolist(), index=df.index)
    kept_columns = df.drop(columns=columns_to_remove)

    return pd.concat([kept_columns, flux_wide], axis=1)

In [3]:
# Training split: read the raw parquet file, then flatten it into a feature table.
train_data = pd.read_parquet('../../../data/Gaia DR3/train.parquet')
train_df = prepare_data(train_data)

# Held-out split: same treatment, so both frames end up with identical columns.
test_data = pd.read_parquet('../../../data/Gaia DR3/test.parquet')
test_df = prepare_data(test_data)

In [4]:
# Sanity-check the prepared training frame: row count, column count, dtypes and memory.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14101 entries, 7135 to 6174
Columns: 344 entries, Cat to 342
dtypes: float32(343), object(1)
memory usage: 18.7+ MB


In [5]:
# Same check for the held-out frame; its column layout should match the training frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3526 entries, 12306 to 14892
Columns: 344 entries, Cat to 342
dtypes: float32(343), object(1)
memory usage: 4.7+ MB


In [6]:
# Initialise PyCaret through the module-level (functional) API with `Cat` as the
# target and a fixed session_id. train_size is not passed, so the split reported
# in the output below (9870 train / 4231 test rows) is PyCaret's default.
s = setup(train_df, target='Cat', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Cat
2,Target type,Binary
3,Target mapping,"LM: 0, M: 1"
4,Original data shape,"(14101, 344)"
5,Transformed data shape,"(14101, 344)"
6,Transformed train set shape,"(9870, 344)"
7,Transformed test set shape,"(4231, 344)"
8,Numeric features,343
9,Preprocess,True


In [7]:
# Second experiment via the object-oriented API, on the same data but keeping 99%
# of the rows for training (train_size=0.99).
# NOTE(review): every later cell calls the module-level compare_models /
# create_model / tune_model / predict_model instead of exp.<method>(...), so this
# experiment does not appear to be used by anything that follows — presumably those
# calls run against the functional setup() above (9870 / 4231 split). Confirm, then
# either switch the later cells to exp.<method>(...) or remove this cell.
exp = ClassificationExperiment()
exp.setup(train_df, target='Cat', session_id=123, train_size=0.99)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Cat
2,Target type,Binary
3,Target mapping,"LM: 0, M: 1"
4,Original data shape,"(14101, 344)"
5,Transformed data shape,"(14101, 344)"
6,Transformed train set shape,"(13959, 344)"
7,Transformed test set shape,"(142, 344)"
8,Numeric features,343
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x2a72c7d1480>

In [8]:
# Cross-validate the candidate classifiers and keep the top-ranked one as `best`.
# NOTE(review): in the table below lr, dt, svm, ridge and rf all report identical
# scores with AUC 0.5 and Kappa 0, i.e. they look like constant (majority-class)
# predictors, while lightgbm / xgboost / knn reach ~0.95 accuracy. Presumably a
# feature-scale problem (e.g. very small raw flux values) — TODO confirm by
# inspecting the feature ranges and consider enabling normalisation in setup().
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9554,0.985,0.9554,0.9569,0.9557,0.906,0.907,2.108
xgboost,Extreme Gradient Boosting,0.9539,0.9848,0.9539,0.9552,0.9541,0.9027,0.9035,1.385
knn,K Neighbors Classifier,0.9489,0.9762,0.9489,0.9503,0.9492,0.8922,0.893,0.49
lda,Linear Discriminant Analysis,0.7453,0.9457,0.7453,0.7932,0.7077,0.3786,0.4598,0.184
nb,Naive Bayes,0.6499,0.9313,0.6499,0.6954,0.5481,0.0919,0.1787,0.121
lr,Logistic Regression,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.78
dt,Decision Tree Classifier,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.155
svm,SVM - Linear Kernel,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.162
ridge,Ridge Classifier,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,0.145
rf,Random Forest Classifier,0.6255,0.5,0.6255,0.3913,0.4814,0.0,0.0,2.564


  master_display_.apply(


In [9]:
# Open PyCaret's interactive plot selector for the chosen model (rendered as a widget).
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Predict

In [10]:
# Score the selected model on the separate test parquet split; the metrics table
# is displayed below and the per-row predictions are returned in `pred`.
pred = predict_model(best, data=test_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9555,0.9867,0.9555,0.9573,0.9557,0.9062,0.9075


In [None]:
# Preview the first rows of the prediction frame.
# NOTE(review): this cell has no execution count or stored output in the saved notebook.
pred.head()

## Tune model

In [10]:
# Fit a baseline LightGBM classifier with cross-validation (first fold table below) ...
lgbc = create_model('lightgbm')
# ... then search its hyperparameters using the Optuna backend (second fold table below).
tuned_lgbc = tune_model(lgbc, search_library='optuna')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9564,0.9859,0.9564,0.958,0.9567,0.9081,0.9092
1,0.9402,0.9781,0.9402,0.9422,0.9406,0.874,0.8753
2,0.9595,0.9896,0.9595,0.9614,0.9597,0.9146,0.9161
3,0.9554,0.9828,0.9554,0.9571,0.9557,0.906,0.9072
4,0.9574,0.9836,0.9574,0.9585,0.9576,0.9101,0.9108
5,0.9574,0.9845,0.9574,0.9591,0.9577,0.9104,0.9116
6,0.9615,0.9858,0.9615,0.9627,0.9617,0.9187,0.9196
7,0.9595,0.9865,0.9595,0.9616,0.9597,0.9148,0.9165
8,0.9574,0.9885,0.9574,0.9583,0.9576,0.91,0.9105
9,0.9493,0.9843,0.9493,0.9501,0.9495,0.8927,0.8931


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9574,0.9859,0.9574,0.9591,0.9577,0.9103,0.9115
1,0.9433,0.9772,0.9433,0.9446,0.9435,0.8801,0.8809
2,0.9605,0.9881,0.9605,0.963,0.9608,0.917,0.919
3,0.9483,0.9798,0.9483,0.95,0.9486,0.891,0.8921
4,0.9554,0.9828,0.9554,0.9561,0.9556,0.9056,0.906
5,0.9585,0.9847,0.9585,0.96,0.9587,0.9125,0.9136
6,0.9635,0.9852,0.9635,0.9645,0.9637,0.9229,0.9236
7,0.9615,0.9877,0.9615,0.9636,0.9618,0.9191,0.9207
8,0.9595,0.9886,0.9595,0.9602,0.9596,0.9142,0.9146
9,0.9493,0.9832,0.9493,0.9501,0.9495,0.8927,0.8931


In [12]:
# Score the tuned model on the same test frame so it can be compared with the
# untuned result from the "Predict" section.
preds = predict_model(tuned_lgbc, data=test_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.956,0.9866,0.956,0.9574,0.9563,0.9072,0.9081


