## Loading data

In [1]:
import joblib as jb

# loading training data: ADNI
X_train, y_train = jb.load("D:/Projetos Pessoais/ADPrediction/paper_code_and_files/saved_files/training_data.pkl")

# loading testing data: Centro de Diagnóstico por Imagem (CDI)
X_test, y_test = jb.load("D:/Projetos Pessoais/ADPrediction/paper_code_and_files/saved_files/testing_data.pkl")

In [2]:
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)

X_train.shape: (200, 176664)
X_test.shape: (192, 176664)


## Training and tuning hyperparameters of the model

In [3]:
from skopt.space import Real, Categorical
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# param_distribution for BayesSearch
param_distributions = {"LR__penalty": Categorical(["l1", "l2"]),
                      "LR__C": Real(1e-1, 1e3, prior="log-uniform"),
                      "PCA__n_components": Real(.5, .95)}

# pipeline with PCA and Logistic Regression
pipe = Pipeline([("PCA", PCA(whiten = True)),
                 ("LR", LogisticRegression(solver = "saga", n_jobs = -1, max_iter = 5000))])

# fitting GridSearch and saving the results
bs = BayesSearchCV(pipe, param_distributions, scoring = "f1", n_iter=50, verbose=10)
bs.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409
[CV 1/5; 1/1] END LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409;, score=0.919 total time=  15.5s
[CV 2/5; 1/1] START LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409
[CV 2/5; 1/1] END LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409;, score=0.833 total time=  15.4s
[CV 3/5; 1/1] START LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409
[CV 3/5; 1/1] END LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409;, score=0.778 total time=  12.9s
[CV 4/5; 1/1] START LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409
[CV 4/5; 1/1] END LR__C=0.146465666852577, LR__penalty=l2, PCA__n_components=0.7149920106857409;, score=0.821 total time=  13.5s
[CV 5/5; 1/1] START LR__C=0.14646



[CV 2/5; 1/1] END LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708;, score=0.789 total time=  12.3s
[CV 3/5; 1/1] START LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708
[CV 3/5; 1/1] END LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708;, score=0.818 total time=  12.8s
[CV 4/5; 1/1] START LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708
[CV 4/5; 1/1] END LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708;, score=0.762 total time=  12.7s
[CV 5/5; 1/1] START LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708
[CV 5/5; 1/1] END LR__C=1000.0, LR__penalty=l1, PCA__n_components=0.750274184723708;, score=1.000 total time=  12.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START LR__C=1.4288484094905765, LR__penalty=l1, PCA__n_components=0.95
[CV 1/5; 1/1] END LR__C=1.4288484094905765, LR__penalty=l1, PCA__n_components=0.95;, score=0.923 total time=  11.8s
[CV 2/5; 1/1] 

How does it perform on testing data?

In [4]:
from sklearn.metrics import classification_report

print(classification_report(y_test, bs.predict(X_test), digits=4))

              precision    recall  f1-score   support

           0     0.8969    0.8700    0.8832       100
           1     0.8632    0.8913    0.8770        92

    accuracy                         0.8802       192
   macro avg     0.8800    0.8807    0.8801       192
weighted avg     0.8807    0.8802    0.8803       192



Saving the final model for prediction:

In [5]:
jb.dump(bs, "bs_fitted.pkl")

['bs_fitted.pkl']