# export data

In [81]:
import pandas as pd
import numpy as np

aseg_data = pd.read_csv('../dataset/final_data/aseg+DKT.stats980.csv')
aseg_data.head()

Unnamed: 0,Group,Age,Sex,Volume_mm30,normMean0,normStdDev0,normMin0,normMax0,Volume_mm31,normMean1,...,Volume_mm398,normMean98,normStdDev98,normMin98,normMax98,Volume_mm399,normMean99,normStdDev99,normMin99,normMax99
0,AD,78,M,239618.359,104.5263,8.7939,23.0,133.0,19019.868,21.8726,...,617.654,78.3675,12.5697,49.0,101.0,4935.454,70.7624,9.5176,42.0,95.0
1,AD,66,M,244462.62,104.4633,9.0946,19.0,135.0,52376.076,18.9785,...,866.656,81.6065,11.9081,49.0,106.0,5642.185,71.9476,9.2368,45.0,97.0
2,AD,77,M,236413.264,104.6093,8.0282,28.0,133.0,16591.926,27.7462,...,928.867,84.8252,10.9218,55.0,101.0,5959.229,76.6012,8.9234,50.0,99.0
3,AD,73,M,227601.449,104.3662,8.7423,29.0,132.0,19109.596,20.5128,...,576.647,80.354,12.4069,51.0,105.0,5598.818,71.2126,8.7279,43.0,97.0
4,AD,62,M,220511.415,104.5355,7.3271,40.0,129.0,4690.35,36.5242,...,724.538,86.711,10.1832,61.0,103.0,5327.887,76.5429,7.5031,51.0,96.0


In [84]:
aseg_X = aseg_data.iloc[:, 3:]
aseg_y = aseg_data.iloc[:, 0]
X, y = aseg_X.to_numpy(), aseg_y.to_numpy()

In [85]:
X[:5 ,:5]

array([[2.39618359e+05, 1.04526300e+02, 8.79390000e+00, 2.30000000e+01,
        1.33000000e+02],
       [2.44462620e+05, 1.04463300e+02, 9.09460000e+00, 1.90000000e+01,
        1.35000000e+02],
       [2.36413264e+05, 1.04609300e+02, 8.02820000e+00, 2.80000000e+01,
        1.33000000e+02],
       [2.27601449e+05, 1.04366200e+02, 8.74230000e+00, 2.90000000e+01,
        1.32000000e+02],
       [2.20511415e+05, 1.04535500e+02, 7.32710000e+00, 4.00000000e+01,
        1.29000000e+02]])

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=42)

# SVC

## train and test

we use the grid search to find the best hyperparameters for the SVC.

In [40]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf', 'poly','sigmoid'],
    'svc__random_state': [42]
}
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC())
])
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

In [41]:
grid_search.fit(X, y)

In [42]:
grid_search.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_svc__C', 'param_svc__kernel', 'param_svc__random_state', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [43]:
grid_search.best_score_

0.7275510204081632

In [44]:
grid_search.cv_results_['mean_test_score']

array([0.62755102, 0.48673469, 0.48979592, 0.48673469, 0.62346939,
       0.67959184, 0.60714286, 0.54795918, 0.62346939, 0.71020408,
       0.72653061, 0.48571429, 0.62346939, 0.71020408, 0.72755102,
       0.49285714])

In [45]:
grid_search.best_estimator_

------------------

In [67]:
param_poly = {
    'svc__C': [0.1, 1, 10, 100, 1000],
    'svc__kernel': ['linear'],
}
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC())
])
grid_search = GridSearchCV(pipeline, param_poly, cv=5, n_jobs=-1)
grid_search.fit(X, y)

In [69]:
grid_search.best_score_

0.6275510204081632

In [68]:
best_params = grid_search.best_params_
best_params

{'svc__C': 0.1, 'svc__kernel': 'linear'}

we use the best parameters of linear SVC finding above, to train the model on training data and analyse the parameters.

In [70]:
from sklearn.metrics import accuracy_score

svc = SVC(C=0.1, kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6224489795918368


In [87]:
svc.classes_, svc.coef_.shape

(array(['AD', 'CN', 'MCI'], dtype=object), (3, 500))

In [89]:
# find which attributes own largest weight on AD
w_attr = zip(svc.coef_[0], aseg_X.columns)
sorted(w_attr, key=lambda x: abs(x[0]), reverse=True)

[(-0.1662841498368754, 'Volume_mm312'),
 (-0.14776540291003337, 'normStdDev87'),
 (-0.14599158430077533, 'normMax73'),
 (-0.13845883666272807, 'Volume_mm341'),
 (0.1327477144900283, 'normMax16'),
 (0.13173491157180367, 'normMax11'),
 (-0.13006225115821396, 'normMax24'),
 (0.12985747891155924, 'normMax31'),
 (0.1296254031279354, 'normMax59'),
 (-0.127231284725164, 'Volume_mm351'),
 (-0.12497232891893774, 'Volume_mm327'),
 (0.1239827161528593, 'normMin0'),
 (-0.12200985341201695, 'normMin12'),
 (0.12063714375876768, 'normMean28'),
 (-0.12054150897538729, 'normMax92'),
 (0.1189593635713109, 'Volume_mm369'),
 (-0.1170276662942751, 'normStdDev54'),
 (-0.11583067477982993, 'normMean52'),
 (0.1156160993322546, 'normStdDev50'),
 (0.11405007825061017, 'normMin20'),
 (-0.11101516199833708, 'normMean71'),
 (0.11081350457132305, 'normMin30'),
 (0.11010052845184659, 'normStdDev44'),
 (0.10982208032258972, 'normMean13'),
 (0.10807582405298174, 'normMin3'),
 (0.10751441053089134, 'normStdDev27'),
 (-

# MLP


In [114]:
# change string label to integer
np.unique(y_train)

array(['AD', 'CN', 'MCI'], dtype=object)

In [138]:
def label_str2int(y):
    y_int  = np.where(y=='AD', 0, 
                np.where(y=='CN', 1, 
                    np.where(y=='MCI', 2, None)))
    return y_int.astype(int)

y_train_int = label_str2int(y_train)
y_test_int = label_str2int(y_test)
y_int = label_str2int(y)

In [141]:
y_train_int.shape, X_train.shape, y_test_int.shape, X_test.shape

((784,), (784, 500), (196,), (196, 500))

In [143]:
import tensorflow as tf

mlp_pipeline = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(1024, activation="relu", input_shape=(500,)),
        tf.keras.layers.Dense(512, activation="relu"),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(3, activation="softmax"),
    ]
)

mlp_pipeline.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
mlp_pipeline.fit(
    X_train,
    y_train_int,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test_int),
)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.4897 - loss: 1.0607 - val_accuracy: 0.5816 - val_loss: 0.8374
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6595 - loss: 0.7212 - val_accuracy: 0.6480 - val_loss: 0.8172
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7587 - loss: 0.5302 - val_accuracy: 0.6276 - val_loss: 0.9771
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8842 - loss: 0.3164 - val_accuracy: 0.5816 - val_loss: 1.2948
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8920 - loss: 0.2959 - val_accuracy: 0.6735 - val_loss: 1.1737
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9323 - loss: 0.1919 - val_accuracy: 0.6735 - val_loss: 1.6232
Epoch 7/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7efb354b92d0>

In [148]:
np.min(mlp_pipeline.predict(X_test)), np.max(mlp_pipeline.predict(X_test))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


(1.1961978e-26, 1.0)