# Sex Prediction with `TPOT`

In [1]:
from pathlib import Path

from sklearn.exceptions import NotFittedError
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier
from tpot import decorators

from brainprint.atlas import Atlas
from brainprint.protocol import Protocol
from brainprint.recon_all.execution_configuration import ExecutionConfiguration
from brainprint.recon_all.results import ReconAllResults
# Override the evaluation time limit (in seconds) defined in `tpot.decorators`, to reduce
# `stopit.utils.TimeoutException` interruptions during the search.
# NOTE(review): this sets the limit rather than removing it -- the captured output of the
# training cell still logs "Code block execution exceeded 600 seconds timeout".
decorators.MAX_EVAL_SECS = 600

In [2]:
# Recon-all execution configurations whose results are compared in this notebook.
configurations = [
    getattr(ExecutionConfiguration, member_name)
    for member_name in (
        "DEFAULT",
        "T2",
        "FLAIR",
        "MPRAGE_AND_3T_AND_FLAIR",
        "MPRAGE_AND_3T_AND_T2",
    )
]

# Load the recon-all results for the selected configurations, using the Destrieux atlas
# and the base protocol.
# NOTE(review): `completed_only=True` presumably keeps only scans that have results for
# every listed configuration (the load log reports runs "with all 5" configurations) --
# confirm against `ReconAllResults`.
results = ReconAllResults(
    configuration=configurations,
    atlas=Atlas.DESTRIEUX,
    protocol=Protocol.BASE,
    completed_only=True,
    multi_only=False,
    questionnaire_only=False,
)

[2022-08-02 09:32:21,770] [3mSuccessfully read [1m5403[22m recon-all execution results from /home/zvi/Projects/brainprint/data/results.csv.[23m
[2022-08-02 09:32:21,790] [3mSuccessfully read [1m5403[22m recon-all execution configurations from /home/zvi/Projects/brainprint/data/configurations.csv.[23m
[2022-08-02 09:32:21,834] [3mSuccessfully read [1m5403[22m scan research context and metadata from /home/zvi/Projects/brainprint/data/context.csv.[23m
[2022-08-02 09:32:22,238] [44mFiltering 5403 recon-all results[0m
[2022-08-02 09:32:22,240] [1m2919[22m/5403 recon-all results matching [95m['The Base (corrected)'][0m detected.
[2022-08-02 09:32:22,244] [1m2221[22m/2919 recon-all results matching [96m['Default', 'T2', 'FLAIR', 'FLAIR + MPRAGE + 3T', 'T2 + MPRAGE + 3T'][0m detected.
[2022-08-02 09:32:22,251] Successfully selected [1m1520 runs[22m from [1m304 scans[22m with all [3m5[23m recon-all execution configuration results.
[2022-08-02 09:32:22,254] [1m[92m1

In [None]:
# Keyword arguments shared by every `TPOTClassifier` created in this notebook.
TPOT_CONFIGURATION = dict(
    generations=40,
    population_size=100,
    # Shuffled, seeded 5-fold stratified cross-validation.
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    max_time_mins=6 * 60,  # Six hours.
    random_state=0,
    verbosity=2,
    use_dask=True,
    scoring="roc_auc",
    n_jobs=12,
    periodic_checkpoint_folder="/home/zvi/Projects/brainprint/src/brainprint/recon_all/sex/tpot/checkpoints",
    warm_start=True,
)

# Metrics requested from `results.split` when building the feature matrices.
METRICS = ["Average Thickness", "Gray Matter Volume", "Surface Area"]
# Metrics currently excluded from the feature set:
#   'Thickness StdDev', 'Folding Index', 'Integrated Rectified Gaussian Curvature',
#   'Integrated Rectified Mean Curvature', 'Intrinsic Curvature Index'
# Test-set score of the best pipeline found for each execution configuration.
scores = {}
# One encoder is shared by all configurations so that the sex labels are mapped to the
# same 0/1 codes in every run.
encoder = LabelEncoder()

# Make sure the directories receiving the log files and the exported pipelines exist
# before any search starts, so that a missing folder cannot fail a run (or discard its
# result at export time) after hours of optimization.
for output_dir in (Path("./tpot/logs"), Path("./tpot/pipelines")):
    output_dir.mkdir(parents=True, exist_ok=True)

# Root folder for periodic checkpoints (if one is configured); every run gets its own
# sub-folder so that checkpoints of different configurations are not mixed together.
checkpoint_root = TPOT_CONFIGURATION.get("periodic_checkpoint_folder")

for execution_configuration in results.configuration:
    # Seeded 80/20 train/test split of the selected metrics for this configuration.
    X_train, X_test, y_train, y_test = results.split(
        execution_configuration=execution_configuration,
        metrics=METRICS,
        single_mode="last",
        test_size=0.2,
        random_state=0,
    )

    # Encode sex as 0 or 1. The encoder is fitted on the first training target vector
    # only and the same encoding is applied to all subsequent target vectors.
    try:
        y_train = encoder.transform(y_train)
    except NotFittedError:
        y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Train a classifier for the current execution configuration.
    run_id = f"TPOT_{execution_configuration.name}"
    tpot_kwargs = dict(TPOT_CONFIGURATION)
    if checkpoint_root is not None:
        tpot_kwargs["periodic_checkpoint_folder"] = str(Path(checkpoint_root) / run_id)
    classifier = TPOTClassifier(log_file=f"./tpot/logs/{run_id}.log", **tpot_kwargs)
    classifier.fit(X_train, y_train)

    # Report train and test scores; only the test score is kept in `scores`.
    train_score = classifier.score(X_train, y_train)
    test_score = classifier.score(X_test, y_test)
    scores[execution_configuration] = test_score
    print(f"Train score for {execution_configuration}: {train_score}")
    print(f"Test score for {execution_configuration}: {test_score}\n\n\n")

    # Export the best pipeline as a standalone Python script.
    classifier.export(f"./tpot/pipelines/{run_id}.py")

[2022-08-02 09:32:22,305] Successfully selected [1m830[22m runs from a total of [1m166[22m scans belonging to [1m166[22m subjects.
[2022-08-02 09:32:22,307] [44mFiltering 1518 recon-all results[0m
[2022-08-02 09:32:22,310] [1m304[22m/1518 recon-all results matching [96m['Default'][0m detected.
[2022-08-02 09:32:22,312] Successfully selected [1m132[22m/[1m304[22m runs from a total of [1m132[22m scans belonging to [1m132[22m subjects.
[2022-08-02 09:32:22,315] [1m[92m132[0m[22m/1518 recon-all runs selected.
[2022-08-02 09:32:22,317] [44mFiltering 1518 recon-all results[0m
[2022-08-02 09:32:22,320] [1m304[22m/1518 recon-all results matching [96m['Default'][0m detected.
[2022-08-02 09:32:22,322] Successfully selected [1m34[22m/[1m304[22m runs from a total of [1m34[22m scans belonging to [1m34[22m subjects.
[2022-08-02 09:32:22,324] [1m[92m34[0m[22m/1518 recon-all runs selected.


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: LogisticRegression(RobustScaler(RobustScaler(BernoulliNB(OneHotEncoder(SGDClassifier(SelectPercentile(SGDClassifier(input_matrix, alpha=0.001, eta0=0.1, fit_intercept=True, l1_ratio=1.0, learning_rate=invscaling, loss=perceptron, penalty=elasticnet, power_t=0.1), percentile=80), alpha=0.0, eta0=1.0, fit_intercept=True, l1_ratio=1.0, learning_rate=invscaling, loss=hinge, penalty=elasticnet, power_t=0.5), minimum_fraction=0.05, sparse=False, threshold=10), alpha=10.0, fit_prior=True))), C=25.0, dual=False, penalty=l2)
Train score for ExecutionConfiguration.DEFAULT: 1.0
Test score for ExecutionConfiguration.DEFAULT: 0.8385964912280701



[2022-08-02 16:13:42,361] Successfully selected [1m830[22m runs from a total of [1m166[22m scans belonging to [1m166[22m subjects.
[2022-08-02 16:13:42,362] [44mFiltering 1518 recon-all results[0m
[2022-08-02 16:13:42,365] [1m304[22m/1518 recon-all results matching [96m['T2'][0m detected.
[2022-08-02 16:13:42,367] Successfully 

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: MLPClassifier(PCA(MinMaxScaler(input_matrix), iterated_power=9, svd_solver=randomized), alpha=0.0001, learning_rate_init=0.001)
Train score for ExecutionConfiguration.T2: 1.0
Test score for ExecutionConfiguration.T2: 0.8456140350877193



[2022-08-02 22:42:00,913] Successfully selected [1m830[22m runs from a total of [1m166[22m scans belonging to [1m166[22m subjects.
[2022-08-02 22:42:00,915] [44mFiltering 1518 recon-all results[0m
[2022-08-02 22:42:00,917] [1m304[22m/1518 recon-all results matching [96m['FLAIR'][0m detected.
[2022-08-02 22:42:00,919] Successfully selected [1m132[22m/[1m304[22m runs from a total of [1m132[22m scans belonging to [1m132[22m subjects.
[2022-08-02 22:42:00,922] [1m[92m132[0m[22m/1518 recon-all runs selected.
[2022-08-02 22:42:00,924] [44mFiltering 1518 recon-all results[0m
[2022-08-02 22:42:00,927] [1m304[22m/1518 recon-all results matching [96m['FLAIR'][0m detected.
[2022-08-02 22:42:00,930] Successfully selec

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: LinearSVC(DecisionTreeClassifier(StandardScaler(StandardScaler(RFE(input_matrix, criterion=entropy, max_features=0.55, n_estimators=100, step=1.0))), criterion=gini, max_depth=3, min_samples_leaf=5, min_samples_split=13), C=0.01, dual=True, loss=hinge, penalty=l2, tol=0.01)
Train score for ExecutionConfiguration.FLAIR: 1.0
Test score for ExecutionConfiguration.FLAIR: 0.7649122807017544



[2022-08-03 04:49:48,168] Successfully selected [1m830[22m runs from a total of [1m166[22m scans belonging to [1m166[22m subjects.
[2022-08-03 04:49:48,169] [44mFiltering 1518 recon-all results[0m
[2022-08-03 04:49:48,171] [1m303[22m/1518 recon-all results matching [96m['FLAIR + MPRAGE + 3T'][0m detected.
[2022-08-03 04:49:48,175] Successfully selected [1m132[22m/[1m303[22m runs from a total of [1m132[22m scans belonging to [1m132[22m subjects.
[2022-08-03 04:49:48,177] [1m[92m132[0m[22m/1518 recon-all runs selected.
[2022-08-03 04:49:48,180] [44mFiltering 1518

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: XGBClassifier(MaxAbsScaler(RobustScaler(VarianceThreshold(VarianceThreshold(input_matrix, threshold=0.001), threshold=0.0005))), learning_rate=0.1, max_depth=10, min_child_weight=5, n_estimators=100, n_jobs=1, subsample=0.6500000000000001, verbosity=0)
Train score for ExecutionConfiguration.MPRAGE_AND_3T_AND_FLAIR: 1.0
Test score for ExecutionConfiguration.MPRAGE_AND_3T_AND_FLAIR: 0.7298245614035088



[2022-08-03 10:53:09,954] Successfully selected [1m830[22m runs from a total of [1m166[22m scans belonging to [1m166[22m subjects.
[2022-08-03 10:53:09,957] [44mFiltering 1518 recon-all results[0m
[2022-08-03 10:53:09,959] [1m303[22m/1518 recon-all results matching [96m['T2 + MPRAGE + 3T'][0m detected.
[2022-08-03 10:53:09,963] Successfully selected [1m132[22m/[1m303[22m runs from a total of [1m132[22m scans belonging to [1m132[22m subjects.
[2022-08-03 10:53:09,965] [1m[92m132[0m[22m/1518 recon-all runs selected.
[2022-08-03 10:53:09,967] [44mFil

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]

[2022-08-03 12:51:13,736] Code block execution exceeded 600 seconds timeout
Traceback (most recent call last):
  File "/home/zvi/Projects/brainprint/venv/lib/python3.9/site-packages/stopit/utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "/home/zvi/Projects/tpot/tpot/decorators.py", line 57, in time_limited_call
    func(*args)
  File "/home/zvi/Projects/brainprint/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/zvi/Projects/brainprint/venv/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
    trees = Parallel(
  File "/home/zvi/Projects/brainprint/venv/lib/python3.9/site-packages/joblib/parallel.py", line 1046, in __call__
    while self.dispatch_one_batch(iterator):
  File "/home/zvi/Projects/brainprint/venv/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/zvi/Projec