In [1]:
from Model_Factory import Model_Factory
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate 
from joblib import dump, load
import cross_validation, grid_search, manage_files, argparse, shutil
import Constants as constants
from enigma_data import enigma_data
import pandas as pd
from sklearn.compose import ColumnTransformer

# Initialize input values 

+ __sdx__ := Sdx classifer (3,4,5,6,7,8)
+ __egl__ := Engel classifier (0=No or 1=Yes)
+ __hmz__ := Harmonize with neuroCombat (0=No or 1=Yes)
+ __prm__ := Permutate training labels (0=No or 1=Yes)
+ __grd__ := Peform grid-search (0=No or 1=Yes)
+ __mdl__ := Classification model (DL or SV)
+ __csv_opt__ := CSV file (all, AD, FA, MD, or RD)


In [2]:
root_folder = "/Users/Kyuyeon/Desktop/kyu_csvs/"

mf = manage_files.ManageFiles(root_folder)

sdx :3
egl :0
hmz :1
prm :0
grd :1
mdl :sv
csv_opt :all


###### Parse csv data file
------------------------------------------------------------------------------
+ Run this cell when __*egl == 0*__ 
+ data.parse() := preprocessing the data

In [3]:
data = enigma_data( dfile=mf.csv_file, predict=mf.predict,predict_val = mf.predict_val, data_opt=mf.csv_opt )
data = data.parse(balance=True)

Found classes: [3 0 4 1 2]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


+ Run this cell when __*egl == 1*__ 
+ data.parse() := preprocessing the data

In [4]:
data = data.parse_engel()

ValueError: list.remove(x): x not in list

# Partition the data 
------------------------------------------------------------------------------
+ __prm == 1__ := *random shuffle train data labels by permutating*  
+ data.partition := splits the data and target data into training set and test set (training : 0.75 rate test:0.25 rate) [Spliting Data Tutorial](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [5]:
ttv_dict = data.partition( perm_train_labels=bool( mf.prm ) )

covars_Site = pd.DataFrame(ttv_dict["Covars_train"]["Site"])
covars_Sex = pd.DataFrame(ttv_dict["Covars_train"]["Sex"])
covars_Age = pd.DataFrame(ttv_dict["Covars_train"]["Age"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Site"] = train["Site"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[self.predict] = train[self.predict].astype(int)


# Define the pipeline, cross-validation, and scoring metrics
-------------------------------------------------------------------------------------------
+ scoring := evaluating as __*accuracy*__ named as "Accuracy"
+ kfold := [StratifiedKFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html) __*uniformly distributes*__ labels into test and training data K times
+ factory := Define a __*model factory*__ object
- scaler_model := building [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) scaler model
- imbalance_model := overcoming imbalanced data by __*oversampling*__


In [6]:
scoring = {"Accuracy": "accuracy"}
kfold = StratifiedKFold( n_splits=constants.CV_FOLDS, shuffle=True)
factory = Model_Factory()

scaler_model = factory.build( model_type=constants.MODELS.SCALER )
imbalance_model = factory.build( model_type=constants.MODELS.IMBALANCE )
neurocombat_model = factory.build( model_type=constants.MODELS.NEUROCOMBAT, data=data,sites= covars_Site, discrete_covariates=covars_Sex,continuous_covariates=covars_Age)

column_trans = ColumnTransformer(
     [('scaler', scaler_model,ttv_dict["X_train"].columns.values[0:-1])],
         remainder='passthrough')

# Performing grid search to find the optimal parameter for each ML model
#### Error occurs by perfomring grid search with permutated class labels (prm ==1)
---------------------------------------------------------------------------------------
### Run this Cell For SV model
+ Building a __Support Vector Machine Model__ for *Grid-Search*

In [7]:
cls_model = factory.build( model_type=constants.MODELS.SV_CLASSIFY, C=1.0, max_iter=10000 )

### Run this Cell For DL model
+ Building a __Deep Learning Model__ for *Grid_Search*

In [None]:
cls_model = factory.build( model_type=constants.MODELS.DL_CLASSIFY, input_dimension=ttv_dict['X_train'].shape[1],
                          checkpoint_folder=mf.chpt_folder, checkpoint_file="{val_accuracy:.5f}.hdf5", 
                          learn_rate=0.1, epochs=150, batch_size=20, verbose=0 )

### Grid Search with Machine Learning Model (SV or DL)
+ [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) := The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. 
+ [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) := Process of finding the optimal parameter for given classifier model

In [8]:
pipeline = Pipeline( [('scaler', column_trans ), ('neurocombat', neurocombat_model), ('classifier', cls_model ) ], memory=None )

grid = GridSearchCV( estimator=pipeline, param_grid=cls_model.get_grid_dict(), cv=kfold, scoring=scoring, refit="Accuracy", verbose=10 )


grid_result = grid.fit( ttv_dict['X_train'],ttv_dict['Y_train'])

gdr_dict = grid_search.GridSearch( cls_model, grid_result ).gdr_dict

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.640, total=   0.1s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.560, total=   0.0s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.600, total=   0.0s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.720, total=   0.0s
[CV] classifier__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s


[CV] ................ classifier__C=0.1, Accuracy=0.760, total=   0.0s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.760, total=   0.0s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.720, total=   0.0s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.480, total=   0.0s
[CV] classifier__C=0.1 ...............................................
[CV] ................ classifier__C=0.1, Accuracy=0.720, total=   0.0s
[CV] classifier__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.4s remaining:    0.0s


[CV] ................ classifier__C=0.1, Accuracy=0.750, total=   0.0s
[CV] classifier__C=0.25 ..............................................
[CV] ............... classifier__C=0.25, Accuracy=0.680, total=   0.1s
[CV] classifier__C=0.25 ..............................................
[CV] ............... classifier__C=0.25, Accuracy=0.720, total=   0.0s
[CV] classifier__C=0.25 ..............................................
[CV] ............... classifier__C=0.25, Accuracy=0.720, total=   0.1s
[CV] classifier__C=0.25 ..............................................
[CV] ............... classifier__C=0.25, Accuracy=0.720, total=   0.1s
[CV] classifier__C=0.25 ..............................................
[CV] ............... classifier__C=0.25, Accuracy=0.720, total=   0.1s
[CV] classifier__C=0.25 ..............................................
[CV] ............... classifier__C=0.25, Accuracy=0.800, total=   0.0s
[CV] classifier__C=0.25 ..............................................
[CV] .

[CV] ................ classifier__C=1.1, Accuracy=0.520, total=   0.1s
[CV] classifier__C=1.1 ...............................................
[CV] ................ classifier__C=1.1, Accuracy=0.680, total=   0.1s
[CV] classifier__C=1.1 ...............................................
[CV] ................ classifier__C=1.1, Accuracy=0.792, total=   0.2s
[CV] classifier__C=1.5 ...............................................
[CV] ................ classifier__C=1.5, Accuracy=0.600, total=   0.2s
[CV] classifier__C=1.5 ...............................................
[CV] ................ classifier__C=1.5, Accuracy=0.720, total=   0.2s
[CV] classifier__C=1.5 ...............................................
[CV] ................ classifier__C=1.5, Accuracy=0.680, total=   0.1s
[CV] classifier__C=1.5 ...............................................
[CV] ................ classifier__C=1.5, Accuracy=0.680, total=   0.2s
[CV] classifier__C=1.5 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    8.6s finished


### Dump gdr_dict into grid_result_file
--------------------------------------------------------
+ grid_result_file stores the optimal parameters from grid search with cross validation
+ __gdr_dict__ := storing the optimal parameters from grid searching

In [9]:
grid.best_score_          

0.695

In [None]:
dump( gdr_dict, mf.grid_result_file ) 

load ( mf.grid_result_file )

# Performing cross-validation when optimal parameters already exists
--------------------------------------------------------------------------------------
### Load an existing file storing the optimal parameter

In [None]:
gdr_dict = load( mf.grid_result_file )

### Run this cell for SV model
+ Building a __Support Vector Machine Model__ for *Cross-Validation*

In [None]:
cls_model = factory.build( model_type=constants.MODELS.SV_CLASSIFY, C=gdr_dict['classifier__C'], max_iter=10000 )

### Run this cell for DL model
+ Building a __Deep Learning Model__ for *Cross-Validation*

In [None]:
cls_model = factory.build( model_type=constants.MODELS.DL_CLASSIFY, input_dimension=ttv_dict['X_train'].shape[1], checkpoint_folder=mf.chpt_folder, checkpoint_file="{val_accuracy:.5f}.hdf5", learn_rate=gdr_dict['classifier__learn_rate'], hidden_units_L1=gdr_dict['classifier__hidden_units_L1'], hidden_units_L2=gdr_dict['classifier__hidden_units_L2'], l2_reg_penalty=gdr_dict['classifier__l2_reg_penalty'], drop_out_rate=gdr_dict['classifier__drop_out_rate'], epochs=gdr_dict['classifier__epochs'], batch_size=20, verbose=0 )

### Cross-Validation with Machine Learning Model (SV or DL)
+ [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) := The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. 
+ [cross_validate](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html) := performing cross validation kfold times based on classifider model that defined on Pipeline function.

In [None]:
pipeline = Pipeline( [ ('scaler', column_trans ),('neurocombat',neurocombat_model ),('classifier', cls_model ) ], memory=None )

cv_result = cross_validate( pipeline, ttv_dict['X_train'], ttv_dict['Y_train'], cv=kfold, scoring=scoring, return_estimator=True, verbose=2 )

result_dict = {}

result_dict['features'] = ttv_dict['X_train'].columns

result_dict["iteration"] = cross_validation.CrossValidation(ttv_dict, cv_result, constants.CV_FOLDS).iteration

dump( result_dict, mf.arch_file )  