# Step 0: Import libraries

In [1]:
#importing all necessary libraries for disease classification

import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.patheffects as PathEffects
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
#from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from copy import deepcopy
#from tqdm import tqdm

# Step 1: Read Data

In [48]:
#test read of csv file, read the head (first 5 rows)

df = pd.read_csv('C:\\Users\\aryaa\\data_edited_final.csv', encoding='utf-8')
df.head()

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,AB.F.gamma.j.Cz,AB.F.gamma.k.C4,AB.F.gamma.l.T4,AB.F.gamma.m.T5,AB.F.gamma.n.P3,AB.F.gamma.o.Pz,AB.F.gamma.p.P4,AB.F.gamma.q.T6,AB.F.gamma.r.O1,AB.F.gamma.s.O2
0,1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,35.192133,21.092034,...,1.890631,1.668558,1.376287,1.412812,1.611964,1.462566,1.461799,1.303006,1.500499,1.708294
1,2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,12.934426,10.559123,...,0.834433,0.861913,0.389658,0.860961,1.153993,1.287901,1.325232,1.06319,1.041001,3.031643
2,3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,29.206721,26.839849,...,1.020603,1.596309,1.416415,1.056622,1.567724,1.317114,1.264845,1.188653,1.321833,1.36655
3,4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,20.8741,21.219616,...,1.038285,1.045751,2.021776,1.554777,1.069743,0.974708,1.054286,1.329352,1.304956,1.437125
4,5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,36.949466,32.828649,...,1.113745,2.209504,3.424799,1.354361,0.945465,1.191835,2.238692,2.196915,2.022587,3.624302


Renaming the Columns for Clarity:
- XX.X.band.x.channel to band.channel
- COH.X.band.x.channel1.x.channel2 to COH.band.channel1.channel2

In [49]:
#reformat the table to have more clarity in names
def reformat_name(name):
    '''
    Reformat from XX.X.band.x.channel to band.channel or 
    COH.X.band.x.channel1.x.channel2 to COH.band.channel1.channel2
    '''
    splitted = name.split('.') #split column name
    if len(splitted) < 5:
        return name #not what we are looking for
    if splitted[0] != 'COH':
        return f'{splitted[2]}.{splitted[4]}' #reformat XX.X... label
    else:
        return '.'.join(splitted[i] for i in [0, 2, 4, 6]) #reformat COH... label

# Rename columns
df.columns = df.columns.map(reformat_name)
df


Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,delta.FP1,delta.FP2,...,gamma.Cz,gamma.C4,gamma.T4,gamma.T5,gamma.P3,gamma.Pz,gamma.P4,gamma.T6,gamma.O1,gamma.O2
0,1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,35.192133,21.092034,...,1.890631,1.668558,1.376287,1.412812,1.611964,1.462566,1.461799,1.303006,1.500499,1.708294
1,2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,12.934426,10.559123,...,0.834433,0.861913,0.389658,0.860961,1.153993,1.287901,1.325232,1.063190,1.041001,3.031643
2,3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,29.206721,26.839849,...,1.020603,1.596309,1.416415,1.056622,1.567724,1.317114,1.264845,1.188653,1.321833,1.366550
3,4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,20.874100,21.219616,...,1.038285,1.045751,2.021776,1.554777,1.069743,0.974708,1.054286,1.329352,1.304956,1.437125
4,5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,36.949466,32.828649,...,1.113745,2.209504,3.424799,1.354361,0.945465,1.191835,2.238692,2.196915,2.022587,3.624302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,22.0,2014.8.28,13.0,116.0,Healthy control,Healthy control,40.981948,35.956411,...,1.864057,1.666520,1.203566,2.323987,2.024301,1.631503,1.615084,2.457607,2.416692,5.421998
941,942,M,26.0,2014.9.19,13.0,118.0,Healthy control,Healthy control,18.402444,18.810580,...,2.581680,2.565274,2.529780,2.442812,2.926966,3.033085,3.115095,2.527747,3.250101,2.956166
942,943,M,26.0,2014.9.27,16.0,113.0,Healthy control,Healthy control,28.060732,31.604773,...,3.126649,2.320550,1.412219,0.774176,1.637416,1.517905,1.551123,1.136671,1.521568,1.085458
943,944,M,24.0,2014.9.20,13.0,107.0,Healthy control,Healthy control,19.330250,24.522453,...,1.749772,1.583983,3.515968,1.342409,3.021109,1.537497,1.788204,1.359925,1.720462,1.218808


Fixing a typo in specific.disorder:

Obsessive 'compulsitive' disorder to Obsessive compulsive disorder.

In [50]:
typo_ind = df[df['specific.disorder'] == 'Obsessive compulsitve disorder'].index
df.loc[typo_ind, 'specific.disorder'] = 'Obsessive compulsive disorder'

# Step 2: Resolving Missing data

Extract "separation" column between PSD and FC data.

In [51]:
#code is identifying a column (if any) in the DataFrame df where all values are missing and storing names in the variable sep_col
missing = df.isna().sum()
#print(missing[missing == df.shape[0]])
#sep_col = missing[missing == df.shape[0]].index[0]
#print(sep_col)
#sep_col

In [52]:
missing[missing > 0] #in which columns, how much data is missing?

education    15
IQ           13
dtype: int64

In [53]:
educ_na = df[df['education'].isna()] #find rows where 'education' is N/A
iq_na = df[df['IQ'].isna()] #find rows where IQ is N/A
educ_iq_na = pd.concat([educ_na, iq_na]).drop_duplicates() #combine and drop duplicates, then print
educ_iq_na

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,delta.FP1,delta.FP2,...,gamma.Cz,gamma.C4,gamma.T4,gamma.T5,gamma.P3,gamma.Pz,gamma.P4,gamma.T6,gamma.O1,gamma.O2
0,1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,35.192133,21.092034,...,1.890631,1.668558,1.376287,1.412812,1.611964,1.462566,1.461799,1.303006,1.500499,1.708294
17,18,M,30.0,2013.9.27,,86.0,Addictive disorder,Alcohol use disorder,11.971,12.030309,...,0.865807,1.09489,0.88784,0.696181,0.703831,0.742464,0.736746,0.731848,0.695234,0.721103
21,22,M,20.0,2014.10.23,,116.0,Addictive disorder,Alcohol use disorder,27.572857,21.776925,...,2.227335,1.76023,1.478228,1.634994,2.09732,2.089506,2.33694,2.836769,7.604057,2.337884
155,156,M,18.0,2013.12.7,,103.0,Addictive disorder,Behavioral addiction disorder,16.003504,18.456909,...,0.189279,0.136433,0.078477,0.103476,0.159791,0.218304,0.113193,0.07214,0.109601,0.079335
173,174,M,23.0,2015.11.14,,113.0,Healthy control,Healthy control,18.464765,22.629896,...,3.092438,2.901582,2.379274,3.049556,3.168234,3.094052,2.986871,2.81164,4.288419,3.691071
269,270,M,25.0,2015.9.23,,85.0,Obsessive compulsive disorder,Obsessive compulsive disorder,5.854639,6.756923,...,2.624944,2.085884,1.649951,0.484128,0.916479,1.088369,0.831028,0.663967,0.953392,0.909372
270,271,M,34.0,2015.9.21,,120.0,Obsessive compulsive disorder,Obsessive compulsive disorder,12.306134,15.388163,...,0.819246,0.62565,0.489418,0.445003,0.627814,0.729857,0.927179,0.90746,1.020015,2.964852
279,280,M,35.0,2016.6.2,,,Obsessive compulsive disorder,Obsessive compulsive disorder,20.902034,21.594917,...,1.208928,1.141186,1.036164,1.681099,1.120896,1.018877,0.94086,0.966858,2.401453,1.288047
280,281,M,37.0,2016.6.27,,110.0,Obsessive compulsive disorder,Obsessive compulsive disorder,11.507979,10.923164,...,2.628747,5.54713,2.187832,1.432453,2.244837,2.400614,2.444512,1.98514,3.346272,2.535555
281,282,M,22.0,2016.6.30,,107.0,Obsessive compulsive disorder,Obsessive compulsive disorder,12.042707,9.710469,...,0.171675,0.372893,0.250771,0.077557,0.104466,0.124866,0.140702,0.110034,0.104334,0.122191


In [55]:
#code provides a comparison of missing value percentages across different categories of the 'main.disorder' 
#column between the entire dataset and the subset where either 'education' or 'IQ' is missing.

drop_md = educ_iq_na['main.disorder'].value_counts().sort_index() # Calculate the frequency count of each category in the 'main.disorder' column for missing education data

# Calculate the frequency count of each category in the 'main.disorder' column for all data
# Select only those categories present in the missing data
all_md = df['main.disorder'].value_counts().sort_index()[drop_md.index]

# Concatenate the frequency counts of all data and missing data
# Calculate the percentage of missing values for each category
pd.concat([all_md, drop_md/all_md * 100], axis=1).set_axis(['all_data', 'na_percentage'], axis=1).sort_values('na_percentage', ascending=False)

Unnamed: 0_level_0,all_data,na_percentage
main.disorder,Unnamed: 1_level_1,Unnamed: 2_level_1
Obsessive compulsive disorder,46,13.043478
Addictive disorder,186,4.301075
Trauma and stress related disorder,128,3.90625
Healthy control,95,2.105263
Mood disorder,266,1.503759
Anxiety disorder,107,0.934579


We will lose too much data (13%) on patients with obsessive compulsive disorder, which is one of the smallest groups already. Hence, we should consider filling missing data.  
Our options to fill them are:
- special value
- mean/median value
- imputation

Therefore, fill in missing data with **median values**.

In [56]:
# Display the mean and median values for the 'education' and 'IQ' columns in the DataFrame 'df'
display(df[['education', 'IQ']].agg(['mean', 'median']))

# Create an instance of SimpleImputer with strategy='median'
imputer = SimpleImputer(strategy='median')

Unnamed: 0,education,IQ
mean,13.43871,101.580472
median,13.0,102.0


# Step 3: Pre-processing
Preparing Data for classification:

1. Drop separation column, id colums (no. and eeg.date)
2. Encode target (main.disorder and specific.disorder) and categorical (sex) varables 
3. Drop target variables from features
4. Fill missing data in education and IQ columns
5. Apply log transformation to Age, PSD and FC features
6. Separate features and targets into separate subset for binary classification (disorder vs. healthy control)
7. Scale features

In [57]:
# Step 1: Drop unnecessary columns
X = df.drop(['no.', 'eeg.date'], axis=1).copy(deep=True)

# Step 2: Define target and categorical columns
target_col = ['main.disorder', 'specific.disorder'] #target cols for clasification
cat_col = ['sex', 'main.disorder','specific.disorder'] #categorical columns (not numbers)

# Identify unique disorders
md_unique = df['main.disorder'].unique()
sd_unique = df['specific.disorder'].unique()

# Ensure 'Healthy control' is encoded as zero
hc = 'Healthy control'

# Step 2: Encode categorical variables
#Filtering out the 'Healthy control' category from the unique disorders, storing only the disorders of interest.
md = md_unique[md_unique != hc]
sd = sd_unique[sd_unique != hc]
md_ord = np.insert(md, 0, hc) #encoding 'healthy control' as 0
sd_ord = np.insert(sd, 0, hc)
sex_ord = df['sex'].unique()

#transforming 'X' df to extract targets, filling in missing values and transforming certain values with log-transformation.
#'X' will be all the features, while md/sd_target are the targets to train/test models.

# encoder instantiation for targets and sex
enc = OrdinalEncoder(categories=[sex_ord, md_ord, sd_ord])
X[cat_col] = enc.fit_transform(X[cat_col])

# save targets
md_target = X['main.disorder']
sd_target = X['specific.disorder']

# Step 3: Drop targets
X.drop(target_col, axis=1, inplace=True)

# Step 4: Fill in blanks with median
mv_cols = ['education', 'IQ']
X[mv_cols] = imputer.fit_transform(X[mv_cols])

# Step 5: log-transformation
logtrans_cols = X.drop(['sex', 'education', 'IQ'], axis=1).columns
X[logtrans_cols] = np.log(X[logtrans_cols])

In [58]:
#Step 6: Separate to binary targets
def sep_to_bin(features, target, target_ord, disorders, hc_id=0):
    assert len(features) == len(target)
    X = dict()
    Y = dict()
    for disorder in disorders:
        # find how target was coded
        disorder_id = np.where(target_ord == disorder)[0][0]
        # extract targets
        y = target[target.isin([hc_id, disorder_id])]
        y[y != hc_id] = 1
        # features
        x = features.loc[y.index]
        # save
        X[disorder] = x
        Y[disorder] = y
    return X, Y

#validate feature division is correct
def min_validation(features, target_name, disorders, hc_name='Healthy control'):
    # count number of rows per each target value
    counts = df[target_name].value_counts()
    # initialize result
    result = np.zeros(len(disorders), dtype='bool')
    for i, disorder in enumerate(disorders):
        # calculate size of dataset: disorder + healthy control
        hc_disorder_count = counts[hc_name] + counts[disorder]
        # update result
        result[i] = len(features[disorder]) == hc_disorder_count
    return result

# separate to binary classification subsets
Xmd, Ymd = sep_to_bin(X, md_target, md_ord, md)
Xsd, Ysd = sep_to_bin(X, sd_target, sd_ord, sd)

# validate length of subsets
assert min_validation(Xmd, 'main.disorder', md).all()
assert min_validation(Xsd, 'specific.disorder', sd).all()

# Step 4: Model Classification
What algorithm will work best?  
We will use cross-validation with 10 folds to choose the best algorithm for classification for each disorder.

Algorithms to consider:
- Logistic Regression, ElasticNet Penalty
- XGBoost

In [97]:
def lists_to_dict(keys, items):
    return {key: item for key, item in zip(keys, items)}

# Models names
model_names = ['XGB', 'LogisticRegression']

# Param grids for gridsearch
param_grids = [
    {
        'n_estimators': [100, 300, 500],
        'subsample': [0.3, 0.5, 1],
        'max_depth': [1, 3, 6, None]
    },
    {
        'l1_ratio': np.linspace(0, 1, 5),
        'C': [0.5, 1, 5, 10]
    }
]

param_grids = lists_to_dict(model_names, param_grids)

# Models for gridsearch
models = [
    XGBClassifier(),  # Tree method is set to 'auto' by default
    LogisticRegression(penalty='elasticnet', solver='saga')
]

models = lists_to_dict(model_names, models)


In [98]:
def n_best(gs_res, n=1):
    """Returns nth best estimator parameters, mean score and std of it"""
    # Find index of nth best estimator
    ind = np.where(gs_res['rank_test_score'] == n)[0][0]
    # Extract mean score and std
    mu = gs_res["mean_test_score"][ind]
    std = gs_res["std_test_score"][ind]
    # Extract parameters
    params = gs_res["params"][ind]
    return params, mu, std

def cache_mkdir(cache, directory, root_dir='.'):
    """Create directory and return path to it"""
    # Replace spaces with underscores
    directory = directory.replace(' ', '_')
    # Define path to directory
    path = os.path.join(root_dir, directory)
    # Create directory if it doesn't exist
    if cache and not os.path.exists(path):
        os.makedirs(path)
    return path

def read_cache(cache, path, silent=False):
    """Read from cache file"""
    result = None
    # Check if cache is enabled and cache file exists
    if cache and os.path.exists(path):
        with open(path, 'rb') as file:
            # Load cached object
            if not silent:
                print(f'Extracted from cache ({path})')
            result = pickle.load(file)
    return result

def write_cache(cache, obj, path):
    """Write to cache"""
    if cache:
        # Write object to cache file
        with open(path, 'wb') as file:
            pickle.dump(obj, file)

def grid_search(disorders, models, params, Xs, Ys, random_seed=None, cache=True, cache_dir='grid_search', feature_list=None, silent=False):
    # Result list to store grid search results
    result = []
    # Set feature list if None
    if feature_list is None:
        feature_list = X.columns
    # Create cache directory if needed
    cache_mkdir(cache, cache_dir)
    for disorder in disorders:
        # Get and create (if needed) a disorder cache directory
        disorder_folder = cache_mkdir(cache, disorder, cache_dir)
        for key in models.keys():
            print(disorder, key)
            # Get cache file path
            cache_file = os.path.join(disorder_folder, key)
            # Read file from cache
            res = read_cache(cache, cache_file, silent)
            if res is None:
                res = {}
                res['disorder'] = disorder
                res['algorithm'] = key
                # Set seed if provided
                if random_seed is not None:
                    np.random.seed(random_seed)
                # Step 7: Scale features
                scaler = StandardScaler()
                x = scaler.fit_transform(Xs[disorder][feature_list])
                # Grid search
                gs = GridSearchCV(models[key], params[key], cv=10, scoring='roc_auc', n_jobs=-1, verbose=1).fit(x, Ys[disorder])
                # Best estimator parameters, cross validation mean score and score std
                res['params'], res['mean_score'], res['std_score'] = n_best(gs.cv_results_)
                # Save cv_results_
                res['cv_result'] = deepcopy(gs.cv_results_)
                # Cache results
                write_cache(cache, res, cache_file)
            result.append(res)
    return result


In [99]:
import warnings
warnings.filterwarnings('ignore')

## Main disorder: Cell below runs 2 models

In [100]:
md_res_dict = grid_search(md, models, param_grids, Xmd, Ymd, 77)

Addictive disorder XGB
Extracted from cache (grid_search\Addictive_disorder\XGB)
Addictive disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Trauma and stress related disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Trauma and stress related disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Mood disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Mood disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Obsessive compulsive disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Obsessive compulsive disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Schizophrenia XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Schizophrenia LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Anxiety disorder XGB
Fitting 10 fold

In [101]:
md_results = pd.DataFrame(md_res_dict)
md_results

Unnamed: 0,disorder,algorithm,params,mean_score,std_score,cv_result
0,Addictive disorder,XGB,"{'max_depth': 1, 'n_estimators': 100, 'subsamp...",0.79348,0.101246,"{'mean_fit_time': [0.23733384609222413, 0.1881..."
1,Addictive disorder,LogisticRegression,"{'C': 0.5, 'l1_ratio': 1.0}",0.817456,0.1287,"{'mean_fit_time': [0.06900274753570557, 0.1100..."
2,Trauma and stress related disorder,XGB,"{'max_depth': 1, 'n_estimators': 100, 'subsamp...",0.892158,0.068392,"{'mean_fit_time': [0.20683982372283935, 0.1473..."
3,Trauma and stress related disorder,LogisticRegression,"{'C': 0.5, 'l1_ratio': 0.5}",0.908162,0.065336,"{'mean_fit_time': [0.06260802745819091, 0.0988..."
4,Mood disorder,XGB,"{'max_depth': 3, 'n_estimators': 100, 'subsamp...",0.827243,0.105897,"{'mean_fit_time': [0.23092033863067626, 0.1955..."
5,Mood disorder,LogisticRegression,"{'C': 1, 'l1_ratio': 0.5}",0.845497,0.097681,"{'mean_fit_time': [0.11011800765991211, 0.1565..."
6,Obsessive compulsive disorder,XGB,"{'max_depth': 1, 'n_estimators': 500, 'subsamp...",0.631778,0.166071,"{'mean_fit_time': [0.11675353050231933, 0.1268..."
7,Obsessive compulsive disorder,LogisticRegression,"{'C': 1, 'l1_ratio': 0.5}",0.627333,0.101609,"{'mean_fit_time': [0.05152659416198731, 0.0945..."
8,Schizophrenia,XGB,"{'max_depth': 1, 'n_estimators': 100, 'subsamp...",0.94197,0.065561,"{'mean_fit_time': [0.11791775226593018, 0.1442..."
9,Schizophrenia,LogisticRegression,"{'C': 0.5, 'l1_ratio': 1.0}",0.935816,0.063869,"{'mean_fit_time': [0.06112837791442871, 0.1036..."


In [102]:
def combine (x, y):
    return x.astype(str) + ' (' + y.astype(str) + ')'
def report_scores (df, index='disorder', columns='algorithm'):
    df['mean_std'] = combine(df['mean_score'].round(2), df['std_score'].round(2))
    return df.pivot_table(values=['mean_std'], columns=columns, index=index, aggfunc=lambda x: ' '.join(x))

In [104]:
report_scores(md_results) #ROC AUC ratings (std dev)

Unnamed: 0_level_0,mean_std,mean_std
algorithm,LogisticRegression,XGB
disorder,Unnamed: 1_level_2,Unnamed: 2_level_2
Addictive disorder,0.82 (0.13),0.79 (0.1)
Anxiety disorder,0.92 (0.07),0.9 (0.08)
Mood disorder,0.85 (0.1),0.83 (0.11)
Obsessive compulsive disorder,0.63 (0.1),0.63 (0.17)
Schizophrenia,0.94 (0.06),0.94 (0.07)
Trauma and stress related disorder,0.91 (0.07),0.89 (0.07)


## Specific disorder

In [105]:
sd_res_dict = grid_search(sd, models, param_grids, Xsd, Ysd, 77)

Alcohol use disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Alcohol use disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Acute stress disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Acute stress disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Depressive disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Depressive disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Behavioral addiction disorder XGB
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Behavioral addiction disorder LogisticRegression
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Obsessive compulsive disorder XGB
Extracted from cache (grid_search\Obsessive_compulsive_disorder\XGB)
Obsessive compulsive disorder LogisticRegression
Extracted from cache (grid_search\Obsessive_compulsive_disorder\Lo

In [106]:
sd_results = pd.DataFrame(sd_res_dict)
sd_results

Unnamed: 0,disorder,algorithm,params,mean_score,std_score,cv_result
0,Alcohol use disorder,XGB,"{'max_depth': 1, 'n_estimators': 100, 'subsamp...",0.880864,0.112452,"{'mean_fit_time': [0.09269309043884277, 0.1226..."
1,Alcohol use disorder,LogisticRegression,"{'C': 5, 'l1_ratio': 0.0}",0.925432,0.063328,"{'mean_fit_time': [0.05334415435791016, 0.0767..."
2,Acute stress disorder,XGB,"{'max_depth': 6, 'n_estimators': 100, 'subsamp...",0.874167,0.134383,"{'mean_fit_time': [0.05709865093231201, 0.0856..."
3,Acute stress disorder,LogisticRegression,"{'C': 0.5, 'l1_ratio': 0.5}",0.888056,0.176217,"{'mean_fit_time': [0.041493892669677734, 0.056..."
4,Depressive disorder,XGB,"{'max_depth': 1, 'n_estimators': 500, 'subsamp...",0.830342,0.104165,"{'mean_fit_time': [0.16784987449645997, 0.2085..."
5,Depressive disorder,LogisticRegression,"{'C': 1, 'l1_ratio': 0.0}",0.835079,0.095823,"{'mean_fit_time': [0.0941270112991333, 0.17028..."
6,Behavioral addiction disorder,XGB,"{'max_depth': 1, 'n_estimators': 100, 'subsamp...",0.715926,0.231179,"{'mean_fit_time': [0.11184439659118653, 0.1250..."
7,Behavioral addiction disorder,LogisticRegression,"{'C': 0.5, 'l1_ratio': 1.0}",0.754938,0.257506,"{'mean_fit_time': [0.06596841812133789, 0.1200..."
8,Obsessive compulsive disorder,XGB,"{'max_depth': 1, 'n_estimators': 500, 'subsamp...",0.631778,0.166071,"{'mean_fit_time': [0.11675353050231933, 0.1268..."
9,Obsessive compulsive disorder,LogisticRegression,"{'C': 1, 'l1_ratio': 0.5}",0.627333,0.101609,"{'mean_fit_time': [0.05152659416198731, 0.0945..."


In [107]:
report_scores(sd_results)

Unnamed: 0_level_0,mean_std,mean_std
algorithm,LogisticRegression,XGB
disorder,Unnamed: 1_level_2,Unnamed: 2_level_2
Acute stress disorder,0.89 (0.18),0.87 (0.13)
Adjustment disorder,0.92 (0.1),0.89 (0.1)
Alcohol use disorder,0.93 (0.06),0.88 (0.11)
Behavioral addiction disorder,0.75 (0.26),0.72 (0.23)
Bipolar disorder,0.85 (0.11),0.87 (0.11)
Depressive disorder,0.84 (0.1),0.83 (0.1)
Obsessive compulsive disorder,0.63 (0.1),0.63 (0.17)
Panic disorder,0.92 (0.06),0.86 (0.1)
Posttraumatic stress disorder,0.91 (0.12),0.91 (0.11)
Schizophrenia,0.94 (0.06),0.94 (0.07)
