In [3]:
import os
import sys
from pathlib import Path

import pandas as pd
from sklearn.multioutput import ClassifierChain

# Root of the project checkout. It is added to sys.path so that the `src`
# package can be imported below.
# The location can be overridden with the ABCD_PAPER_PATH environment variable;
# when that is unset or empty, the original author's local path is used.
main_path = Path(
    os.environ.get('ABCD_PAPER_PATH')
    or r'C:\Users\Richard\Desktop\ABCD_Study\Publication\abcd_paper'
)
# Guarded so that re-running this cell does not keep appending duplicates.
if str(main_path) not in sys.path:
    sys.path.append(str(main_path))

# OPTIONAL: load IPython's "autoreload" extension so that edits to imported
# modules take effect without restarting the kernel.
%load_ext autoreload

# OPTIONAL: mode 2 reloads all modules before each cell is executed, so that
# changes made to code in src are picked up automatically.
%autoreload 2

import src.data.preprocess_data as prep
from src.data.data_loader import RepeatedStratifiedKFoldDataloader
import src.data.var_names as abcd_vars


# Build the working table from <main_path>/data/raw, then pass it through
# select_one_child_per_family (presumably keeps a single subject per family --
# confirm in src.data.preprocess_data). The selection is seeded with 0.
total_df = prep.load_complete_df(main_path / 'data' / 'raw')
total_df = prep.select_one_child_per_family(
    abcd_data_path=main_path / 'data' / 'raw',
    abcd_df=total_df,
    random_state=0,
)
# Printed rather than left as the last expression because more statements
# follow in this cell.
print(total_df.shape)

# Split generator over total_df. The column groups come from
# src.data.var_names: brain features as model inputs, diagnoses as responses,
# sociodemographic variables as confounders.
# NOTE(review): n=1 / k=5 / val_ratio=0.2 presumably mean one repetition of
# 5-fold splitting with a 20% validation share -- confirm in
# RepeatedStratifiedKFoldDataloader.
data_loader = RepeatedStratifiedKFoldDataloader(
    dataframe=total_df,
    features=abcd_vars.all_brain_features.features,
    responses=abcd_vars.diagnoses.features,
    confounders=abcd_vars.sociodem.features,
    n=1,
    k=5,
    val_ratio=0.2,
)

(7188, 304)


In [4]:
# Only the first item yielded by the loader is used in the rest of the
# notebook; it unpacks into three data splits plus the selected feature names.
(train, valid, test, features_selected) = next(iter(data_loader))



In [39]:
from src.models.xgboost_pipeline import DepthwiseXGBPipeline, ErrorFunctions
from src.models.evaluate import BinaryEvaluator

In [9]:
# Single-response pipeline for 'Bipolar Disorder', restricted to the features
# selected for this split.
# NOTE(review): n_calls=11 is presumably the hyper-parameter search budget --
# confirm in DepthwiseXGBPipeline.
pipe = DepthwiseXGBPipeline(
    y_col='Bipolar Disorder',
    include_cols=features_selected,
    n_calls=11,
    random_state=0,
)
# Left as the last expression, so the cell displays whatever fit() returns.
pipe.fit(train, valid)

<src.models.xgboost_pipeline.DepthwiseXGBPipeline at 0x2166ff944c0>

### Sanity check: Has the model fit the training set?

In [31]:
# Score the fitted pipeline on the `test` split (not on `train`).
# NOTE(review): to check how well the model fits the training data itself,
# `train` would have to be passed here instead -- confirm which is intended.
y_pred = pipe.predict(test)
evaluator = BinaryEvaluator(
    test['Bipolar Disorder'],
    y_pred,
)
evaluator.roc_auc

0.5330099066411187

### Use a dummy base_estimator model

In [94]:
from src.models.classifier_chain import LogisticRegressionModel, XGBoostClassifierChain

In [73]:
# Logistic-regression model for the same response and feature set as the
# XGBoost pipeline above.
model = LogisticRegressionModel(
    y_col='Bipolar Disorder',
    include_cols=features_selected,
)
model.fit(train, valid)

In [74]:
# ROC AUC of the logistic-regression model on the `test` split.
# Note that this rebinds `y_pred` and `evaluator` from the earlier cell.
y_pred = model.predict(test)
evaluator = BinaryEvaluator(
    test['Bipolar Disorder'],
    y_pred,
)
evaluator.roc_auc

0.6054466142117494

In [95]:
# Classifier chain over all diagnosis columns, using the features selected for
# this split.
# NOTE(review): the repr displayed after fitting shows error_function=None, so
# the value passed here may not be stored or used -- verify in
# XGBoostClassifierChain / ErrorFunctions.log_loss.
cc = XGBoostClassifierChain(
    features=features_selected,
    responses=abcd_vars.diagnoses.features,
    error_function=ErrorFunctions.log_loss(),
)

In [96]:
# Fit the chain on the training split, passing the validation split as the
# second argument. As the last expression, the return value of fit() is
# displayed as the cell output.
cc.fit(train, valid)



XGBoostClassifierChain(error_function=None,
                       features=['sri24precentrallgm', 'sri24precentralrgm',
                                 'sri24frontalsuplgm', 'sri24frontalsuprgm',
                                 'sri24frontalsuporblgm',
                                 'sri24frontalsuporbrgm', 'sri24frontalmidlgm',
                                 'sri24frontalmidrgm', 'sri24frontalmidorblgm',
                                 'sri24frontalmidorbrgm',
                                 'sri24frontalinfoperlgm',
                                 'sri24frontalinfoperrgm',
                                 'sri24frontalinftrilgm',
                                 '...
                                 'sri24frontalsupmediallgm',
                                 'sri24frontalsupmedialrgm',
                                 'sri24frontalmedorblgm',
                                 'sri24frontalmedorbrgm', 'sri24rectuslgm',
                                 'sri24rectusrgm', 'sri2

In [100]:
# Chain predictions for every response on the `test` split; only the 'ADHD'
# column is scored here. `y_pred` is indexed by response name.
y_pred = cc.predict_proba(test)
evaluator = BinaryEvaluator(
    test['ADHD'],
    y_pred['ADHD'],
)
evaluator.roc_auc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

0.5501949765857812

In [101]:
# Second predict_proba call on the same data, stored under a separate name.
# NOTE(review): presumably meant for comparing against `y_pred` (e.g. to check
# that repeated calls agree) -- no comparison is made in the visible cells.
y_pred2 = cc.predict_proba(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

In [111]:
# Display the predictions divided element-wise by 10.
# NOTE(review): the reason for the factor 10 is not evident from this
# notebook -- confirm or remove.
y_pred / 10

Unnamed: 0_level_0,Major Depressive Disorder,Bipolar Disorder,Psychotic Symptoms,ADHD,Oppositional Defiant Disorder,Conduct Disorder,PTSD,Obsessive Compulsive Disorder
src_subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NDAR_INV00NPMHND,0.038678,0.055190,0.028017,0.027110,0.057935,0.004299,0.037633,0.052844
NDAR_INV00R4TXET,0.091701,0.046064,0.098279,0.054389,0.099295,0.097225,0.086653,0.098277
NDAR_INV014RTM1V,0.093340,0.072208,0.099885,0.052810,0.095463,0.054420,0.026795,0.095278
NDAR_INV02JXJUZG,0.042395,0.027325,0.009648,0.033859,0.008256,0.000750,0.009858,0.018041
NDAR_INV03BDCNWM,0.082867,0.070847,0.091400,0.067885,0.099882,0.084685,0.064378,0.089727
...,...,...,...,...,...,...,...,...
NDAR_INVZTTGYL51,0.091228,0.053997,0.099519,0.048456,0.091205,0.099041,0.059678,0.086971
NDAR_INVZU422XFY,0.029521,0.014580,0.000089,0.033592,0.022341,0.009001,0.021482,0.037063
NDAR_INVZXC2YRV3,0.064030,0.085772,0.041631,0.074655,0.089015,0.094974,0.004400,0.093341
NDAR_INVZXF5C635,0.099187,0.077842,0.099888,0.055268,0.099768,0.099917,0.097956,0.095357


In [82]:
# For each fitted estimator in the chain, print the entries of its
# `_include_cols` that are not brain features, i.e. the extra (response)
# columns that link of the chain receives as input.
for e in cc.estimators_:
    print([
        col
        for col in e._include_cols
        if col not in abcd_vars.all_brain_features.features
    ])

[]
['ADHD']
['ADHD', 'Bipolar Disorder']
['ADHD', 'Bipolar Disorder', 'Psychotic Symptoms']
['ADHD', 'Bipolar Disorder', 'Psychotic Symptoms', 'Conduct Disorder']
['ADHD', 'Bipolar Disorder', 'Psychotic Symptoms', 'Conduct Disorder', 'PTSD']
['ADHD', 'Bipolar Disorder', 'Psychotic Symptoms', 'Conduct Disorder', 'PTSD', 'Major Depressive Disorder']
['ADHD', 'Bipolar Disorder', 'Psychotic Symptoms', 'Conduct Disorder', 'PTSD', 'Major Depressive Disorder', 'Oppositional Defiant Disorder']


In [84]:
# Display the fitted chain's `responses_` attribute.
# NOTE(review): presumably the order in which the responses were chained;
# compare with the extra input columns printed per estimator.
cc.responses_

array(['ADHD', 'Bipolar Disorder', 'Psychotic Symptoms',
       'Conduct Disorder', 'PTSD', 'Major Depressive Disorder',
       'Oppositional Defiant Disorder', 'Obsessive Compulsive Disorder'],
      dtype='<U29')

In [None]:
# Manual walk through the chain's prediction loop, outside the class, for
# debugging. The original scratch version referred to `data` and `self`, which
# do not exist in the notebook; it now runs against `test` and the fitted `cc`.
# NOTE(review): assumes `cc.features` holds the constructor argument (as the
# estimator's repr suggests) and that each chained estimator's predict()
# accepts this frame -- confirm in src.models.classifier_chain.
#
# An explicit copy is taken so that the response columns added below are
# written to an independent frame and not to a slice of `test`, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
data_ = test[cc.features].copy()
predictions_df = pd.DataFrame(index=test.index)
for estimator, response in zip(cc.estimators_, cc.responses_):
    predictions_df[response] = estimator.predict(data_)
    # Predict new labels based on threshold predictions. Set threshold
    # to 0.5 arbitrarily. The thresholded label becomes an input column for
    # the estimators later in the chain.
    data_[response] = predictions_df[response] > .5

In [70]:
# Small toy frame for experimenting with column assignment. It is defined
# explicitly here (values taken from the displayed output) so that the cell
# does not depend on a variable left over in the kernel.
a = pd.DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
a

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [71]:
# Experiment: add a new column through `.loc[:, <name>] = values`, the form
# recommended by pandas' SettingWithCopyWarning message. This modifies `a` in
# place, so re-running the cell overwrites the column with the same values.
a.loc[:, '001'] = [33, 77]
a

Unnamed: 0,a,b,c,001
0,1,2,3,33
1,4,5,6,77


### Classifier Chain Ensemble

In [112]:
from src.models.classifier_chain import ClassifierChainEnsemble

# Ensemble of classifier chains over the same features and responses as the
# single chain above.
# NOTE(review): num_chains=10 is presumably the number of chains in the
# ensemble -- confirm in ClassifierChainEnsemble.
cce = ClassifierChainEnsemble(
    features=features_selected,
    responses=abcd_vars.diagnoses.features,
    num_chains=10,
)

In [113]:
# Fit the ensemble on the training split, passing the validation split as the
# second argument.
cce.fit(train, valid)

In [115]:
# Ensemble predictions on the `test` split, scored for 'ADHD' only.
# Note that `y_pred` is rebound here; it previously held the single chain's
# predict_proba output.
y_pred = cce.predict(test)
evaluator = BinaryEvaluator(
    test['ADHD'],
    y_pred['ADHD'],
)
evaluator.roc_auc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

0.5394499787143465

In [116]:
# Display the ensemble predictions: one row per subject, one column per
# response.
y_pred

Unnamed: 0_level_0,Major Depressive Disorder,Bipolar Disorder,Psychotic Symptoms,ADHD,Oppositional Defiant Disorder,Conduct Disorder,PTSD,Obsessive Compulsive Disorder
src_subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NDAR_INV00NPMHND,0.166438,0.500392,0.077978,0.254853,0.394870,0.028832,0.209970,0.354903
NDAR_INV00R4TXET,0.721549,0.745030,0.834234,0.900232,0.837031,0.619882,0.815816,0.714675
NDAR_INV014RTM1V,0.828228,0.801327,0.877881,0.896892,0.717972,0.385991,0.528537,0.755877
NDAR_INV02JXJUZG,0.424345,0.286904,0.186483,0.385892,0.177370,0.022704,0.069808,0.322026
NDAR_INV03BDCNWM,0.307098,0.653680,0.132745,0.857279,0.856318,0.132043,0.252017,0.318466
...,...,...,...,...,...,...,...,...
NDAR_INVZTTGYL51,0.756831,0.754538,0.553972,0.876307,0.799658,0.781630,0.863771,0.673843
NDAR_INVZU422XFY,0.684802,0.580138,0.200429,0.794569,0.827654,0.751614,0.765139,0.687808
NDAR_INVZXC2YRV3,0.510244,0.902457,0.180837,0.916730,0.707327,0.501060,0.350972,0.694178
NDAR_INVZXF5C635,0.912517,0.842281,0.780551,0.971059,0.914491,0.903277,0.938646,0.661172


In [117]:
from src.models.evaluate import MultilabelBinaryEvaluator

# Evaluate all responses at once: the observed diagnosis columns of the `test`
# split against the ensemble predictions held in `y_pred`.
observed_vs_predicted = (test[abcd_vars.diagnoses.features], y_pred)
evaluator = MultilabelBinaryEvaluator(*observed_vs_predicted)
del observed_vs_predicted

In [118]:
# Display the ROC AUC per response. Unlike BinaryEvaluator.roc_auc, which is
# accessed as an attribute earlier in the notebook, this one is called as a
# method.
evaluator.roc_auc()

{'Major Depressive Disorder': 0.5165120552859486,
 'Bipolar Disorder': 0.6350686941987035,
 'Psychotic Symptoms': 0.46721108636476116,
 'ADHD': 0.5394499787143465,
 'Oppositional Defiant Disorder': 0.5396630080603669,
 'Conduct Disorder': 0.5008142116950407,
 'PTSD': 0.5123893805309734,
 'Obsessive Compulsive Disorder': 0.5571959815729589}