In [1]:
import os
import sys
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.multioutput import ClassifierChain
from sklearn.utils import check_X_y, check_random_state

# Root of the abcd_paper project checkout. Set the ABCD_PAPER_ROOT environment
# variable to run this notebook from another machine; the original location is
# kept as the fallback so existing setups behave exactly as before.
main_path = Path(os.environ.get(
    'ABCD_PAPER_ROOT',
    r'C:\Users\Richard\Desktop\ABCD_Study\Publication\abcd_paper'
))
# Make the project's `src` package importable. The membership check keeps
# sys.path free of duplicates when this cell is re-run in the same kernel.
if str(main_path) not in sys.path:
    sys.path.append(str(main_path))

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import src.data.preprocess_data as prep
from src.data.data_loader import RepeatedStratifiedKFoldDataloader
import src.data.var_names as abcd_vars

In [None]:
class ValidationClassifierChain(ClassifierChain):
    """A classifier chain that can be fit to a training and validation set.

    One clone of ``base_estimator`` is trained per response column. Each
    estimator in the chain sees the original features plus one extra column
    per earlier chain link: the true labels when ``cv`` is None, otherwise
    cross-validated predictions.

    NOTE(review): ``fit`` re-implements the chain fitting loop instead of
    calling the parent ``fit``, so it relies on the parent exposing
    ``base_estimator``, ``order``, ``cv`` and ``random_state`` -- confirm this
    holds for the installed scikit-learn version.
    """

    def fit(self, trainX: pd.DataFrame, trainY: pd.DataFrame,
            validation: Optional[pd.DataFrame] = None, **fit_params):
        """Fit one estimator per response column, in chain order.

        Parameters
        ----------
        trainX : pd.DataFrame
            Training features, one row per sample.
        trainY : pd.DataFrame
            Training responses, one column per label; rows align with
            `trainX`.
        validation : pd.DataFrame, optional
            Validation set. NOTE(review): accepted so callers can already pass
            it, but the fitting logic below does not use it yet.
        **fit_params
            Extra keyword arguments forwarded to every estimator's ``fit``.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If `trainY` is not two-dimensional, or if ``order`` is neither
            None, ``'random'`` nor a permutation of the label column indices.
        """
        # Validate and convert to arrays: the positional slicing used below
        # (``Y[:, idx]``) does not work on DataFrames.
        X, Y = check_X_y(trainX, trainY, multi_output=True, accept_sparse=True)
        if Y.ndim != 2:
            raise ValueError(
                "trainY must be two-dimensional (samples x labels)")
        n_features = X.shape[1]
        n_labels = Y.shape[1]

        # Resolve the chain order into an index array stored on ``order_``.
        random_state = check_random_state(self.random_state)
        self.order_ = self.order
        if isinstance(self.order_, tuple):
            self.order_ = np.array(self.order_)

        if self.order_ is None:
            self.order_ = np.arange(n_labels)
        elif isinstance(self.order_, str):
            if self.order_ != 'random':
                raise ValueError("invalid order")
            self.order_ = random_state.permutation(n_labels)
        elif sorted(self.order_) != list(range(n_labels)):
            raise ValueError("invalid order")

        self.estimators_ = [clone(self.base_estimator)
                            for _ in range(n_labels)]

        if self.cv is None:
            # Without CV, later chain links are trained on the true labels of
            # the earlier ones.
            Y_pred_chain = Y[:, self.order_]
            if sp.issparse(X):
                X_aug = sp.hstack((X, Y_pred_chain), format='lil')
                X_aug = X_aug.tocsr()
            else:
                X_aug = np.hstack((X, Y_pred_chain))

        elif sp.issparse(X):
            # Placeholder label columns; filled with cross-validated
            # predictions while the chain is fit.
            Y_pred_chain = sp.lil_matrix((X.shape[0], n_labels))
            X_aug = sp.hstack((X, Y_pred_chain), format='lil')

        else:
            Y_pred_chain = np.zeros((X.shape[0], n_labels))
            X_aug = np.hstack((X, Y_pred_chain))

        del Y_pred_chain

        for chain_idx, estimator in enumerate(self.estimators_):
            y = Y[:, self.order_[chain_idx]]
            # Columns visible to this link: all features plus the label
            # columns of the links before it.
            col_idx = n_features + chain_idx
            estimator.fit(X_aug[:, :col_idx], y, **fit_params)
            if self.cv is not None and chain_idx < n_labels - 1:
                cv_result = cross_val_predict(
                    self.base_estimator, X_aug[:, :col_idx],
                    y=y, cv=self.cv)
                if sp.issparse(X_aug):
                    X_aug[:, col_idx] = np.expand_dims(cv_result, 1)
                else:
                    X_aug[:, col_idx] = cv_result

        # ClassifierChain documents a ``classes_`` attribute; populate it here
        # because the parent ``fit`` is bypassed.
        self.classes_ = [estimator.classes_ for estimator in self.estimators_]

        return self

class ClassifierChainEnsemble:
    """Ensemble of randomly ordered classifier chains.

    Every chain wraps the same balanced logistic-regression template and uses
    its own random seed for the chain order. ``predict`` returns the mean of
    the chains' predicted probabilities.
    """

    def __init__(self,
                 num_chains: int = 10,
                 features=None,
                 responses=None):
        """Create the (unfitted) chains.

        Parameters
        ----------
        num_chains : int
            Number of chains in the ensemble; must be at least 1.
        features : sequence of str, optional
            Columns of the input DataFrames used as model inputs.
        responses : sequence of str, optional
            Columns of the input DataFrames used as labels.

        Raises
        ------
        ValueError
            If `num_chains` is smaller than 1.
        """
        if num_chains < 1:
            raise ValueError("num_chains must be at least 1")
        self.features = None if features is None else list(features)
        self.responses = None if responses is None else list(responses)
        logistic_regression = LogisticRegression(
            random_state=0, solver='lbfgs', max_iter=500,
            class_weight='balanced'
        )
        # Seed i gives chain i its own random label order.
        self.chains = [
            ValidationClassifierChain(
                logistic_regression, order='random', random_state=i
            ) for i in range(num_chains)
        ]

    def fit(self,
            train: pd.DataFrame,
            validation: Optional[pd.DataFrame] = None):
        """Fit every chain on the feature/response columns of `train`.

        Parameters
        ----------
        train : pd.DataFrame
            Training set containing the feature and response columns.
        validation : pd.DataFrame, optional
            Validation set, passed through unchanged to each chain's ``fit``.

        Returns
        -------
        self : ClassifierChainEnsemble

        Raises
        ------
        ValueError
            If `features` or `responses` were not given to the constructor.
        """
        if self.features is None or self.responses is None:
            raise ValueError(
                "features and responses must be set to split the training "
                "set into inputs and labels"
            )
        trainX = train[self.features]
        trainY = train[self.responses]
        for chain in self.chains:
            chain.fit(trainX, trainY, validation)
        return self

    def predict(self,
                df: pd.DataFrame):
        """Return the chain-averaged predicted probabilities for `df`.

        Only the configured feature columns are passed to the chains; when no
        features were configured, `df` is used as given.
        """
        X = df if self.features is None else df[self.features]
        predictions = np.array(
            [chain.predict_proba(X) for chain in self.chains]
        )
        # Axis 0 indexes the chains, so this averages across the ensemble.
        y_pred_ensemble = predictions.mean(axis=0)
        return y_pred_ensemble

In [2]:
# Raw-data directory, passed to both preprocessing calls below.
abcd_raw_dir = main_path / 'data' / 'raw'

# Load the complete table first, then reduce it with
# select_one_child_per_family (fixed seed so the selection is repeatable).
total_df = prep.load_complete_df(abcd_raw_dir)
total_df = prep.select_one_child_per_family(
    abcd_data_path=abcd_raw_dir,
    abcd_df=total_df,
    random_state=0,
)
total_df.shape

(7188, 304)

In [3]:
# Split generator over the table produced by select_one_child_per_family.
# NOTE(review): n, k and val_ratio presumably mean number of repeats, number
# of folds and validation share -- confirm in src.data.data_loader.
data_loader = RepeatedStratifiedKFoldDataloader(
    dataframe=total_df,
    features=abcd_vars.all_brain_features.features,
    responses=abcd_vars.diagnoses.features,
    confounders=abcd_vars.sociodem.features,
    n=1,
    k=5,
    val_ratio=0.2,
)

In [None]:
# Response columns to predict and evaluate. This is the same list the data
# loader is configured with (`responses=abcd_vars.diagnoses.features`).
RESPONSES = abcd_vars.diagnoses.features

# One ROC AUC result per split yielded by the data loader.
roc_auc_values = []

# NOTE(review): MultilabelBinaryEvaluator is not imported in the visible part
# of this notebook -- import it from its project module before running.
for i, (train_set, validation_set, test_set, selected_features) in (
        enumerate(data_loader)):

    # Fresh ensemble per split, restricted to the features selected for it.
    cce = ClassifierChainEnsemble(
        features=selected_features,
        responses=RESPONSES,
        num_chains=10
    )
    cce.fit(
        train=train_set,
        validation=validation_set
    )
    y_pred_test = cce.predict(test_set)

    # TODO: Save data: (src_subject_id, RESPONSES) from test_set and y_pred_test

    evaluator = MultilabelBinaryEvaluator(
        y_true=test_set[RESPONSES], y_pred=y_pred_test
    )
    roc_auc_values.append(evaluator.roc_auc())