In [1]:
import os
import numpy as np
import pandas as pd
import sys
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedGroupKFold, cross_validate, StratifiedKFold, LeaveOneOut
from sklearn.metrics import make_scorer, roc_auc_score, balanced_accuracy_score, accuracy_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from copy import deepcopy
import itertools
import argparse
from os import path as op
from glob import glob
from joblib import Parallel, delayed

# add path to classification analysis functions
from mixed_sigmoid_normalisation import MixedSigmoidScaler


# --- Run configuration --------------------------------------------------------
# NOTE(review): absolute, machine-specific path; change it when running elsewhere.
bids_root = "/headnode1/abry4213/data/Cogitate_MEG"
n_jobs = 1
subject_ID = "CA103"
classification_type = "averaged"
classifier = "Logistic_Regression"

# Load data paths (pyspi time-series feature tables)
pyspi_res_path = f"{bids_root}/derivatives/time_series_features"
pyspi_res_path_averaged = f"{pyspi_res_path}/averaged_epochs"
pyspi_res_path_individual = f"{pyspi_res_path}/individual_epochs"

# Output locations for classification results
classification_res_path = f"{bids_root}/derivatives/classification_results"
classification_res_path_averaged = f"{classification_res_path}/across_participants"
classification_res_path_individual = f"{classification_res_path}/within_participants"

# Make classification result directories (no-op if they already exist)
os.makedirs(classification_res_path_averaged, exist_ok=True)
os.makedirs(classification_res_path_individual, exist_ok=True)

# Define classifier: any value other than "Linear_SVM" falls back to
# L1-penalised logistic regression.
if classifier == "Linear_SVM":
    model = svm.SVC(C=1, class_weight='balanced', kernel='linear', random_state=127, probability=True)
else:
    model = LogisticRegression(penalty='l1', C=1, solver='liblinear', class_weight='balanced', random_state=127)

# Scaling and classification are bundled so the scaler is fit on training data only
pipe = Pipeline([('scaler', MixedSigmoidScaler(unit_variance=True)),
                 ('model', model)])

# Define scoring type.
# The built-in 'roc_auc' scorer replaces make_scorer(roc_auc_score, needs_proba=True):
# `needs_proba` was deprecated in scikit-learn 1.4 and is slated for removal in 1.6,
# whereas the string scorer is accepted by old and new releases alike.
scoring = {'accuracy': 'accuracy',
           'balanced_accuracy': 'balanced_accuracy',
           'AUC': 'roc_auc'}

# Define cross-validators
LOOCV = LeaveOneOut()
SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)



In [2]:
# Load in pyspi results
# glob() returns matches in filesystem order; sorting keeps the row order (and
# therefore the order of stimulus_type.unique() below) reproducible across runs.
pyspi_res_files = sorted(glob(f"{pyspi_res_path_averaged}/*all_pyspi_results_1000ms.csv"))
if not pyspi_res_files:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(
        f"No '*all_pyspi_results_1000ms.csv' files found in {pyspi_res_path_averaged}"
    )

all_pyspi_res_list = []
for pyspi_res_file in pyspi_res_files:
    pyspi_res = pd.read_csv(pyspi_res_file).reset_index(drop=True)

    # Harmonise label spellings across files
    pyspi_res['stimulus_type'] = pyspi_res['stimulus_type'].replace(False, 'false').replace('False', 'false')
    pyspi_res['relevance_type'] = pyspi_res['relevance_type'].replace("Relevant non-target", "Relevant-non-target")

    # Rename stimulus to stimulus_presentation if it is present;
    # when both columns exist, stimulus_presentation is kept and stimulus is dropped
    if 'stimulus' in pyspi_res.columns:
        if 'stimulus_presentation' in pyspi_res.columns:
            pyspi_res = pyspi_res.drop(columns=['stimulus'])
        else:
            pyspi_res = pyspi_res.rename(columns={'stimulus': 'stimulus_presentation'})

    all_pyspi_res_list.append(pyspi_res)

# ignore_index avoids duplicated row labels (each file's index restarts at 0)
all_pyspi_res = pd.concat(all_pyspi_res_list, ignore_index=True)

# Define comparisons

# meta-ROI comparisons: ordered pairs, so (A, B) and (B, A) are both included
Meta_ROIs = ["Category_Selective", "IPS", "Prefrontal_Cortex", "V1_V2"]
meta_ROI_comparisons = list(itertools.permutations(Meta_ROIs, 2))

# Relevance type comparisons
relevance_type_comparisons = ["Relevant-non-target", "Irrelevant"]

# Stimulus presentation comparisons
stimulus_presentation_comparisons = ["on", "off"]

# Stimulus type comparisons: every unordered pair of observed stimulus types
stimulus_types = all_pyspi_res.stimulus_type.unique().tolist()
stimulus_type_comparisons = list(itertools.combinations(stimulus_types, 2))

# Also add in face vs. non-face
stimulus_type_comparisons.append(("face", "non-face"))


In [3]:
def cross_task_classifier(direction, meta_roi_comparison, stimulus_presentation, pyspi_data):
    """Train on one relevance condition and test on the other, one SPI at a time.

    For every SPI in the selected meta-ROI pair / stimulus-presentation slice, and
    for every stimulus pair in the module-level ``stimulus_type_comparisons``, a
    fresh copy of the module-level ``pipe`` is fit on the single ``value`` feature
    of the training relevance condition and evaluated on the other condition.

    Parameters
    ----------
    direction : str
        ``"relevant_to_irrelevant"`` trains on ``'Relevant-non-target'`` rows and
        tests on ``'Irrelevant'`` rows; any other value does the reverse.
    meta_roi_comparison : tuple
        ``(ROI_from, ROI_to)`` pair matched against ``meta_ROI_from`` / ``meta_ROI_to``.
    stimulus_presentation : str
        Value matched against the ``stimulus_presentation`` column.
    pyspi_data : pandas.DataFrame
        Table with at least the columns ``meta_ROI_from``, ``meta_ROI_to``,
        ``stimulus_presentation``, ``SPI``, ``value``, ``stimulus_type`` and
        ``relevance_type``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated (SPI, stimulus combo) with the accuracy and balanced
        accuracy on the held-out condition. Empty (but with the result columns)
        when nothing could be evaluated.
    """
    ROI_from, ROI_to = meta_roi_comparison

    result_columns = ["SPI", "classifier", "meta_ROI_from", "meta_ROI_to",
                      "cross_task_direction", "stimulus_presentation",
                      "stimulus_combo", "accuracy", "balanced_accuracy"]

    # Filter pyspi data. errors='ignore' because an 'index' column only exists if
    # the input was saved/reset with its index kept; its absence is not an error.
    pyspi_data = (pyspi_data.query("meta_ROI_from == @ROI_from & meta_ROI_to == @ROI_to & stimulus_presentation == @stimulus_presentation")
                    .reset_index(drop=True)
                    .drop(columns=['index'], errors='ignore'))

    # One dict per evaluated (SPI, stimulus combo)
    result_records = []

    for SPI in pyspi_data.SPI.unique():
        # Extract this SPI; copy so the imputation below writes to an independent frame
        this_SPI_data = pyspi_data.loc[pyspi_data.SPI == SPI].copy()

        # Find overall number of rows
        num_rows = this_SPI_data.shape[0]
        if num_rows == 0:
            continue

        # Extract SPI values
        this_column_data = this_SPI_data["value"]

        # Proportion of NaN in this column
        prop_NaN = this_column_data.isna().sum() / num_rows

        # Count of the most frequent value, and the SD (both from the un-imputed values)
        column_mode_max = this_column_data.value_counts().max()
        column_SD = this_column_data.std()

        # If there are:
        # - 10% or more NaN values;
        # - more than 90% of the values are the same; OR
        # - the standard deviation is less than 1*10**(-10)
        # then skip this SPI. The NaN bound is inclusive so that a column with
        # exactly 10% NaN (not imputed below) never reaches the classifier.
        if prop_NaN >= 0.1 or column_mode_max / num_rows > 0.9 or column_SD < 1*10**(-10):
            print(f"{SPI} has low SD: {column_SD}, and/or too many mode occurences: {column_mode_max} out of {num_rows}, and/or {100*prop_NaN}% NaN")
            continue

        # If 0% < prop_NaN < 10%, impute missing values with the column mean
        if 0 < prop_NaN < 0.1:
            print(f"Imputing column values for {SPI}")
            this_SPI_data["value"] = this_column_data.fillna(this_column_data.mean())

        # Iterate over stimulus combos
        for this_combo in stimulus_type_comparisons:

            # Subset data to the corresponding stimulus pairs
            if this_combo == ("face", "non-face"):
                # Collapse every non-face stimulus type into a single class
                combo_data = this_SPI_data.assign(stimulus_type=lambda x: np.where(x.stimulus_type == "face", "face", "non-face"))
            else:
                combo_data = this_SPI_data[this_SPI_data.stimulus_type.isin(this_combo)]

            if direction == "relevant_to_irrelevant":
                train_relevance, test_relevance = "Relevant-non-target", "Irrelevant"
            else:
                train_relevance, test_relevance = "Irrelevant", "Relevant-non-target"
            train_df = combo_data[combo_data.relevance_type == train_relevance]
            test_df = combo_data[combo_data.relevance_type == test_relevance]

            # A classifier cannot be fit on fewer than two classes or scored on nothing;
            # report and move on rather than aborting the whole run.
            if train_df.stimulus_type.nunique() < 2 or test_df.empty:
                print(f"Skipping {SPI} for {this_combo}: insufficient data to train/test")
                continue

            # Make a deepcopy of the pipeline so fits never leak between iterations
            this_iter_pipe = deepcopy(pipe)

            # Single-feature design matrix; labels stay 1-D as scikit-learn expects
            X_train = train_df.value.to_numpy().reshape(-1, 1)
            y_train = train_df.stimulus_type.to_numpy()
            X_test = test_df.value.to_numpy().reshape(-1, 1)
            y_test = test_df.stimulus_type.to_numpy()

            this_iter_pipe.fit(X_train, y_train)
            y_pred = this_iter_pipe.predict(X_test)

            # Compute accuracy and balanced accuracy (AUC is not computed here)
            result_records.append({"SPI": SPI,
                                   "classifier": classifier,
                                   "meta_ROI_from": ROI_from,
                                   "meta_ROI_to": ROI_to,
                                   "cross_task_direction": direction,
                                   "stimulus_presentation": stimulus_presentation,
                                   "stimulus_combo": this_combo,
                                   "accuracy": accuracy_score(y_test, y_pred),
                                   "balanced_accuracy": balanced_accuracy_score(y_test, y_pred)})

    # Build the result table once; yields an empty, correctly-labelled frame
    # when no SPI/combo could be evaluated.
    return pd.DataFrame(result_records, columns=result_columns)


In [4]:
# Enumerate every (direction, stimulus presentation, meta-ROI pair) setting;
# the rightmost loop varies fastest, matching a Cartesian product.
all_combos = [
    (cross_task_direction, presentation, roi_pair)
    for cross_task_direction in ("relevant_to_irrelevant", "irrelevant_to_relevant")
    for presentation in ("on", "off")
    for roi_pair in meta_ROI_comparisons
]

# Tabular view of the same settings, one row per combination
combo_column_names = ["cross_task_direction", "stimulus_presentation", "meta_ROI_comparison"]
all_combos_df = pd.DataFrame(all_combos, columns=combo_column_names)


In [None]:
# Serial smoke test of cross_task_classifier on two example SPIs.
# To debug a single setting, fix the loop variables by hand, e.g.:
# direction = "relevant_to_irrelevant"
# stimulus_presentation = "on"
# meta_roi_comparison = meta_ROI_comparisons[0]
# meta_roi_from, meta_roi_to = meta_roi_comparison

for direction in ["relevant_to_irrelevant", "irrelevant_to_relevant"]:
    for stimulus_presentation in stimulus_presentation_comparisons:
        # The list is named meta_ROI_comparisons (capital "ROI"); the lowercase
        # spelling used previously is undefined and raised NameError.
        for meta_roi_comparison in meta_ROI_comparisons:
            meta_roi_from, meta_roi_to = meta_roi_comparison
            example_pyspi_data = all_pyspi_res.query(f"meta_ROI_from == '{meta_roi_from}' & meta_ROI_to == '{meta_roi_to}' & stimulus_presentation == '{stimulus_presentation}' & SPI in ['cov_EmpiricalCovariance', 'prec_EmpiricalCovariance']")
            # `trial` is overwritten each iteration, so only the last setting's
            # result remains afterwards.
            trial = cross_task_classifier(direction=direction,
                                          meta_roi_comparison=meta_roi_comparison,
                                          stimulus_presentation=stimulus_presentation,
                                          pyspi_data=example_pyspi_data)

In [None]:
n_jobs = 1

# Restrict to the two example SPIs once; the query does not depend on the task,
# so there is no need to re-run it for every combination.
example_spi_data = all_pyspi_res.query("SPI in ['cov_EmpiricalCovariance', 'prec_EmpiricalCovariance']")

# One cross-task classification job per (direction, stimulus presentation, meta-ROI pair)
test = Parallel(n_jobs=int(n_jobs))(
    delayed(cross_task_classifier)(direction=direction,
                                   meta_roi_comparison=meta_roi_comparison,
                                   stimulus_presentation=stimulus_presentation,
                                   pyspi_data=example_spi_data)
    for direction, stimulus_presentation, meta_roi_comparison in all_combos)

# Stack the per-setting result tables into one frame
test_results = pd.concat(test)

In [3]:
# Reload previously saved cross-task results for the configured classifier.
# The filename is built from `classifier` so that changing the setting cannot
# silently load another model's results. Note this overwrites any in-memory
# `test_results` computed earlier in the session.
test_results = pd.read_csv(f"{classification_res_path_averaged}/cross_task_{classifier}_classification_results.csv")