In [3]:
import itertools
import os
import sys
from copy import deepcopy
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Statistics
from tslearn import barycenters
from scipy.stats import wasserstein_distance

# Classification
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedGroupKFold, cross_validate, StratifiedKFold, LeaveOneOut, cross_val_predict
from sklearn.pipeline import Pipeline

# add path to classification analysis functions
from mixed_sigmoid_normalisation import MixedSigmoidScaler

# Add rpy2
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [None]:
%%R 

# Attach the R packages used by the R cells, hiding their start-up banners
suppressPackageStartupMessages(
    invisible(lapply(
        c("broom", "cowplot", "glue", "patchwork", "see", "tidyverse"),
        library,
        character.only = TRUE
    ))
)

# Make the cowplot theme the default ggplot2 theme
theme_set(theme_cowplot())

In [5]:
# Load pyspi SPI info
pyspi_SPI_info = pd.read_csv("../feature_extraction/pyspi_SPI_info.csv")

# Path to the derivatives directory. Set the COGITATE_MEG_DERIV_DIR environment
# variable to point at a different location; the default is the original path.
deriv_dir = os.environ.get("COGITATE_MEG_DERIV_DIR", "/Users/abry4213/data/Cogitate_MEG/derivatives")

# Locate the time-resolved barycenter result files.
# glob() returns files in arbitrary (filesystem-dependent) order, so sort them to
# keep the row order of the concatenated table reproducible across machines.
barycenter_time_res_files = sorted(glob(f"{deriv_dir}/time_series_features/averaged_epochs/*barycenter*.csv"))

# Fail early with a clear message instead of pd.concat's "No objects to concatenate"
if not barycenter_time_res_files:
    raise FileNotFoundError(
        f"No '*barycenter*.csv' files found in {deriv_dir}/time_series_features/averaged_epochs"
    )

# Read each barycenter results file into its own DataFrame
all_time_resolved_barycenter_res_list = []
for barycenter_time_res_file in barycenter_time_res_files:
    barycenter_time_resolved_res = pd.read_csv(barycenter_time_res_file)
    all_time_resolved_barycenter_res_list.append(barycenter_time_resolved_res)

# Concatenate the per-file results (the original per-file row index is kept)
all_time_resolved_barycenter_res = pd.concat(all_time_resolved_barycenter_res_list)

# Separate out into raw and absolute value barycenter results:
# rows whose 'Region' contains the string 'abs' are the absolute-value results,
# and their '_abs' suffix is stripped so Region names match the raw results.
all_raw_time_resolved_barycenter_res = all_time_resolved_barycenter_res.query("Region.str.contains('abs') == False")
all_abs_time_resolved_barycenter_res = (all_time_resolved_barycenter_res
                                        .query("Region.str.contains('abs') == True")
                                        .assign(Region=lambda x: x.Region.str.replace("_abs", "", regex=False)))

In [7]:
# Preview the first rows of the raw (Region without 'abs') barycenter results
all_raw_time_resolved_barycenter_res.head()

Unnamed: 0,Presentation,Barycenter_Type,Mean,Max,Max_Time,Subject,Region,Relevance,Stimulus,Barycenter_Method
0,Onset,Original,-2.8393320000000004e-17,2.314943,563.0,sub-CB049,CS_PFC,Irrelevant,False,euclidean
1,Offset,Original,0.0,2.280326,785.0,sub-CB049,CS_PFC,Irrelevant,False,euclidean
2,Onset,Squared,0.5649296,8.072949,972.0,sub-CB049,CS_PFC,Irrelevant,False,euclidean
3,Offset,Squared,0.5647868,5.199888,785.0,sub-CB049,CS_PFC,Irrelevant,False,euclidean
8,Onset,Original,-2.8393320000000004e-17,1.565117,770.0,sub-CB049,CS_VIS,Irrelevant,False,euclidean


## Robustness check 1: stimulus classification from barycenters computed on raw vs. absolute-value time series

In [4]:
# Classifier: L1-penalised logistic regression with balanced class weights
model = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    class_weight='balanced',
    random_state=127,
)

# Pipeline: mixed-sigmoid scaling of the features, followed by the classifier
pipe = Pipeline(steps=[
    ('scaler', MixedSigmoidScaler(unit_variance=True)),
    ('model', model),
])

# Metrics evaluated on every cross-validation fold
scoring = dict(
    accuracy='accuracy',
    balanced_accuracy='balanced_accuracy',
    AUC=make_scorer(roc_auc_score, response_method='predict_proba'),
)

# Meta-ROIs; the first entry is paired with each of the others below
meta_ROIs = ["Category_Selective", "IPS", "Prefrontal_Cortex", "V1_V2"]

# Directed meta-ROI pairs: first meta-ROI -> each other meta-ROI, then the reverse direction
meta_roi_comparisons = (
    [(meta_ROIs[0], other_ROI) for other_ROI in meta_ROIs[1:]]
    + [(other_ROI, meta_ROIs[0]) for other_ROI in meta_ROIs[1:]]
)

# Task-relevance conditions
relevance_type_comparisons = ["Relevant-non-target", "Irrelevant"]

# Stimulus presentation periods
stimulus_presentation_comparisons = ["on", "off"]

# Cross-validation splitter: 10 shuffled folds, stratified by class and grouped
# by the `groups` array passed at split time
group_stratified_CV = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=127)

In [None]:
# Stimulus type comparisons: every unordered pair of the unique stimulus types
# NOTE(review): the preview of all_raw_time_resolved_barycenter_res printed earlier in this
# notebook lists a `Stimulus` column but no `stimulus_type` column — confirm that this is
# the intended column / DataFrame before relying on this cell.
stimulus_types = all_raw_time_resolved_barycenter_res.stimulus_type.unique().tolist()
stimulus_type_comparisons = list(itertools.combinations(stimulus_types, 2))

In [None]:
# Pairwise stimulus-type classification, run separately for every relevance type,
# stimulus presentation period, SPI and directed meta-ROI pair.
#
# NOTE(review): `all_pyspi_res`, `n_jobs` and `classifier` are not defined in any cell
# visible above this one — they must be defined in the kernel before this cell is run.

# Columns of the final results table (also used when no result was produced)
classification_result_columns = ["SPI", "classifier", "meta_ROI_from", "meta_ROI_to",
                                 "relevance_type", "stimulus_presentation", "stimulus_combo",
                                 "accuracy", "accuracy_SD"]

# All comparisons list: one single-row DataFrame per classification problem
comparing_between_stimulus_types_classification_results_list = []

for relevance_type in relevance_type_comparisons:
    print("Relevance type:" + str(relevance_type))
    for stimulus_presentation in stimulus_presentation_comparisons:
        print("Stimulus presentation:" + str(stimulus_presentation))
        for SPI in all_pyspi_res.SPI.unique():
            # Look at each meta-ROI pair separately
            for meta_roi_comparison in meta_roi_comparisons:
                print("ROI Comparison:" + str(meta_roi_comparison))
                ROI_from, ROI_to = meta_roi_comparison

                # Subset to this ROI pair, relevance type and presentation period.
                # errors='ignore': drop a stale 'index' column if present, without
                # failing when the table has no such column.
                roi_pair_wise_dataset_for_classification = (all_pyspi_res.query("meta_ROI_from == @ROI_from & meta_ROI_to == @ROI_to & relevance_type == @relevance_type & stimulus_presentation == @stimulus_presentation")
                                                            .reset_index(drop=True)
                                                            .drop(columns=['index'], errors='ignore'))

                # Extract this SPI; copy so the imputation below writes to an
                # independent DataFrame rather than to a slice of the query result
                this_SPI_data = roi_pair_wise_dataset_for_classification.query("SPI == @SPI").copy()

                # Find overall number of rows; nothing to classify if there are none
                num_rows = this_SPI_data.shape[0]
                if num_rows == 0:
                    print(f"No data for {SPI} in {meta_roi_comparison}, skipping")
                    continue

                # Extract SPI values
                this_column_data = this_SPI_data["value"]

                # Proportion of NaN values in this column
                prop_NaN = this_column_data.isna().sum() / num_rows

                # Count of the most frequent value, and the standard deviation
                column_mode_max = this_column_data.value_counts().max()
                column_SD = this_column_data.std()

                # If there are:
                # - more than 10% NaN values;
                # - more than 90% of the values are the same; OR
                # - the standard deviation is less than 1*10**(-10)
                # then skip this SPI for this ROI pair
                if prop_NaN > 0.1 or column_mode_max / num_rows > 0.9 or column_SD < 1*10**(-10):
                    print(f"{SPI} has low SD: {column_SD}, and/or too many mode occurences: {column_mode_max} out of {num_rows}, and/or {100*prop_NaN}% NaN")
                    continue

                # Any remaining NaN (at most 10% of rows, boundary included) are
                # imputed with the column mean so that no NaN reaches the classifier
                if prop_NaN > 0:
                    print(f"Imputing column values for {SPI}")
                    this_SPI_data["value"] = this_column_data.fillna(this_column_data.mean())

                # Iterate over stimulus combos
                for this_combo in stimulus_type_comparisons:

                    # Subset data to the corresponding stimulus pair
                    final_dataset_for_classification_this_combo = this_SPI_data.query(f"stimulus_type in {this_combo}")

                    # Single feature column (2-D), 1-D labels and 1-D string group labels
                    X = final_dataset_for_classification_this_combo.value.to_numpy().reshape(-1, 1)
                    y = final_dataset_for_classification_this_combo.stimulus_type.to_numpy()
                    groups_flat = final_dataset_for_classification_this_combo.subject_ID.astype(str).to_numpy()

                    # Work on a fresh copy of the pipeline for every classification problem
                    this_iter_pipe = deepcopy(pipe)
                    this_classifier_res = cross_validate(this_iter_pipe, X, y, groups=groups_flat, cv=group_stratified_CV, scoring=scoring, n_jobs=n_jobs,
                                                         return_estimator=False, return_train_score=False)

                    # Mean and SD of the test accuracy across CV folds
                    this_SPI_combo_df = pd.DataFrame({"SPI": [SPI],
                            "classifier": [classifier],
                            "meta_ROI_from": [ROI_from],
                            "meta_ROI_to": [ROI_to],
                            "relevance_type": [relevance_type],
                            "stimulus_presentation": [stimulus_presentation],
                            "stimulus_combo": [this_combo],
                            "accuracy": [this_classifier_res['test_accuracy'].mean()],
                            "accuracy_SD": [this_classifier_res['test_accuracy'].std()]})

                    # Append to growing results list
                    comparing_between_stimulus_types_classification_results_list.append(this_SPI_combo_df)

# Combine all results; fall back to an empty table with the expected columns when
# every SPI was skipped (pd.concat raises on an empty list)
if comparing_between_stimulus_types_classification_results_list:
    comparing_between_stimulus_types_classification_results = pd.concat(comparing_between_stimulus_types_classification_results_list).reset_index(drop=True)
else:
    comparing_between_stimulus_types_classification_results = pd.DataFrame(columns=classification_result_columns)
