In [1]:
import os
import numpy as np
import nibabel as nib
import pandas as pd
import sys
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedGroupKFold, cross_validate, StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from copy import deepcopy
import itertools

# add path to classification analysis functions
from mixed_sigmoid_normalisation import MixedSigmoidScaler

In [5]:
# Read in SPI directionality info
SPI_directionality_info = pd.read_csv("../feature_extraction/pyspi_SPI_info.csv")

# Load data paths for batch 1
pyspi_res_path_batch1 = "/Users/abry4213/data/Cogitate_Batch1/MEG_Data/derivatives/time_series_features"
pyspi_res_path_averaged_batch1 = f"{pyspi_res_path_batch1}/averaged_epochs"
pyspi_res_path_individual_batch1 = f"{pyspi_res_path_batch1}/individual_epochs"

classification_res_path_averaged_batch1 = "/Users/abry4213/data/Cogitate_Batch1/MEG_Data/derivatives/classification_results/across_participants"
classification_res_path_individual_batch1 = "/Users/abry4213/data/Cogitate_Batch1/MEG_Data/derivatives/classification_results/within_participants"

In [15]:
pyspi_res.head()

Unnamed: 0,index,SPI,meta_ROI_from,meta_ROI_to,value,stimulus_type,relevance_type,duration,subject_ID,stimulus_presentation
0,1,cov_EmpiricalCovariance,GNWT,Category_Selective,-0.111012,False,Irrelevant,1000,CA109,on
1,2,cov_EmpiricalCovariance,IIT,Category_Selective,0.225986,False,Irrelevant,1000,CA109,on
2,3,cov_EmpiricalCovariance,Category_Selective,GNWT,-0.111012,False,Irrelevant,1000,CA109,on
3,5,cov_EmpiricalCovariance,IIT,GNWT,-0.206013,False,Irrelevant,1000,CA109,on
4,6,cov_EmpiricalCovariance,Category_Selective,IIT,0.225986,False,Irrelevant,1000,CA109,on


In [23]:
# Batch 1 -- individual results

# Load in pyspi results
all_pyspi_res_list_batch1 = []
for pyspi_res_file in os.listdir(pyspi_res_path_averaged_batch1):
    pyspi_res = pd.read_csv(f"{pyspi_res_path_averaged_batch1}/{pyspi_res_file}")
    # Reset index
    pyspi_res.reset_index(inplace=True, drop=True)
    pyspi_res['stimulus_type'] = pyspi_res['stimulus_type'].replace(False, 'false').replace('False', 'false')
    pyspi_res['relevance_type'] = pyspi_res['relevance_type'].replace("Relevant non-target", "Relevant-non-target")
    # Rename stimulus to stimulus_presentation if it is present
    if 'stimulus' in pyspi_res.columns:
        if 'stimulus_presentation' in pyspi_res.columns:
            pyspi_res.drop(columns=['stimulus'], inplace=True)
        else:
            pyspi_res = pyspi_res.rename(columns={'stimulus': 'stimulus_presentation'})
    
    all_pyspi_res_list_batch1.append(pyspi_res)
    
all_pyspi_res_batch1 = pd.concat(all_pyspi_res_list_batch1)


In [24]:
# meta-ROI comparisons: GWNT --> CS, CS --> GNWT, IIT --> CS, CS --> IIT
meta_roi_comparisons = [("GNWT", "Category_Selective"), ("Category_Selective", "GNWT"), ("IIT", "Category_Selective"), ("Category_Selective", "IIT")]

# Relevance type comparisons
relevance_type_comparisons = ["Relevant-non-target", "Irrelevant"]

# Stimulus presentation comparisons
stimulus_presentation_comparisons = ["on", "off"]

# Stimulus type comparisons
stimulus_types = all_pyspi_res_batch1.stimulus_type.unique().tolist()
stimulus_type_comparisons = list(itertools.combinations(stimulus_types, 2))

In [26]:
# All comparisons list
comparing_between_stimulus_types_classification_results_list = []
meta_roi_comparison = meta_roi_comparisons[0]
print("ROI Comparison:" + str(meta_roi_comparison))
ROI_from, ROI_to = meta_roi_comparison
relevance_type = relevance_type_comparisons[0]
stimulus_presentation = "on"
print("Stimulus presentation:" + str(stimulus_presentation))
# Finally, we get to the final dataset
final_dataset_for_classification_batch1 = all_pyspi_res_batch1.query("meta_ROI_from == @ROI_from & meta_ROI_to == @ROI_to & relevance_type == @relevance_type & stimulus_presentation == @stimulus_presentation").reset_index(drop=True).drop(columns=['index'])


ROI Comparison:('GNWT', 'Category_Selective')
Stimulus presentation:on


In [27]:
SPI = "cov_EmpiricalCovariance" 

# Extract this SPI
this_SPI_data = final_dataset_for_classification_batch1.query(f"SPI == '{SPI}'")

# Find overall number of rows
num_rows = this_SPI_data.shape[0]

# Extract SPI values
this_column_data = this_SPI_data["value"]

# Find number of NaN in this column 
num_NaN = this_column_data.isna().sum()
prop_NaN = num_NaN / num_rows

# Find mode and SD
column_mode_max = this_column_data.value_counts().max()
column_SD = this_column_data.std()

# If 0% < num_NaN < 10%, impute by the mean of each component
if 0 < prop_NaN < 0.1:
    values_imputed = (this_column_data
                        .transform(lambda x: x.fillna(x.mean())))

    this_column_data = values_imputed
    print(f"Imputing column values for {SPI}")
    this_SPI_data["value"] = this_column_data

# If there are: 
# - more than 10% NaN values;
# - more than 90% of the values are the same; OR
# - the standard deviation is less than 1*10**(-10)
# then remove the column
if prop_NaN > 0.1 or column_mode_max / num_rows > 0.9 or column_SD < 1*10**(-10):
    print(f"{SPI} has low SD: {column_SD}, and/or too many mode occurences: {column_mode_max} out of {num_rows}, and/or {100*prop_NaN}% NaN")

# Start an empty list for the classification results
SPI_combo_res_list = []


In [33]:
this_combo = ('false', 'face')
n_jobs=1

# Extract just GNWT/CS data first
final_dataset_for_classification_this_combo = this_SPI_data.query(f"stimulus_type in {this_combo}")

# Define classification model
model = LogisticRegression(penalty='l1', C=1, solver='liblinear', random_state=127)
pipe = Pipeline([('scaler', MixedSigmoidScaler(unit_variance=True)), 
                        ('model', model)])

# Fit classifier
X = final_dataset_for_classification_this_combo.value.to_numpy().reshape(-1, 1)
y = final_dataset_for_classification_this_combo.stimulus_type.to_numpy().reshape(-1, 1)
groups = final_dataset_for_classification_this_combo.subject_ID.to_numpy().reshape(-1, 1)

group_stratified_CV = StratifiedGroupKFold(n_splits = 10, shuffle = True, random_state=127)

this_classifier_res = cross_validate(pipe, X, y, groups=groups, cv=group_stratified_CV, scoring="accuracy", n_jobs=n_jobs, 
                                            return_estimator=False, return_train_score=False)["test_score"].mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
