In [34]:
import os
import numpy as np
import pandas as pd
import sys
import json
from glob import glob
import random 

# Add rpy2
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [22]:
%%R 

# Attach the plotting / data-wrangling packages without their startup banners,
# then make the cowplot theme the default for every ggplot drawn afterwards.
suppressPackageStartupMessages({
    for (pkg in c("cowplot", "see", "tidyverse")) {
        library(pkg, character.only = TRUE)
    }

    theme_set(theme_cowplot())
})

In [2]:
metadata_path = "/Users/abry4213/data/Cogitate_MEG/derivatives/additional_metadata"

# Columns kept for every participant, in output order
metadata_columns = ["participant_id", "sex", "age", "handedness", "height", "weight",
                    "primary_language", "secondary_language", "education", "colorblind"]


def _normalise_sex(value):
    """Map a free-text sex/gender entry to "M" or "F"; anything unrecognised becomes NaN.

    Unrecognised or missing entries are kept as NaN instead of being silently coded as "F".
    """
    if isinstance(value, str):
        return {"male": "M", "m": "M", "female": "F", "f": "F"}.get(value.strip().lower(), np.nan)
    return np.nan


def _to_float(values):
    """Convert a column of measurements to float.

    String entries may use a decimal comma ("1,75") or the placeholder "n/a" (-> NaN).
    Entries that are already numeric are passed through untouched; going through the
    `.str` accessor would turn them into NaN in a mixed column.
    Raises ValueError if a string entry is not a valid number after cleaning.
    """
    def _clean(value):
        if isinstance(value, str):
            value = value.strip()
            return np.nan if value == "n/a" else value.replace(",", ".")
        return value

    return values.map(_clean).astype(float)


# Find folders matching the pattern "sub-*" in the metadata directory
subjects = sorted(glob(os.path.join(metadata_path, "sub-*")))

# Collect one single-row DataFrame per subject
metadata_list = []

# Loop through each subject folder
for subject in subjects:
    # Get the subject ID (folder name, e.g. "sub-XXXX")
    subject_id = os.path.basename(subject)
    # Remove the sub- prefix
    sub_base = subject_id.split("-")[1]
    # Load the metadata file
    with open(f"{metadata_path}/{subject_id}/METADATA/{sub_base}_demographics.json", "r") as f:
        subject_metadata = pd.DataFrame(json.load(f), index=[0]).assign(participant_id=subject_id)

    # Files that use the "gender" key also use short names for the other fields;
    # harmonise them with the column names used in the rest of the table.
    if "gender" in subject_metadata.columns:
        subject_metadata = (subject_metadata.rename(columns={"gender": "sex",
                                                            "primary": "primary_language",
                                                            "secondary": "secondary_language",
                                                            "hand": "handedness",
                                                            "eye": "eyedominance"})
                            .assign(handedness = lambda x: x.handedness.str.lower(),
                                    sex = lambda x: x.sex.map(_normalise_sex)))

    # Just select desired columns; reindex fills ANY number of missing columns with NaN
    subject_metadata = subject_metadata.reindex(columns=metadata_columns)

    metadata_list.append(subject_metadata)

if not metadata_list:
    # pd.concat would raise a less helpful "No objects to concatenate" here
    raise ValueError(f"No 'sub-*' folders found in {metadata_path}")

metadata = pd.concat(metadata_list, ignore_index=True)

# Convert height and weight to float: 'n/a' becomes NaN and decimal commas become periods
metadata['weight'] = _to_float(metadata['weight'])
metadata['height'] = _to_float(metadata['height'])

In [19]:
# Save the full metadata table to a CSV file
metadata.to_csv("/Users/abry4213/data/Cogitate_MEG/all_participant_metadata.csv", index=False)

# Load in subjects for whom we have ALL data: a header-less file with one subject ID per line.
# dtype=str keeps the IDs as text, so numeric-looking IDs keep leading zeros and can be
# concatenated with the "sub-" prefix below.
complete_subject_ids = pd.read_csv(
    "/Users/abry4213/github/MEG_functional_connectivity/subject_list_Cogitate_MEG_with_all_data.txt",
    header=None,
    names=["participant_id"],
    dtype=str,
).participant_id.str.strip().dropna()

# Prepend "sub-" so the IDs match metadata.participant_id
subjects_with_all_data = ["sub-" + s for s in complete_subject_ids.tolist()]

# Filter metadata (copy so the filtered table is independent of `metadata`)
metadata_filtered = metadata[metadata.participant_id.isin(subjects_with_all_data)].copy()
metadata_filtered.to_csv("/Users/abry4213/data/Cogitate_MEG/all_participant_metadata_filtered.csv", index=False)

In [28]:
# Number of participants per sex, as a two-column table (sex, count)
metadata_filtered.groupby("sex").size().reset_index(name="count")

Unnamed: 0,sex,count
0,F,54
1,M,40


In [33]:
# Mean age within each sex, as a two-column table (sex, average_age)
metadata_filtered.groupby("sex")["age"].mean().rename("average_age").reset_index()

Unnamed: 0,sex,average_age
0,F,23.055556
1,M,22.275


In [32]:
%%R -i metadata_filtered
# View age and sex distributions: half-violin + jittered points per sex,
# with a black crossbar marking the mean age of each group.

# One palette shared by the fill and colour scales
sex_palette <- c("F"="#D091B5", "M"="#55B3E2")

age_by_sex_plot <- metadata_filtered %>% 
    ggplot(data=., mapping=aes(x=sex, y=age, fill=sex, color=sex)) +
    geom_violinhalf(scale="width", color="black", position = position_dodge(width = 2), width=1)  +
    geom_point(position = position_jitter(width = 0.05, height=0),
                            size = 2.75, alpha=0.8, stroke=0) +
    stat_summary(color="black", fun="mean", geom="crossbar", width=0.2, linewidth=0.3, 
            show.legend=FALSE, position = position_dodge(width=1.4)) +
    ylab("Age (Years)") +
    xlab("Sex") +
    # N is computed from the data so the title cannot go stale when the subject list changes
    ggtitle(paste0("Age and Sex Distribution in\nCogitate MEG Participants (N=",
                   nrow(metadata_filtered), ")")) +
    scale_fill_manual(values=sex_palette) +
    scale_color_manual(values=sex_palette) +
    theme(legend.position="none",
          plot.title = element_text(hjust=0.5))

# Draw the figure in the notebook output
print(age_by_sex_plot)

# Make sure the output folder exists, then save this plot explicitly
# (rather than relying on ggplot2's implicit last_plot()).
dir.create("plots", showWarnings=FALSE)
ggsave("plots/age_by_sex.svg", plot=age_by_sex_plot, width=4, height=4, units="in", dpi=300)



In [57]:
# Simulate example time-series for onset and offset across four brain regions
M = 4 # 4 time series
T = 35 # 35 samples per process
meta_ROI_names = ["Category_Selective", "Visual", "Parietal_Integration", "Prefrontal_Cortex"]


def simulate_MTS(seed):
    """Draw an (M, T) array of standard-normal samples and its wide-format DataFrame.

    A dedicated NumPy generator is seeded with `seed`, so the result is reproducible
    and does not depend on (or alter) NumPy's global random state. Seeding Python's
    `random` module has no effect on NumPy's random draws.

    Returns
    -------
    (ndarray, DataFrame)
        The raw (M, T) array, and a T-row DataFrame with one column per meta-ROI
        plus a 1-based `timepoint` column.
    """
    rng = np.random.default_rng(seed)
    signal = rng.standard_normal((M, T))
    signal_df = (pd.DataFrame(signal.T, columns=meta_ROI_names)
                 .assign(timepoint = np.arange(1, T+1)))
    return signal, signal_df


simulated_MTS_onset, simulated_MTS_df_onset = simulate_MTS(127)
simulated_MTS_offset, simulated_MTS_df_offset = simulate_MTS(27)

In [58]:
%%R -i simulated_MTS_df_onset,simulated_MTS_df_offset

# Plot the simulated time series

# Draw one simulated multivariate time series as stacked, axis-free line traces
# (one facet row per meta-ROI), show it in the notebook, and save it to `output_file`.
#   simulated_MTS_df: wide data frame with a `timepoint` column plus one column per meta-ROI
#   output_file:      path of the image file to write
# Returns the ggplot object invisibly.
plot_simulated_MTS <- function(simulated_MTS_df, output_file) {
    MTS_plot <- simulated_MTS_df %>%
        pivot_longer(cols=c(-timepoint), names_to="meta_ROI", values_to="signal") %>%
        ggplot(data=., mapping=aes(x=timepoint, y=signal, color=meta_ROI)) +
        scale_color_viridis_d() +
        geom_line() +
        facet_grid(meta_ROI ~ .) +
        theme_void() +
        # Negative panel spacing lets neighbouring traces overlap slightly
        theme(legend.position = "none",
                strip.text = element_blank(),
                panel.spacing = unit(-0.75, "lines"))

    # Inside a function the plot is not auto-printed, so print it explicitly
    print(MTS_plot)
    # Save this plot explicitly rather than relying on ggplot2's implicit last_plot()
    ggsave(output_file, plot=MTS_plot, width=3, height=1.5, units="in", dpi=300)
    invisible(MTS_plot)
}

# Make sure the output folder exists before saving
dir.create("plots", showWarnings=FALSE)

plot_simulated_MTS(simulated_MTS_df_onset, "plots/simulated_time_series_onset.svg")
plot_simulated_MTS(simulated_MTS_df_offset, "plots/simulated_time_series_offset.svg")