In [4]:
# Import necessary libraries and functions
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests
from sklearn.decomposition import PCA
from scipy.stats import entropy

In [3]:
# Dataset creation for post-hoc comparisons 

# Load the dataset
file_path = '/home/cerna3/neuroconn/data analyses/final_combined_data_compressed.xlsx'
data = pd.read_excel(file_path)

# Function to parse connectivity values, handling within/between and EC/EO
def parse_connectivity(data, key, suffix, is_between=False):
    records = []
    for idx, row in data.iterrows():
        group = row['Group']
        mode = row['Mode']
        conn_dict = eval(row[key])

        for networks, states in conn_dict.items():
            if is_between:
                networks = ','.join(networks)  # Combine network pair into single string
            for state, value in states.items():
                records.append({
                    'ID': row['ID'],
                    'Group': group,
                    'Mode': mode,
                    'Network' if not is_between else 'Network_Pair': networks,
                    'State': state,
                    'Value': value
                })
    return pd.DataFrame(records)

# Function to parse transition magnitudes
def parse_transition_magnitudes(data, key, suffix, is_between=False):
    records = []
    for idx, row in data.iterrows():
        group = row['Group']
        mode = row['Mode']
        tran_mag_dict = eval(row[key])
        for networks, states in tran_mag_dict.items():
            if is_between:
                networks = ','.join(networks) 
            for state, value in states.items():
                if isinstance(value, list):
                    value = value[0] if value else None
                records.append({
                    'ID': row['ID'],
                    'Group': group,
                    'Mode': mode,
                    'Network' if not is_between else 'Network_Pair': networks,
                    'State': state,
                    'Value': value
                })
    
    return pd.DataFrame(records)

def add_temporal_variables(data, temporal_vars, var_name):
    records = []
    for idx, row in data.iterrows():
        mode = row['Mode']
        var_values = temporal_vars[idx]
        
        if mode == 'EC' and len(var_values) >= 7:
            for state, value in enumerate(var_values[:7]):
                records.append({
                    'ID': row['ID'],
                    'Group': row['Group'],
                    'Mode': mode,
                    'State': state,
                    f'{var_name}': value
                })
        elif mode == 'EO' and len(var_values) >= 5:
            for state, value in enumerate(var_values[:5]):
                records.append({
                    'ID': row['ID'],
                    'Group': row['Group'],
                    'Mode': mode,
                    'State': state,
                    f'{var_name}': value
                })
    
    return pd.DataFrame(records)

def add_transition_probabilities(data, transition_probs):
    records = []
    
    for idx, row in data.iterrows():
        mode = row['Mode']
        id = row['ID']
        group = row['Group']
        trans_probs = transition_probs[idx]
        
        if not isinstance(trans_probs, list) or not trans_probs:
            print(f"Warning: Unexpected format for transition probabilities for ID {id}, Mode {mode}")
            continue
        
        # Extract the inner list
        if len(trans_probs) == 1 and isinstance(trans_probs[0], list):
            trans_probs = trans_probs[0]
        
        matrix_size = 7 if mode == 'EC' else 5
        if len(trans_probs) != matrix_size * matrix_size:
            print(f"Warning: Insufficient data for ID {id}, Mode {mode}. Length: {len(trans_probs)}")
            continue
        
        # Reshape into matrix
        trans_probs_matrix = np.array(trans_probs).reshape(matrix_size, matrix_size)
        
        # Perform PCA
        pca = PCA(n_components=1)
        pca_result = pca.fit_transform(trans_probs_matrix)
        
        # Calculate entropy for each row (state)
        entropies = [entropy(row) for row in trans_probs_matrix]
        
        for state in range(matrix_size):
            records.append({
                'ID': id,
                'Group': group,
                'Mode': mode,
                'State': state,
                'Transition_Probabilities_PCA': pca_result[state][0],
                'Transition_Probabilities_Entropy': entropies[state]
            })
    
    result_df = pd.DataFrame(records)
    if result_df.empty:
        print("Warning: No transition probabilities were processed.")
    else:
        print(f"Processed {len(result_df)} transition probability records.")
    return result_df

# --- Data extraction and parsing ---
within_conn_mean_df = parse_connectivity(data, 'Within_Network_Conn_Mean', '_w_mean')
within_tran_mag_df = parse_transition_magnitudes(data, 'Within_Network_Transition_Magnitudes', '_w_tran_mag')
between_conn_mean_df = parse_connectivity(data, 'Between_Network_Conn_Mean', '_b_mean', is_between=True)
between_tran_mag_df = parse_transition_magnitudes(data, 'Between_Network_Transition_Magnitudes', '_b_tran_mag', is_between=True)

# --- Temporal variable extraction and processing ---
def format_temporal_var(column, is_transition_prob=False):
    def format_single_value(x):
        if is_transition_prob:
            # Remove all brackets and split by spaces
            values = x.replace('[', '').replace(']', '').split()
            return [float(val) for val in values if val.replace('.', '').isdigit()]
        else:
            # For other temporal variables, keep the existing logic
            values = x.strip('[]').split()
            return [float(val) for val in values if val.replace('.', '').isdigit()]
    
    return column.apply(format_single_value)

# Modify the temporal_vars extraction:
temporal_vars = {
    'Transition_Probabilities': format_temporal_var(data['Transition_Probabilities'], is_transition_prob=True).tolist(),
    'Mean_Lifetime': format_temporal_var(data['Mean_Lifetime']).tolist(),
    'Fractional_Occupancy': format_temporal_var(data['Fractional_Occupancy']).tolist(),
    'Mean_Interval_Length': format_temporal_var(data['Mean_Interval_Length']).tolist(),
}

# Create temporal dataframes
mean_lifetime_df = add_temporal_variables(data, temporal_vars['Mean_Lifetime'], 'Mean_Lifetime')
fractional_occupancy_df = add_temporal_variables(data, temporal_vars['Fractional_Occupancy'], 'Fractional_Occupancy')
mean_interval_length_df = add_temporal_variables(data, temporal_vars['Mean_Interval_Length'], 'Mean_Interval_Length')
transition_probabilities_df = add_transition_probabilities(data, temporal_vars['Transition_Probabilities'])

# Combine all temporal dataframes
temporal_df = mean_lifetime_df.merge(fractional_occupancy_df, on=['ID', 'Group', 'Mode', 'State'], how='outer')
temporal_df = temporal_df.merge(mean_interval_length_df, on=['ID', 'Group', 'Mode', 'State'], how='outer')
temporal_df = temporal_df.merge(transition_probabilities_df, on=['ID', 'Group', 'Mode', 'State'], how='outer')

def remove_temporal_duplicates(df):
    temporal_vars = ['Mean_Lifetime', 'Fractional_Occupancy', 'Mean_Interval_Length', 
                     'Transition_Probabilities_PCA', 'Transition_Probabilities_Entropy']
    
    for var in temporal_vars:
        if var not in df.columns:
            print(f"Column {var} not found in DataFrame. Skipping...")
            continue
        
        # Sort the dataframe by ID, Group, Mode, State, and the temporal variable
        df = df.sort_values(['ID', 'Group', 'Mode', 'State', var])
        
        # Remove duplicates, keeping the first occurrence
        df = df.drop_duplicates(subset=['ID', 'Group', 'Mode', 'State', var], keep='first')
    
    return df

# --- Pivot spatial data to wide format for easier analysis ---
def pivot_data(df, suffix):
    if suffix in ['_w_mean', '_w_tran_mag']:
        # For within-network data
        pivoted = df.pivot_table(
            index=['ID', 'Group', 'Mode', 'State'],
            columns='Network',
            values='Value'
        ).reset_index()
        
        # Rename columns
        pivoted.columns.name = None
        pivoted.rename(columns={col: f"{col}{suffix}" for col in pivoted.columns if col not in ['ID', 'Group', 'Mode', 'State']}, inplace=True)
    else:
        # For between-network data
        pivoted = df.pivot_table(
            index=['ID', 'Group', 'Mode', 'State'],
            columns='Network_Pair',
            values='Value'
        ).reset_index()
        
        # Rename columns, replacing comma with underscore
        pivoted.columns.name = None
        pivoted.rename(columns={col: f"{col.replace(', ', '_')}{suffix}" for col in pivoted.columns if col not in ['ID', 'Group', 'Mode', 'State']}, inplace=True)
    
    return pivoted

# --- Pivot spatial variables ---
within_conn_mean_pivot = pivot_data(within_conn_mean_df, '_w_mean')
within_tran_mag_pivot = pivot_data(within_tran_mag_df, '_w_tran_mag')
between_conn_mean_pivot = pivot_data(between_conn_mean_df, '_b_mean')
between_tran_mag_pivot = pivot_data(between_tran_mag_df, '_b_tran_mag')

# Merge spatial dataframes
spatial_df = (
    within_conn_mean_pivot
    .merge(within_tran_mag_pivot, on=['ID', 'Group', 'Mode', 'State'])
    .merge(between_conn_mean_pivot, on=['ID', 'Group', 'Mode', 'State'])
    .merge(between_tran_mag_pivot, on=['ID', 'Group', 'Mode', 'State'])
)

# Load the original dataset again (if needed)
original_data_path = '/home/cerna3/neuroconn/data analyses/final_combined_data_compressed.xlsx'
original_data = pd.read_excel(original_data_path)

# Extract final_score_miniBEST, Practice, and Age from the original dataset
original_variables = original_data[['ID', 'final_score_miniBEST', 'Practice', 'Age']]

# Merge spatial and temporal dataframes
combined_df = spatial_df.merge(temporal_df, on=['ID', 'Group', 'Mode', 'State'], how='outer')

# Merge final_score_miniBEST, Practice, and Age into the new dataset
combined_df_with_additional_info = pd.merge(combined_df, original_variables, on='ID', how='left')

# Remove duplicates from temporal variables
combined_df_with_additional_info = remove_temporal_duplicates(combined_df_with_additional_info)

# --- Save and display ---
output_file_path = '/home/cerna3/neuroconn/data analyses/combined_network_data.xlsx'
combined_df_with_additional_info.to_excel(output_file_path, index=False)

print(combined_df_with_additional_info.info())

Processed 540 transition probability records.
<class 'pandas.core.frame.DataFrame'>
Index: 540 entries, 0 to 1078
Data columns (total 68 columns):
 #   Column                                                                       Non-Null Count  Dtype  
---  ------                                                                       --------------  -----  
 0   ID                                                                           540 non-null    int64  
 1   Group                                                                        540 non-null    object 
 2   Mode                                                                         540 non-null    object 
 3   State                                                                        540 non-null    int64  
 4   Default_w_mean                                                               540 non-null    float64
 5   DorsalAttention_w_mean                                                       540 non-null    float64
 6   

In [4]:
# Univariate Pairwise Comparisons: Mann-Whitney U 

# Load the data from the provided Excel file
file_path = '/home/cerna3/neuroconn/data analyses/combined_network_data.xlsx'
data = pd.read_excel(file_path)

# Remove duplicates based on ID and Group
data = data.drop_duplicates(subset=['ID', 'Group'])

# Aggregate connectivity variables from the original dataset
within_network_columns = [col for col in data.columns if '_w_mean' in col]
within_network_tran_mag_columns = [col for col in data.columns if '_w_tran_mag' in col]
between_network_columns = [col for col in data.columns if '_b_mean' in col]
between_network_tran_mag_columns = [col for col in data.columns if '_b_tran_mag' in col]

# Include temporal variables
temporal_columns = ['Mean_Lifetime', 'Fractional_Occupancy', 'Mean_Interval_Length', 'Transition_Probabilities_PCA', 'Transition_Probabilities_Entropy']

connectivity_columns = within_network_columns + within_network_tran_mag_columns + between_network_columns + between_network_tran_mag_columns + temporal_columns

# Filter the data for YACs and OACs groups
age_comparison_data = data[data['Group'].isin(['YACs', 'OACs'])]

# Filter the data for OACs and TCOAs groups
practice_comparison_data = data[data['Group'].isin(['OACs', 'TCOAs'])]

def perform_mann_whitney_comparisons(data, group_column, connectivity_columns):
    results = []
    for column in connectivity_columns:
        levels = data[group_column].unique()
        comparisons = [(i, j) for i in levels for j in levels if i < j]
        for i, j in comparisons:
            group_i = data[data[group_column] == i][column].dropna()
            group_j = data[data[group_column] == j][column].dropna()
            statistic, p_value = stats.mannwhitneyu(group_i, group_j, alternative='two-sided')
            mean_i, mean_j = group_i.mean(), group_j.mean()
            median_i, median_j = group_i.median(), group_j.median()

            # Calculate mean and median differences
            mean_diff = mean_i - mean_j
            median_diff = median_i - median_j

            # Calculate Rank Biserial Correlation
            n1, n2 = len(group_i), len(group_j)
            rank_biserial_correlation = 2 * (statistic / (n1 * n2)) - 1

            results.append([i, j, column, mean_i, mean_j, mean_diff, median_i, median_j, median_diff, statistic, p_value, rank_biserial_correlation])

    results_df = pd.DataFrame(results, columns=['group1', 'group2', 'variable', 'mean1', 'mean2', 'mean_diff',
                                                'median1', 'median2', 'median_diff', 'U-statistic', 'p-value',
                                                'Rank Biserial Correlation (r)'])

    # Apply multiple testing corrections
    _, pvals_fdr, _, _ = multipletests(results_df['p-value'], method='fdr_bh')

    # Add FDR-adjusted p-value column to results DataFrame
    results_df.insert(results_df.columns.get_loc('p-value') + 1, 'FDR-adjusted p-value', pvals_fdr)

    return results_df

# Perform Mann-Whitney U comparisons for age (YACs vs OACs)
age_results = perform_mann_whitney_comparisons(age_comparison_data, 'Group', connectivity_columns)

# Perform Mann-Whitney U comparisons for practice (OACs vs TCOAs)
practice_results = perform_mann_whitney_comparisons(practice_comparison_data, 'Group', connectivity_columns)

# Save the results to an Excel file with different tabs
with pd.ExcelWriter('mann_whitney_results.xlsx') as writer:
    age_results.to_excel(writer, sheet_name='age_comparison_results', index=False)
    practice_results.to_excel(writer, sheet_name='practice_comparison_results', index=False)

print("Mann-Whitney U test results have been saved to 'mann_whitney_results.xlsx'")

Mann-Whitney U test results have been saved to 'mann_whitney_results.xlsx'
