In [2]:
import pandas as pd
data = pd.read_csv("./data/participant_demo_clinical.csv")

In [7]:
def create_summary_table(df):
    from scipy import stats
    import pandas as pd

    # Preprocessing
    df.columns = df.columns.str.strip()  # Strip any whitespace from the column names

    # Selecting relevant columns
    selected_columns = ['GROUP', '2.AGE', '1. SEX'] + [col for col in df.columns if col.startswith(('LSAS', 'MOCI', 'BFNE', 'PSWQ', 'Handedness', 'FCV-19S'))]
    df_selected = df[selected_columns]

    # Renaming columns for better readability
    df_selected.rename(columns={'2.AGE': 'Age', '1. SEX': 'Sex'}, inplace=True)

    # Splitting data into experimental (EXP) and control (HC) groups
    exp_group = df_selected[df_selected['GROUP'] == 'EXP']
    hc_group = df_selected[df_selected['GROUP'] == 'HC']

    # Define a function to calculate mean (SD) and p-value
    def mean_sd(series):
        return f"{series.mean():.2f} ({series.std():.2f})"

    def calculate_pvalue(exp_series, hc_series):
        return stats.ttest_ind(exp_series, hc_series, nan_policy='omit').pvalue

    # Create a summary table
    summary_table = pd.DataFrame(columns=['Measure', 'EXP', 'HC', 'p-value'])

    for column in df_selected.columns[2:]:  # Exclude 'GROUP', 'Age', 'Sex' columns
        exp_mean_sd = mean_sd(exp_group[column])
        hc_mean_sd = mean_sd(hc_group[column])
        p_value = calculate_pvalue(exp_group[column], hc_group[column])
        
        summary_table = pd.concat([summary_table, pd.DataFrame([{'Measure': column, 'EXP': exp_mean_sd, 'HC': hc_mean_sd, 'p-value': p_value}])], ignore_index=True)

    # Adding Age and Sex columns separately
    summary_table = pd.concat([summary_table, pd.DataFrame([{
        'Measure': 'Age',
        'EXP': mean_sd(exp_group['Age']),
        'HC': mean_sd(hc_group['Age']),
        'p-value': calculate_pvalue(exp_group['Age'], hc_group['Age'])
    }])], ignore_index=True)

    # Sex should be handled as a percentage of females
    exp_female_pct = (exp_group['Sex'].value_counts(normalize=True).get(2, 0)) * 100
    hc_female_pct = (hc_group['Sex'].value_counts(normalize=True).get(2, 0)) * 100

    summary_table = pd.concat([summary_table, pd.DataFrame([{
        'Measure': 'Sex (Female %)',
        'EXP': f"{exp_female_pct:.2f}%",
        'HC': f"{hc_female_pct:.2f}%",
        'p-value': calculate_pvalue(exp_group['Sex'] == 2, hc_group['Sex'] == 2)
    }])], ignore_index=True)

    return summary_table

# Using the function to create the summary table
summary_df = create_summary_table(data)
summary_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.rename(columns={'2.AGE': 'Age', '1. SEX': 'Sex'}, inplace=True)


Unnamed: 0,Measure,EXP,HC,p-value
0,Sex,1.43 (0.50),1.51 (0.50),0.4477457
1,LSAS_performance,40.39 (12.24),12.33 (8.80),6.907013e-26
2,LSAS_social_interaction,35.17 (12.71),10.49 (7.96),8.725463000000001e-23
3,LSAS,75.56 (24.27),22.83 (16.11),2.373603e-25
4,MOCI,17.37 (5.15),21.04 (8.04),0.00999072
5,MOCI_checking,4.59 (2.36),6.39 (2.53),0.0003236329
6,MOCI_cleaning,8.12 (2.33),8.30 (3.84),0.7834187
7,MOCI_doubting,2.98 (2.06),4.04 (1.72),0.004180667
8,MOCI_slowness,3.93 (1.71),5.42 (2.32),0.0005047485
9,BFNE,50.56 (6.55),32.84 (8.20),4.326774e-21


In [8]:
summary_df.to_csv("./data/summary.csv")