In [None]:
import pandas as pd

def _read_gz(directory, filename):
    """Read the gzip-compressed CSV `filename` located under `directory`.

    `directory` is joined to `filename` by plain string concatenation, so it
    must already end with a path separator.
    """
    return pd.read_csv(directory + filename, compression='gzip')


# Tables from the MIMIC-IV "hosp" directory.
# `admissions` is loaded here but not referenced by any of the cells that
# follow in this part of the notebook.
mimic_path = "/mnt/data/MIMIC/physionet.org/files/mimiciv/2.2/hosp/"
patients = _read_gz(mimic_path, "patients.csv.gz")
admissions = _read_gz(mimic_path, "admissions.csv.gz")

# MIMIC-CXR-JPG split assignment and CheXpert label tables. The directory is
# versioned 2.1.0 while the file names carry 2.0.0, exactly as written here.
mimic_cxr_path = "/mnt/data/MIMIC-CXR-JPG/physionet.org/files/mimic-cxr-jpg/2.1.0/"
split = _read_gz(mimic_cxr_path, "mimic-cxr-2.0.0-split.csv.gz")
label = _read_gz(mimic_cxr_path, "mimic-cxr-2.0.0-chexpert.csv.gz")

In [None]:
# Reduce the label table to 0/1 values: every -1 entry becomes 0, then every
# missing entry becomes 0. Both steps run over the whole frame, not only the
# finding columns.
# NOTE(review): -1 is presumably the CheXpert "uncertain" marker, so uncertain
# findings end up counted as negative -- confirm that this is intended.
label = label.replace({-1: 0}).fillna(0)

In [None]:
# Attach the label row to every split row that shares the same subject/study
# pair. The join type is the default inner join, spelled out here: rows without
# a partner on the other side are dropped.
mimic_cxr_df = split.merge(label, how='inner', on=['subject_id', 'study_id'])

# Row count after the join (displayed as the cell output).
len(mimic_cxr_df)

In [None]:
# Add the patient-level columns to every row of the labelled CXR table. This is
# an inner join on subject_id, so rows whose subject is absent from `patients`
# do not survive.
df = mimic_cxr_df.merge(patients, how='inner', on='subject_id')

# Row count after the join (displayed as the cell output).
len(df)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu

# Finding columns of `df` whose relationship with patient age is examined.
conditions = [
    'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
    'Fracture', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices',
]

# NOTE(review): `df` is built by joining the split table to labels and then to
# patients, so a subject presumably appears once per split row. If so, the same
# anchor_age is counted repeatedly and the tests below treat those rows as
# independent observations -- confirm whether de-duplication per subject or per
# study is wanted before interpreting the p-values.

# Descriptive statistics: age summary for each value of the finding column.
for condition in conditions:
    print(f"Condition: {condition}")
    print(df.groupby(condition)['anchor_age'].describe(), "\n")

# Visualize age distribution: one box plot per finding.
for condition in conditions:
    # An explicit figure/axes pair keeps each plot self-contained; closing the
    # figure afterwards avoids accumulating open figures across the loop.
    fig, ax = plt.subplots()
    sns.boxplot(x=condition, y='anchor_age', data=df, ax=ax)
    ax.set_title(f'Age Distribution by {condition}')
    plt.show()
    plt.close(fig)

# Statistical tests: compare ages of rows with the finding (== 1) against rows
# without it (== 0).
for condition in conditions:
    # Drop missing ages once so that both tests run on exactly the same
    # samples; previously only the t-test was told to omit NaNs.
    group1 = df.loc[df[condition] == 1, 'anchor_age'].dropna()
    group2 = df.loc[df[condition] == 0, 'anchor_age'].dropna()

    # Neither test is meaningful with an empty group, so report and move on
    # instead of printing NaNs or raising.
    if group1.empty or group2.empty:
        print(f'Skipping tests for age and {condition}: '
              f'need at least one observation in each group\n')
        continue

    # Student's t-test with scipy's default pooled-variance setting.
    t_stat, t_p_value = ttest_ind(group1, group2)
    print(f'T-Test for age and {condition}: t-statistic = {t_stat}, p-value = {t_p_value}')

    # Non-parametric alternative on the same two samples.
    u_stat, u_p_value = mannwhitneyu(group1, group2, alternative='two-sided')
    print(f'Mann-Whitney U Test for age and {condition}: u-statistic = {u_stat}, p-value = {u_p_value}\n')


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# `gender` is used purely as a categorical grouping key below, so this cell
# works for any encoding of the column.
# NOTE(review): MIMIC-IV documents patients.gender as 'M'/'F' strings rather
# than 0/1 -- confirm against the loaded table.

# Descriptive statistics: row counts for every finding-value / gender pair.
for condition in conditions:
    print(f"Condition: {condition}")
    # fill_value=0 shows a combination that never occurs as a count of 0
    # instead of NaN.
    print(df.groupby([condition, 'gender']).size().unstack(fill_value=0), "\n")

# Visualize gender distribution: one count plot per finding.
for condition in conditions:
    # An explicit figure/axes pair keeps each plot self-contained; closing the
    # figure afterwards avoids accumulating open figures across the loop.
    fig, ax = plt.subplots()
    sns.countplot(x='gender', hue=condition, data=df, ax=ax)
    ax.set_title(f'Gender Distribution by {condition}')
    plt.show()
    plt.close(fig)

# Chi-square tests of independence between gender and each finding.
for condition in conditions:
    contingency_table = pd.crosstab(df['gender'], df[condition])

    # With fewer than two observed levels on either axis there is nothing to
    # test (zero degrees of freedom, or an empty table), so skip rather than
    # print a meaningless statistic.
    if min(contingency_table.shape) < 2:
        print(f'Skipping Chi-Square Test for gender and {condition}: '
              f'fewer than two levels observed on one axis\n')
        continue

    # scipy applies Yates' continuity correction by default when the table has
    # one degree of freedom (2x2). Only the statistic and p-value are reported.
    chi2, p, _dof, _expected = chi2_contingency(contingency_table)
    print(f'Chi-Square Test for gender and {condition}: chi2 = {chi2}, p-value = {p}\n')
