In [5]:
import numpy as np
import pandas as pd
import os

# Resolve the shared input directory four levels above the current working
# directory. The relative path is assembled from os.pardir components rather
# than a hard-coded '..\\..\\' string so it is also valid on POSIX systems,
# where a backslash is an ordinary filename character and not a separator.
fpath_input = os.path.abspath(
    os.path.join(os.pardir, os.pardir, os.pardir, os.pardir, 'input')
)
print(fpath_input)

def get_feats(mode='TRAIN', input_dir=None):
    """
    Load and merge all feature tables for the specified mode (TRAIN or TEST).

    The quantitative metadata is the base table; the categorical metadata and
    the functional connectome matrices are left-joined onto it on
    'participant_id'. For TRAIN the training solutions are joined as well.

    Parameters
    ----------
    mode : str
        Either 'TRAIN' or 'TEST'. Selects the sub-directory and file prefix.
    input_dir : str, optional
        Directory that contains the 'widsdatathon2025' folder. Defaults to the
        notebook-level ``fpath_input`` when omitted.

    Returns
    -------
    pandas.DataFrame
        The merged feature table.

    Raises
    ------
    ValueError
        If ``mode`` is not 'TRAIN' or 'TEST'.
    """
    valid_modes = ('TRAIN', 'TEST')
    # Fail early with a clear message instead of a FileNotFoundError on a
    # path built from an unsupported mode.
    if mode not in valid_modes:
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    base_dir = fpath_input if input_dir is None else input_dir
    data_dir = f"{base_dir}/widsdatathon2025/{mode}"

    # Load quantitative metadata (base table for all joins)
    feats = pd.read_excel(f"{data_dir}/{mode}_QUANTITATIVE_METADATA.xlsx")

    # The categorical file is named differently in the two splits
    cate_suffix = 'CATEGORICAL_METADATA' if mode == 'TRAIN' else 'CATEGORICAL'
    cate = pd.read_excel(f"{data_dir}/{mode}_{cate_suffix}.xlsx")

    # Merge quantitative and categorical data
    feats = pd.merge(feats, cate, on='participant_id', how='left')

    # Load and merge functional connectome matrices
    func = pd.read_csv(f"{data_dir}/{mode}_FUNCTIONAL_CONNECTOME_MATRICES.csv")
    feats = pd.merge(feats, func, on='participant_id', how='left')

    # Load training solutions (only for TRAIN mode)
    if mode == 'TRAIN':
        solution = pd.read_excel(f"{data_dir}/TRAINING_SOLUTIONS.xlsx")
        feats = pd.merge(feats, solution, on='participant_id', how='left')

    return feats

# Read both splits up front, TRAIN first and then TEST
print("Loading data...")
train, test = (get_feats(mode=split) for split in ('TRAIN', 'TEST'))

# Preview the top rows of the training frame as the cell output
train.head()

c:\Users\alan.mcdonagh\OneDrive - Milliman Inc\Projects\51. WiDS Datathon 2025\input
Loading data...


Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,...,-0.058396,-0.041544,0.142806,-0.006377,0.108005,0.148327,0.09323,-0.004984,1,1
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,...,-0.025624,-0.031863,0.162011,0.067439,0.017155,0.088893,0.064094,0.194381,1,0
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,...,0.010771,-0.044341,0.128386,0.047282,0.087678,0.146221,-0.009425,0.03515,1,0
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,...,-0.007152,0.032584,0.121726,0.045089,0.154464,0.106817,0.065336,0.234708,1,1
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,...,-0.010196,0.035638,0.074978,0.030579,0.02564,0.118199,0.112522,0.143666,1,1


In [None]:
# Number of leading columns kept for the demo reports. The printed message
# below is derived from this value so the two cannot drift apart.
N_DEMO_COLS = 10

print(f'There are {len(train.columns)} columns in total; this will take too long in this demo so we will reduce this to {N_DEMO_COLS}.')

# Target columns
cols_y = ['ADHD_Outcome', 'Sex_F']

# Slice (not index) the column Index: `train.columns[10]` returns a single
# column name as a string, and unpacking that string yields one character per
# list entry. Targets are filtered out so they are never listed twice.
cols_x = [col for col in train.columns[:N_DEMO_COLS] if col not in cols_y]

# Selected columns followed by the targets
cols = [*cols_x, *cols_y]

There are 19930 columns in total; this will take too long in this demo so we will reduce this to 20.


In [5]:
import sweetviz as sv

# Feature configuration that forces both target columns to be numerical
my_feature_config = sv.FeatureConfig(force_num=['ADHD_Outcome', 'Sex_F'])

# Boolean mask over the training rows: True where ADHD_Outcome equals 1
condition_series = train['ADHD_Outcome'].eq(1)

# Split the selected training columns by the mask and compare the two groups.
# Per the sweetviz docs the first label applies to rows where the mask is True.
my_report = sv.compare_intra(
    train[cols],
    condition_series,
    ['ADHD', 'No_ADHD'],
    target_feat='ADHD_Outcome',
    feat_cfg=my_feature_config,
    pairwise_analysis='off',
)

# Write the report to an HTML file and display it
my_report.show_html('compare_intra.html')

                                             |          | [  0%]   00:00 -> (? left)

Report compare_intra.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [6]:
# Compare the selected training columns against the same feature columns of
# the test set. The test frame is indexed with cols_x only, i.e. without the
# target columns listed in cols_y.
# Pairwise analysis is explicitly switched ON for this report.
report = sv.compare(
    [train[cols], "Train"],
    [test[cols_x], "Test"],
    pairwise_analysis='on',
)

report.show_html('compare.html')

                                             |          | [  0%]   00:00 -> (? left)

Report compare.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [7]:
# Analyse the selected training columns on their own (no comparison set).
# Pairwise analysis is explicitly switched ON for this report.
# No target feature is set: cols_y holds two column names, whereas the
# compare_intra cell passes a single column name as target_feat.
report = sv.analyze(
    [train[cols], "Train"],
    feat_cfg=my_feature_config,
    pairwise_analysis='on',
)

report.show_html('analyse.html')

                                             |          | [  0%]   00:00 -> (? left)

Report analyse.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
