In [None]:
import numpy as np
import pandas as pd

# 1. Categorical

In [None]:
# Load the training categorical metadata from Excel and preview the first rows.
train_cat = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx')
train_cat.head()

## Imputation

In [None]:
# Report, per column, whether at least one value is missing.
print(train_cat.isna().any())

In [None]:
# Most frequent ethnicity code; mode() may return several tied values, [0] keeps the first.
mode = train_cat['PreInt_Demos_Fam_Child_Ethnicity'].mode()[0]
print(mode)

In [None]:
# Fill missing ethnicity values with the mode; this modifies train_cat in place.
train_cat.fillna({'PreInt_Demos_Fam_Child_Ethnicity':mode}, inplace=True)

In [None]:
# Re-check for remaining missing values after the imputation above.
print(train_cat.isna().any())

## One-hot

In [None]:
# Cast every int/float column to the 'category' dtype so that get_dummies
# (called below without an explicit `columns=`) encodes the numeric codes.
for col in train_cat.select_dtypes(include=['int', 'float']).columns:
    train_cat[col] = train_cat[col].astype('category')

In [None]:
# Every column except the first one is encoded.
# NOTE(review): the first column is presumably participant_id — verify against the loaded frame.
columns_to_encode = train_cat.columns[1:].tolist()
columns_to_encode

In [None]:
# One-hot encode, dropping the first level of each variable,
# then turn the boolean dummy values into 1/0 (other values pass through unchanged).
train_encoded = pd.get_dummies(train_cat[columns_to_encode], drop_first=True)
train_encoded = train_encoded.map(lambda x: 1 if x is True else (0 if x is False else x))
train_encoded.head()

In [None]:
# Swap the original categorical columns for their dummy columns (side-by-side concat).
cat_train_final = pd.concat([train_cat.drop(columns=columns_to_encode), train_encoded], axis=1)
cat_train_final.head()

## Function

In [None]:
def impute_one_hot_categorical(cat, ethnicity_mode=None):
    """
    mode imputation for PreInt_Demos_Fam_Child_Ethnicity
    returns one-hot encoded df
    """
    if ethnicity_mode is None:
        ethnicity_mode = cat['PreInt_Demos_Fam_Child_Ethnicity'].mode()[0]
    cat.fillna({'PreInt_Demos_Fam_Child_Ethnicity':mode}, inplace=True)
    
    for col in cat.select_dtypes(include=['int', 'float']).columns:
        cat[col] = cat[col].astype('category')
    columns_to_encode = cat.columns[1:].tolist()
    cat_one_hot = pd.get_dummies(cat[columns_to_encode], drop_first=True)
    cat_one_hot = cat_one_hot.map(lambda x: 1 if x is True else (0 if x is False else x))
    cat = pd.concat([cat.drop(columns=columns_to_encode), cat_one_hot], axis=1)
    
    return cat, ethnicity_mode
    

In [None]:
# Reload the raw file: the cells above filled and re-typed train_cat in place.
train_cat = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx')
train_cat.head()

In [None]:
# Run the categorical pipeline on the training frame; the returned fill value is discarded.
train_cat_prep, _ = impute_one_hot_categorical(train_cat)
train_cat_prep.head()

# 2. Quantitative

In [None]:
# Load the training quantitative metadata from Excel and preview the first rows.
quantitative = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx')
quantitative.head()

## Imputation

In [None]:
# Report, per column, whether at least one value is missing.
print(quantitative.isna().any())

In [None]:
# Mean of the scan-age column (pandas skips NaN by default), used as the fill value.
mean = quantitative['MRI_Track_Age_at_Scan'].mean()
print(mean)

In [None]:
# Fill missing scan ages with the column mean; this modifies quantitative in place.
quantitative.fillna({'MRI_Track_Age_at_Scan':mean}, inplace=True)

In [None]:
# Re-check for remaining missing values after the imputation above.
print(quantitative.isna().any())

## Function

In [None]:
def impute_quantitative(quant, mri_age_mean=None):
    """
    Fill missing MRI_Track_Age_at_Scan values with a mean.

    `quant` is modified in place and returned together with the fill value.
    When `mri_age_mean` is None the mean of the column in `quant` is used;
    otherwise the supplied value is used as-is.
    """
    age_column = 'MRI_Track_Age_at_Scan'
    # Resolve the fill value first so the same value is both applied and returned
    fill_value = quant[age_column].mean() if mri_age_mean is None else mri_age_mean
    quant.fillna({age_column: fill_value}, inplace=True)
    return quant, fill_value
    

In [None]:
# Reload the raw file and list which columns contain missing values.
quantitative = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx')
print(quantitative.isna().any())

In [None]:
# Impute the scan age via the function (the returned mean is discarded) and re-check.
quantitative, _ = impute_quantitative(quantitative)
print(quantitative.isna().any())

# Min-Max Normalize

In [None]:
def normalize(quant):
    """
    Min-max scale every numeric column of `quant` to the range [0, 1].

    Each numeric column is scaled with its own minimum and maximum.
    Non-numeric columns are returned unchanged, and `quant` itself is not
    modified.

    A constant column (max == min) is mapped to 0.0 rather than NaN: its
    zero range is replaced by 1 before dividing, which avoids 0/0.

    Returns
    -------
    pd.DataFrame
        Copy of `quant` with the numeric columns replaced by scaled values.
    """
    df_final = quant.copy()
    numeric = quant.select_dtypes(include=['number'])
    if numeric.shape[1] == 0:
        # Nothing to scale
        return df_final

    col_min = numeric.min()
    col_range = numeric.max() - col_min
    # Zero range means a constant column; divide by 1 so it becomes 0.0
    safe_range = col_range.where(col_range != 0, 1)

    # Subtraction and division align the per-column statistics on column labels
    df_final[numeric.columns] = (numeric - col_min) / safe_range
    return df_final

In [None]:
# Min-max scale the numeric columns; the name is rebound to the scaled copy.
quantitative = normalize(quantitative)
quantitative.head()

In [None]:
def normalize_two_dfs(df1, df2):
    """
    Min-max scale the numeric columns of two DataFrames with shared statistics.

    The numeric columns are taken from `df1` (selected by dtype, so a
    non-numeric identifier column is left alone), and `df2` is assumed to
    contain the same columns. Minimum and maximum are computed over the rows
    of both frames together, so both are mapped onto the same [0, 1] scale.

    A column that is constant across both frames (max == min) is mapped to
    0.0 rather than NaN: its zero range is replaced by 1 before dividing.

    Returns
    -------
    tuple of pd.DataFrame
        Scaled copies of `df1` and `df2`; the inputs are not modified.
    """
    df1_normalized = df1.copy()
    df2_normalized = df2.copy()

    numeric_cols = df1.select_dtypes(include=['number']).columns
    if len(numeric_cols) == 0:
        # Nothing to scale
        return df1_normalized, df2_normalized

    # Global statistics over both frames
    combined = pd.concat([df1[numeric_cols], df2[numeric_cols]])
    min_ = combined.min()
    range_ = combined.max() - min_
    # Zero range means a constant column; divide by 1 so it becomes 0.0
    safe_range = range_.where(range_ != 0, 1)

    # Subtraction and division align the per-column statistics on column labels
    df1_normalized[numeric_cols] = (df1[numeric_cols] - min_) / safe_range
    df2_normalized[numeric_cols] = (df2[numeric_cols] - min_) / safe_range

    return df1_normalized, df2_normalized

# Finalize

In [1]:
import pandas as pd
import numpy as np

## Categorical features

In [14]:
def impute_mode(df1, df2):
    """
    Fill missing values with the per-column mode computed over both DataFrames.

    Every column that has at least one NaN in either frame is imputed. The
    mode is taken over the rows of `df1` and `df2` stacked together (first
    value on ties), and both frames are filled in place with that value.

    Returns
    -------
    tuple of pd.DataFrame
        The same `df1` and `df2` objects, after in-place imputation.
    """
    missing_in_first = df1.columns[df1.isna().any()].tolist()
    missing_in_second = df2.columns[df2.isna().any()].tolist()

    # Union of the affected columns from either frame (set order is arbitrary)
    all_nan_cols = list(set(missing_in_first + missing_in_second))

    print(f"Imputing mode for columns: {all_nan_cols}")

    for column in all_nan_cols:
        stacked = pd.concat([df1[column], df2[column]])
        fill_value = stacked.mode()[0]
        for frame in (df1, df2):
            frame.fillna({column: fill_value}, inplace=True)

    return df1, df2

In [68]:
def one_hot_encode(df1, df2):
    """
    One-hot encode categorical columns in both DataFrames, ensuring that all values present
    in both DataFrames are reflected in the final one-hot encoded DataFrames.
    """
    # Combine both DataFrames to get a union of all categories
    combined = pd.concat([df1, df2])

    # Identify the categorical columns to encode (excluding 'participant_id')
    categorical_cols = combined.select_dtypes(include=['int', 'float']).columns.tolist()

    # One-hot encode categorical columns (drop the first category to avoid collinearity)
    combined_encoded = pd.get_dummies(combined, columns=categorical_cols, drop_first=True)
    combined_encoded = combined_encoded.map(lambda x: 1 if x is True else (0 if x is False else x))

    # Ensure the same columns in both df1 and df2
    # This will add any missing columns in each DataFrame, filling with NaN
    df1_encoded = combined_encoded.iloc[:len(df1)].reset_index(drop=True)
    df2_encoded = combined_encoded.iloc[len(df1):].reset_index(drop=True)

    # Return the one-hot encoded DataFrames
    return df1_encoded, df2_encoded

In [69]:
# Load the categorical metadata for both the training and the test split.
train_cat = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx')
test_cat = pd.read_excel('../../Data/widsdatathon2025/TEST/TEST_CATEGORICAL.xlsx')

In [70]:
# Which training columns contain missing values?
print(train_cat.isna().any())

participant_id                      False
Basic_Demos_Enroll_Year             False
Basic_Demos_Study_Site              False
PreInt_Demos_Fam_Child_Ethnicity     True
PreInt_Demos_Fam_Child_Race         False
MRI_Track_Scan_Location             False
Barratt_Barratt_P1_Edu              False
Barratt_Barratt_P1_Occ              False
Barratt_Barratt_P2_Edu              False
Barratt_Barratt_P2_Occ              False
dtype: bool


In [71]:
# Which test columns contain missing values?
print(test_cat.isna().any())

participant_id                      False
Basic_Demos_Enroll_Year             False
Basic_Demos_Study_Site              False
PreInt_Demos_Fam_Child_Ethnicity     True
PreInt_Demos_Fam_Child_Race          True
MRI_Track_Scan_Location             False
Barratt_Barratt_P1_Edu               True
Barratt_Barratt_P1_Occ               True
Barratt_Barratt_P2_Edu               True
Barratt_Barratt_P2_Occ               True
dtype: bool


In [72]:
# Mode-impute train and test together, then re-check both frames for missing values.
train_cat_no_na, test_cat_no_na = impute_mode(train_cat, test_cat)
print(train_cat_no_na.isna().any())
print(test_cat_no_na.isna().any())

Imputing mode for columns: ['Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'PreInt_Demos_Fam_Child_Race', 'PreInt_Demos_Fam_Child_Ethnicity', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P2_Occ']
participant_id                      False
Basic_Demos_Enroll_Year             False
Basic_Demos_Study_Site              False
PreInt_Demos_Fam_Child_Ethnicity    False
PreInt_Demos_Fam_Child_Race         False
MRI_Track_Scan_Location             False
Barratt_Barratt_P1_Edu              False
Barratt_Barratt_P1_Occ              False
Barratt_Barratt_P2_Edu              False
Barratt_Barratt_P2_Occ              False
dtype: bool
participant_id                      False
Basic_Demos_Enroll_Year             False
Basic_Demos_Study_Site              False
PreInt_Demos_Fam_Child_Ethnicity    False
PreInt_Demos_Fam_Child_Race         False
MRI_Track_Scan_Location             False
Barratt_Barratt_P1_Edu              False
Barratt_Barratt_P1_Occ              False
Barratt_Barratt_P2_Edu           

In [73]:
# One-hot encode both splits in one call so they end up with the same dummy columns.
train_cat_final, test_cat_final = one_hot_encode(train_cat_no_na, test_cat_no_na)

In [74]:
# Preview the encoded training frame.
train_cat_final.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Enroll_Year_2021,Basic_Demos_Enroll_Year_2022,Basic_Demos_Enroll_Year_2023,Basic_Demos_Study_Site_2,...,Barratt_Barratt_P2_Edu_21.0,Barratt_Barratt_P2_Occ_5.0,Barratt_Barratt_P2_Occ_10.0,Barratt_Barratt_P2_Occ_15.0,Barratt_Barratt_P2_Occ_20.0,Barratt_Barratt_P2_Occ_25.0,Barratt_Barratt_P2_Occ_30.0,Barratt_Barratt_P2_Occ_35.0,Barratt_Barratt_P2_Occ_40.0,Barratt_Barratt_P2_Occ_45.0
0,UmrK0vMLopoR,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,CPaeQkhcjg7d,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nb4EetVPm3gs,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,p4vPhVu91o4b,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M09PXs7arQ5E,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# Preview the encoded test frame.
test_cat_final.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Enroll_Year_2021,Basic_Demos_Enroll_Year_2022,Basic_Demos_Enroll_Year_2023,Basic_Demos_Study_Site_2,...,Barratt_Barratt_P2_Edu_21.0,Barratt_Barratt_P2_Occ_5.0,Barratt_Barratt_P2_Occ_10.0,Barratt_Barratt_P2_Occ_15.0,Barratt_Barratt_P2_Occ_20.0,Barratt_Barratt_P2_Occ_25.0,Barratt_Barratt_P2_Occ_30.0,Barratt_Barratt_P2_Occ_35.0,Barratt_Barratt_P2_Occ_40.0,Barratt_Barratt_P2_Occ_45.0
0,Cfwaf5FX7jWK,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
2,ULliyEXjy4OV,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,LZfeAb1xMtql,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,EnFOUv0YK1RG,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1


## Quantitative features

In [80]:
def impute_mean(df1, df2):
    """
    Mean imputation for all numeric columns with NaN values across two DataFrames.

    Every column that has at least one NaN in either frame is considered.
    For numeric columns the mean is computed over the rows of `df1` and `df2`
    stacked together, and both frames are filled in place with that value.
    Non-numeric columns with missing values have no mean; they are reported
    and left untouched instead of raising.

    Both frames are assumed to contain the affected columns.

    Returns
    -------
    tuple of pd.DataFrame
        The same `df1` and `df2` objects, after in-place imputation.
    """
    # Find columns with NaN values in either DataFrame
    nan_cols_df1 = df1.columns[df1.isna().any()].tolist()
    nan_cols_df2 = df2.columns[df2.isna().any()].tolist()

    # Ordered de-duplication keeps the log message and loop order deterministic
    all_nan_cols = list(dict.fromkeys(nan_cols_df1 + nan_cols_df2))

    # Compute the pooled mean per numeric column; set non-numeric ones aside
    combined_means = {}
    skipped_cols = []
    for col in all_nan_cols:
        combined = pd.concat([df1[col], df2[col]])
        if pd.api.types.is_numeric_dtype(combined):
            combined_means[col] = combined.mean()
        else:
            skipped_cols.append(col)

    print(f"Imputing mean for columns: {list(combined_means)}")
    if skipped_cols:
        print(f"Skipping non-numeric columns with missing values: {skipped_cols}")

    # Fill both frames with the same pooled mean
    for col, combined_mean in combined_means.items():
        df1.fillna({col: combined_mean}, inplace=True)
        df2.fillna({col: combined_mean}, inplace=True)

    return df1, df2

In [81]:
# Load the quantitative metadata for both the training and the test split.
train_quant = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx')
test_quant = pd.read_excel('../../Data/widsdatathon2025/TEST/TEST_QUANTITATIVE_METADATA.xlsx')

In [82]:
# Which training columns contain missing values?
train_quant.isna().any()

participant_id                False
EHQ_EHQ_Total                 False
ColorVision_CV_Score          False
APQ_P_APQ_P_CP                False
APQ_P_APQ_P_ID                False
APQ_P_APQ_P_INV               False
APQ_P_APQ_P_OPD               False
APQ_P_APQ_P_PM                False
APQ_P_APQ_P_PP                False
SDQ_SDQ_Conduct_Problems      False
SDQ_SDQ_Difficulties_Total    False
SDQ_SDQ_Emotional_Problems    False
SDQ_SDQ_Externalizing         False
SDQ_SDQ_Generating_Impact     False
SDQ_SDQ_Hyperactivity         False
SDQ_SDQ_Internalizing         False
SDQ_SDQ_Peer_Problems         False
SDQ_SDQ_Prosocial             False
MRI_Track_Age_at_Scan          True
dtype: bool

In [83]:
# Which test columns contain missing values?
test_quant.isna().any()

participant_id                False
EHQ_EHQ_Total                  True
ColorVision_CV_Score           True
APQ_P_APQ_P_CP                 True
APQ_P_APQ_P_ID                 True
APQ_P_APQ_P_INV                True
APQ_P_APQ_P_OPD                True
APQ_P_APQ_P_PM                 True
APQ_P_APQ_P_PP                 True
SDQ_SDQ_Conduct_Problems       True
SDQ_SDQ_Difficulties_Total     True
SDQ_SDQ_Emotional_Problems     True
SDQ_SDQ_Externalizing          True
SDQ_SDQ_Generating_Impact      True
SDQ_SDQ_Hyperactivity          True
SDQ_SDQ_Internalizing          True
SDQ_SDQ_Peer_Problems          True
SDQ_SDQ_Prosocial              True
MRI_Track_Age_at_Scan         False
dtype: bool

In [84]:
# Mean-impute train and test together, using statistics pooled over both splits.
train_quant_no_na, test_quant_no_na = impute_mean(train_quant, test_quant)

Imputing mode for columns: ['SDQ_SDQ_Prosocial', 'SDQ_SDQ_Difficulties_Total', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_PP', 'APQ_P_APQ_P_OPD', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'MRI_Track_Age_at_Scan', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_ID', 'ColorVision_CV_Score', 'SDQ_SDQ_Conduct_Problems', 'APQ_P_APQ_P_CP', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Generating_Impact', 'EHQ_EHQ_Total']


In [85]:
# Re-check the training frame for missing values after imputation.
train_quant_no_na.isna().any()

participant_id                False
EHQ_EHQ_Total                 False
ColorVision_CV_Score          False
APQ_P_APQ_P_CP                False
APQ_P_APQ_P_ID                False
APQ_P_APQ_P_INV               False
APQ_P_APQ_P_OPD               False
APQ_P_APQ_P_PM                False
APQ_P_APQ_P_PP                False
SDQ_SDQ_Conduct_Problems      False
SDQ_SDQ_Difficulties_Total    False
SDQ_SDQ_Emotional_Problems    False
SDQ_SDQ_Externalizing         False
SDQ_SDQ_Generating_Impact     False
SDQ_SDQ_Hyperactivity         False
SDQ_SDQ_Internalizing         False
SDQ_SDQ_Peer_Problems         False
SDQ_SDQ_Prosocial             False
MRI_Track_Age_at_Scan         False
dtype: bool

In [86]:
# Re-check the test frame for missing values after imputation.
test_quant_no_na.isna().any()

participant_id                False
EHQ_EHQ_Total                 False
ColorVision_CV_Score          False
APQ_P_APQ_P_CP                False
APQ_P_APQ_P_ID                False
APQ_P_APQ_P_INV               False
APQ_P_APQ_P_OPD               False
APQ_P_APQ_P_PM                False
APQ_P_APQ_P_PP                False
SDQ_SDQ_Conduct_Problems      False
SDQ_SDQ_Difficulties_Total    False
SDQ_SDQ_Emotional_Problems    False
SDQ_SDQ_Externalizing         False
SDQ_SDQ_Generating_Impact     False
SDQ_SDQ_Hyperactivity         False
SDQ_SDQ_Internalizing         False
SDQ_SDQ_Peer_Problems         False
SDQ_SDQ_Prosocial             False
MRI_Track_Age_at_Scan         False
dtype: bool

In [88]:
def normalize_two_dfs(df1, df2):
    """
    Min-max scale the numeric columns of two DataFrames with shared statistics.

    The numeric columns are taken from `df1` (selected by dtype, so a
    non-numeric identifier column is left alone), and `df2` is assumed to
    contain the same columns. Minimum and maximum are computed over the rows
    of both frames together, so both are mapped onto the same [0, 1] scale.

    A column that is constant across both frames (max == min) is mapped to
    0.0 rather than NaN: its zero range is replaced by 1 before dividing.

    Returns
    -------
    tuple of pd.DataFrame
        Scaled copies of `df1` and `df2`; the inputs are not modified.
    """
    df1_normalized = df1.copy()
    df2_normalized = df2.copy()

    numeric_cols = df1.select_dtypes(include=['number']).columns
    if len(numeric_cols) == 0:
        # Nothing to scale
        return df1_normalized, df2_normalized

    # Global statistics over both frames
    combined = pd.concat([df1[numeric_cols], df2[numeric_cols]])
    min_ = combined.min()
    range_ = combined.max() - min_
    # Zero range means a constant column; divide by 1 so it becomes 0.0
    safe_range = range_.where(range_ != 0, 1)

    # Subtraction and division align the per-column statistics on column labels
    df1_normalized[numeric_cols] = (df1[numeric_cols] - min_) / safe_range
    df2_normalized[numeric_cols] = (df2[numeric_cols] - min_) / safe_range

    return df1_normalized, df2_normalized

In [89]:
# Scale both splits with min/max computed over train and test together.
train_quant_final, test_quant_final = normalize_two_dfs(train_quant_no_na, test_quant_no_na)

In [90]:
# Preview the scaled training frame.
train_quant_final.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,0.6999,0.928571,0.25,0.357143,0.94,0.464286,0.297297,0.933333,0.0,0.176471,0.1,0.25,0.0,0.5,0.058824,0.0,1.0,0.521017
1,CPaeQkhcjg7d,0.027886,1.0,0.25,0.464286,0.68,0.642857,0.621622,1.0,0.0,0.529412,0.6,0.4,0.7,0.8,0.588235,0.444444,0.5,0.521017
2,Nb4EetVPm3gs,0.266767,1.0,0.333333,0.357143,0.7,0.571429,0.27027,0.966667,0.1,0.411765,0.2,0.4,0.5,0.7,0.352941,0.444444,0.9,0.382106
3,p4vPhVu91o4b,0.366667,0.714286,0.416667,0.428571,0.78,0.678571,0.432432,0.933333,0.6,0.705882,0.4,0.8,0.9,1.0,0.470588,0.444444,0.6,0.521017
4,M09PXs7arQ5E,0.5,1.0,0.416667,0.535714,0.8,0.714286,0.648649,0.933333,0.1,0.529412,0.4,0.55,0.4,1.0,0.411765,0.333333,0.9,0.414603


In [91]:
# Preview the scaled test frame.
test_quant_final.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,0.8,1.0,0.416667,0.571429,0.82,0.678571,0.297297,0.866667,0.2,0.352941,0.3,0.45,0.2,0.7,0.176471,0.0,0.8,0.41702
1,vhGrzmvA3Hjq,0.933333,0.857143,0.25,0.464286,0.86,0.642857,0.405405,0.933333,0.2,0.470588,0.8,0.25,0.7,0.3,0.647059,0.333333,0.9,0.5715
2,ULliyEXjy4OV,0.633333,0.928571,0.25,0.5,0.72,0.571429,0.378378,0.833333,0.1,0.205882,0.1,0.3,0.1,0.5,0.058824,0.0,0.9,0.360358
3,LZfeAb1xMtql,0.966667,0.928571,0.25,0.678571,0.82,0.607143,0.486486,0.9,0.4,0.441176,0.4,0.5,0.8,0.6,0.294118,0.111111,0.6,0.431489
4,EnFOUv0YK1RG,0.033333,1.0,0.25,0.464286,0.84,0.678571,0.432432,0.933333,0.2,0.529412,0.6,0.6,0.5,1.0,0.352941,0.0,1.0,0.3831


In [92]:
# Left-join the categorical dummies onto the quantitative features by participant_id.
# NOTE(review): with how='left', a participant missing from the categorical frame
# would get NaN dummy values — confirm the ids match one-to-one in both files.
train_final = pd.merge(train_quant_final, train_cat_final, on='participant_id', how='left')
test_final = pd.merge(test_quant_final, test_cat_final, on='participant_id', how='left')

In [93]:
# Preview the merged training features.
train_final.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,Barratt_Barratt_P2_Edu_21.0,Barratt_Barratt_P2_Occ_5.0,Barratt_Barratt_P2_Occ_10.0,Barratt_Barratt_P2_Occ_15.0,Barratt_Barratt_P2_Occ_20.0,Barratt_Barratt_P2_Occ_25.0,Barratt_Barratt_P2_Occ_30.0,Barratt_Barratt_P2_Occ_35.0,Barratt_Barratt_P2_Occ_40.0,Barratt_Barratt_P2_Occ_45.0
0,UmrK0vMLopoR,0.6999,0.928571,0.25,0.357143,0.94,0.464286,0.297297,0.933333,0.0,...,1,0,0,0,0,0,0,0,0,1
1,CPaeQkhcjg7d,0.027886,1.0,0.25,0.464286,0.68,0.642857,0.621622,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Nb4EetVPm3gs,0.266767,1.0,0.333333,0.357143,0.7,0.571429,0.27027,0.966667,0.1,...,0,0,0,0,0,0,0,0,0,0
3,p4vPhVu91o4b,0.366667,0.714286,0.416667,0.428571,0.78,0.678571,0.432432,0.933333,0.6,...,0,0,0,0,0,0,0,0,0,0
4,M09PXs7arQ5E,0.5,1.0,0.416667,0.535714,0.8,0.714286,0.648649,0.933333,0.1,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Preview the merged test features.
test_final.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,Barratt_Barratt_P2_Edu_21.0,Barratt_Barratt_P2_Occ_5.0,Barratt_Barratt_P2_Occ_10.0,Barratt_Barratt_P2_Occ_15.0,Barratt_Barratt_P2_Occ_20.0,Barratt_Barratt_P2_Occ_25.0,Barratt_Barratt_P2_Occ_30.0,Barratt_Barratt_P2_Occ_35.0,Barratt_Barratt_P2_Occ_40.0,Barratt_Barratt_P2_Occ_45.0
0,Cfwaf5FX7jWK,0.8,1.0,0.416667,0.571429,0.82,0.678571,0.297297,0.866667,0.2,...,0,0,0,0,0,0,1,0,0,0
1,vhGrzmvA3Hjq,0.933333,0.857143,0.25,0.464286,0.86,0.642857,0.405405,0.933333,0.2,...,1,0,0,0,0,0,1,0,0,0
2,ULliyEXjy4OV,0.633333,0.928571,0.25,0.5,0.72,0.571429,0.378378,0.833333,0.1,...,0,0,0,0,0,0,0,0,1,0
3,LZfeAb1xMtql,0.966667,0.928571,0.25,0.678571,0.82,0.607143,0.486486,0.9,0.4,...,1,0,0,0,0,0,0,0,0,1
4,EnFOUv0YK1RG,0.033333,1.0,0.25,0.464286,0.84,0.678571,0.432432,0.933333,0.2,...,1,0,0,0,0,0,0,0,0,1


In [95]:
# Write the merged features to CSV without the row index.
# NOTE(review): the target directories presumably have to exist already — nothing here creates them.
train_final.to_csv('../../Data/preprocessed/train/aux.csv', index=False)
test_final.to_csv('../../Data/preprocessed/test/aux.csv', index=False)

# Convert labels from Excel to CSV

In [96]:
# Load the training labels from the solutions workbook.
train_y = pd.read_excel('../../Data/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx')

In [97]:
# Save the labels as CSV without the row index.
train_y.to_csv('../../Data/preprocessed/train/labels.csv', index=False)