In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, gc, os

In [3]:
VER = 1          # VERSION NAME FOR SAVED MODEL FILES
SEED = 42        # TRAIN RANDOM SEED
NAN_VALUE = -127 # FILL NAN VALUE - will fit in int8
FOLDS = 5        # FOLDS PER MODEL

# Process and Feature Engineering

## 1. Join Data
Merge `train_features.csv`, `train_outcomes_functional.csv`, and `metadata.csv` on `PID` to create a single dataset.

## 2. Filter Target Time
The `modben` values are associated with `time = 26`. Filter for `time == 26` in the outcomes and match this with features for consistent samples.

## 3. Do any other feature engineering

In [7]:
data_raw_path = "../data-raw"

train_features_csv = f"{data_raw_path}/train_features.csv"
train_outcomes_functional_csv = f"{data_raw_path}/train_outcomes_functional.csv"
metadata_csv = f"{data_raw_path}/metadata.csv"

In [8]:
train_features_df = pd.read_csv(train_features_csv)
train_outcomes_df = pd.read_csv(train_outcomes_functional_csv)
metadata_df = pd.read_csv(metadata_csv)

In [9]:
training_df = train_features_df.merge(train_outcomes_df, on="PID", how="inner")
full_training_df = training_df.merge(metadata_df, on="PID", how="inner")

In [11]:
training_df.shape

(582, 543)

In [12]:
full_training_df.shape

(582, 556)

In [13]:
full_training_df.head()

Unnamed: 0,PID,elbfll01,wrextl01,elbexl01,finfll01,finabl01,hipfll01,kneexl01,ankdol01,gretol01,...,tx1_r,srdecc1,surgcd1,spcsuc1,scdecc1,hemccd1,mhpsyccd,mhneurcd,mhcardcd,mhmetacd
0,PID_62,5.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,D1,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0
1,PID_148,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,...,P,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PID_508,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,P,,,,,,0.0,0.0,0.0,0.0
3,PID_254,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,...,D2,,,,,,0.0,0.0,0.0,0.0
4,PID_189,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,...,P,0.0,1.0,,0.0,1.0,0.0,1.0,0.0,0.0


In [14]:
full_training_df = full_training_df.sort_values(by='PID')

In [16]:
full_training_df.head()

Unnamed: 0,PID,elbfll01,wrextl01,elbexl01,finfll01,finabl01,hipfll01,kneexl01,ankdol01,gretol01,...,tx1_r,srdecc1,surgcd1,spcsuc1,scdecc1,hemccd1,mhpsyccd,mhneurcd,mhcardcd,mhmetacd
313,PID_0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,D2,,,,,,0.0,0.0,1.0,0.0
234,PID_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,D1,0.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0
150,PID_10,5.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,P,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
183,PID_102,5.0,5.0,5.0,5.0,5.0,0.0,0.0,,0.0,...,P,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,PID_105,4.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,D2,,,,,,0.0,0.0,0.0,0.0


## Subsetting only week 1 variables

Since the `modben` is only determined for week 1, we will only extract week 1 outcome variables in addition to fixed variables.

In [27]:
target_variables = ["PID", "modben", "age_category", "sexcd", "bmi_category", "srdecc1", "surgcd1", "spcsuc1", "scdecc1", 
                    "hemccd1", "mhpsyccd", "mhneurcd", "mhcardcd", "mhmetacd", "tx1_r", "ais1", "elbfll01", "wrextl01", 
                    "elbexl01", "finfll01", "finabl01", "hipfll01", "kneexl01", "ankdol01", "gretol01", "ankpll01", 
                    "elbflr01", "wrextr01", "elbexr01", "finflr01", "finabr01", "hipflr01", "kneetr01", "ankdor01", 
                    "gretor01", "ankplr01", "c2ltl01", "c3ltl01", "c4ltl01", "c5ltl01", "c6ltl01", "c7ltl01", "c8ltl01", 
                    "t1ltl01", "t2ltl01", "t3ltl01", "t4ltl01", "t5ltl01", "t6ltl01", "t7ltl01", "t8ltl01", "t9ltl01", 
                    "t10ltl01", "t11ltl01", "t12ltl01", "l1ltl01", "l2ltl01", "l3ltl01", "l4ltl01", "l5ltl01", "s1ltl01", 
                    "s2ltl01", "s3ltl01", "s45ltl01", "c2ltr01", "c3ltr01", "c4ltr01", "c5ltr01", "c6ltr01", "c7ltr01", 
                    "c8ltr01", "t1ltr01", "t2ltr01", "t3ltr01", "t4ltr01", "t5ltr01", "t6ltr01", "t7ltr01", "t8ltr01", 
                    "t9ltr01", "t10ltr01", "t11ltr01", "t12ltr01", "l1ltr01", "l2ltr01", "l3ltr01", "l4ltr01", "l5ltr01", 
                    "s1ltr01", "s2ltr01", "s3ltr01", "s45ltr01", "c2ppl01", "c3ppl01", "c4ppl01", "c5ppl01", "c6ppl01", 
                    "c7ppl01", "c8ppl01", "t1ppl01", "t2ppl01", "t3ppl01", "t4ppl01", "t5ppl01", "t6ppl01", "t7ppl01", 
                    "t8ppl01", "t9ppl01", "t10ppl01", "t11ppl01", "t12ppl01", "l1ppl01", "l2ppl01", "l3ppl01", "l4ppl01", 
                    "l5ppl01", "s1ppl01", "s2ppl01", "s3ppl01", "s45ppl01", "c2ppr01", "c3ppr01", "c4ppr01", "c5ppr01", 
                    "c6ppr01", "c7ppr01", "c8ppr01", "t1ppr01", "t2ppr01", "t3ppr01", "t4ppr01", "t5ppr01", "t6ppr01", 
                    "t7ppr01", "t8ppr01", "t9ppr01", "t10ppr01", "t11ppr01", "t12ppr01", "l1ppr01", "l2ppr01", "l3ppr01", 
                    "l4ppr01", "l5ppr01", "s1ppr01", "s2ppr01", "s3ppr01", "s45ppr01"]


In [28]:
training_df = full_training_df[target_variables]

In [33]:
# Fill NA values
training_df = training_df.fillna(NAN_VALUE).copy()

In [34]:
training_df.shape

(582, 148)

In [35]:
training_df.head()

Unnamed: 0,PID,modben,age_category,sexcd,bmi_category,srdecc1,surgcd1,spcsuc1,scdecc1,hemccd1,...,t12ppr01,l1ppr01,l2ppr01,l3ppr01,l4ppr01,l5ppr01,s1ppr01,s2ppr01,s3ppr01,s45ppr01
313,PID_0,3.0,>65,2,Healthy,-127.0,-127.0,-127.0,-127.0,-127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234,PID_1,1.0,<65,2,Healthy,0.0,1.0,-127.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,PID_10,7.0,<65,1,Healthy,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0
183,PID_102,1.0,<65,2,Overweight,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,PID_105,1.0,<65,2,Healthy,-127.0,-127.0,-127.0,-127.0,-127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature engineering

For the feature engineering, we do this by aggregating numerical and categorical features over the `PID` identifier.

In [46]:
def process_and_feature_engineering(df):
    """
    Perform feature engineering by aggregating numerical and categorical features 
    over the 'PID' identifier. This function:
    
    1. Identifies and separates categorical and numerical features.
    2. Aggregates numerical features using mean, std, min, max, and last observed value.
    3. Aggregates categorical features using count, last observed value, and number of unique values.
    4. Concatenates aggregated features into a single DataFrame.

    Parameters
    ==========
    df (pd.DataFrame): Input DataFrame containing patient-level records with a 'PID' column and features.

    Returns
    =======
    pd.DataFrame: Aggregated DataFrame with engineered features indexed by 'PID'.
    """

    # Exclude identifier and target from the feature list
    all_cols = [col for col in list(df.columns) if col not in ["PID", "modben"]]

    # Predefined list of categorical features
    category_feats = ["age_category", "sexcd", "bmi_category", "srdecc1", "surgcd1", "spcsuc1", "scdecc1", 
                      "hemccd1", "mhpsyccd", "mhneurcd", "mhcardcd", "mhmetacd", "tx1_r", "ais1"]

    # Treat remaining features as numeric
    num_feats = [col for col in all_cols if col not in category_feats]

    # Aggregate numeric features
    test_num_agg = df.groupby("PID")[num_feats].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]

    # Aggregate categorical features
    test_cat_agg = df.groupby("PID")[category_feats].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

    # Combine numeric and categorical aggregates, and clean up.
    df = pd.concat([test_num_agg, test_cat_agg], axis=1)
    del test_num_agg, test_cat_agg
    print(f"Shape after engineering: {df.shape}")
    
    return df

In [44]:
train = process_and_feature_engineering(training_df)

TypeError: agg function failed [how->mean,dtype->object]