In [1]:
import sys
sys.path.append("/home/bw720/nde_traj/src")
import os
import time
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from loguru import logger

from config import feat_params
from config import DATA_DIR, FEAT_DIR
from utils.general_functs import save_pickle, load_pickle
from features.feature_utils import combine_df
from features.feature_compute import compute_features


t0 = time.time()
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--cohort', type=str, help='Cohort selection', default='MDD')
parser.add_argument('-df', '--data_folder', type=str, help='Data folder name', default="random_visit_w_filters_all_matched")
parser.add_argument('-ff', '--feat_folder', type=str, help='Feature folder name', default="random_visit_w_filters_all_matched")
parser.add_argument('-lw', '--lookback_window', type=int, help='Lookback window', default=1)
args, unkwn = parser.parse_known_args()

In [2]:
DATA_path = os.path.join(DATA_DIR, args.cohort.upper(), args.data_folder)
FEAT_path = os.path.join(FEAT_DIR, args.cohort.upper(), args.feat_folder)

train_data_filename = f"RPDRml__{args.cohort.upper()}_train.{args.lookback_window}y.pk"
val_data_filename = f"RPDRml__{args.cohort.upper()}_val.{args.lookback_window}y.pk"
test_data_filename = f"RPDRml__{args.cohort.upper()}_test.{args.lookback_window}y.pk"
train_feat_filename = f"train.{feat_params['feat_type']}.{args.lookback_window}y.pk"
val_feat_filename = f"val.{feat_params['feat_type']}.{args.lookback_window}y.pk"
test_feat_filename = f"test.{feat_params['feat_type']}.{args.lookback_window}y.pk"
train_val_feat_filename = f"train_val.{feat_params['feat_type']}.{args.lookback_window}y.pk"
featname_filename = f"feature_names.{feat_params['feat_type']}.{args.lookback_window}y.pk"

trainpath = os.path.join(DATA_path, train_data_filename)
valpath = os.path.join(DATA_path, val_data_filename)
testpath = os.path.join(DATA_path, test_data_filename)
train_featpath = os.path.join(FEAT_path, train_feat_filename)
val_featpath = os.path.join(FEAT_path, val_feat_filename)
test_featpath = os.path.join(FEAT_path, test_feat_filename)
train_val_featpath = os.path.join(FEAT_path, train_val_feat_filename)
featname_path = os.path.join(FEAT_path, featname_filename)

logger.info("Loading training, validation & test data frames")
print("Loading from", trainpath)
df_train = load_pickle(trainpath)
print("Loading from", valpath)
df_val = load_pickle(valpath)
print("Loading from", testpath)
df_test = load_pickle(testpath)
print("Training data size:", len(df_train))
print(df_train.label.value_counts())
print("Validation data size:", len(df_val))
print(df_val.label.value_counts())
print("Test data size:", len(df_test))
print(df_test.label.value_counts())

[32m2025-10-24 18:22:15.596[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mLoading training, validation & test data frames[0m


Loading from /home/bw720/nde_traj/data/MDD/random_visit_w_filters_all_matched/RPDRml__MDD_train.1y.pk
Loading from /home/bw720/nde_traj/data/MDD/random_visit_w_filters_all_matched/RPDRml__MDD_val.1y.pk
Loading from /home/bw720/nde_traj/data/MDD/random_visit_w_filters_all_matched/RPDRml__MDD_test.1y.pk
Training data size: 207383
label
1    103692
0    103691
Name: count, dtype: int64
Validation data size: 44439
label
0    22220
1    22219
Name: count, dtype: int64
Test data size: 44440
label
0    22220
1    22220
Name: count, dtype: int64


In [5]:
pids = []
labels = []
phecounts = []
for i, row in df_train.iterrows():
    phecount = []
    for dx_codes in row['dx_codes']:
        phecount.append(dx_codes.count("PheCode:296.2"))
        
    pids.append(row['subject_num'])
    phecounts.append(phecount)
    labels.append(row['label'])

df = pd.DataFrame()
df['subject_num'] = pids
df['count'] = phecounts
df['label'] = labels
df['max_count'] = [max(c) for c in phecounts]

In [3]:
logger.info("Combining codes and dates")
df_train = combine_df(df_train)
df_val = combine_df(df_val)
df_test = combine_df(df_test)

[32m2025-10-08 23:17:16.549[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mCombining codes and dates[0m


In [4]:
logger.info("Computing features")
X_train, X_test, feat_names = compute_features(
    df_train, df_test, 
    feat_type=feat_params["feat_type"], 
    feat_transform=feat_params["feat_transform"],
    feat_lvl="visit", # visit-level fix-sized feature representation
    feat_scaling=feat_params["feat_scaling"],
    max_df=feat_params["max_df"], 
    min_df=feat_params["min_df"]
)

[32m2025-10-08 23:17:39.788[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mComputing features[0m


Computing DEMO and DX and MED BOW features per visit
Feature size = 2066 dimensions


In [5]:
X_train[0]

<1x2066 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [6]:
X_train[1]

<4x2066 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [7]:
X_test[0]

<1x2066 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [8]:
feat_names.shape

(2066,)

In [9]:
X_train2, X_val, feat_names2 = compute_features(
    df_train, df_val, 
    feat_type=feat_params["feat_type"], 
    feat_transform=feat_params["feat_transform"],
    feat_lvl=feat_params["feat_lvl"], # visit-level fix-sized feature representation
    feat_scaling=feat_params["feat_scaling"],
    max_df=feat_params["max_df"], 
    min_df=feat_params["min_df"]
)

Computing DEMO and DX and MED BOW features per visit
Feature size = 2066 dimensions


In [10]:
X_train2[0]

<1x2066 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [11]:
X_train2[1]

<4x2066 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [12]:
X_val[0]

<10x2066 sparse matrix of type '<class 'numpy.float64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [13]:
feat_names2.shape

(2066,)

In [22]:
feat_names[:30]

array(['gender_value_F', 'gender_value_M', 'gender_value_U',
       'race_value_Asian', 'race_value_Black', 'race_value_Other',
       'race_value_Unknown', 'race_value_White',
       'ethnicity_value_Non-Hispanic', 'age_0-<18', 'age_18-<25',
       'age_25-<40', 'age_40-<60', 'age_60-<75', 'age_75+', 'PheCode:008',
       'PheCode:008.5', 'PheCode:008.52', 'PheCode:008.6', 'PheCode:010',
       'PheCode:038', 'PheCode:038.1', 'PheCode:038.2', 'PheCode:038.3',
       'PheCode:041', 'PheCode:041.1', 'PheCode:041.11', 'PheCode:041.12',
       'PheCode:041.2', 'PheCode:041.4'], dtype=object)

In [23]:
feat_names2[:20]

array(['gender_value_F', 'gender_value_M', 'gender_value_U',
       'gender_value_X', 'race_value_Asian', 'race_value_Black',
       'race_value_Other', 'race_value_Unknown', 'race_value_White',
       'ethnicity_value_Non-Hispanic', 'age_0-<18', 'age_18-<25',
       'age_25-<40', 'age_40-<60', 'age_60-<75', 'age_75+', 'PheCode:008',
       'PheCode:008.5', 'PheCode:008.52', 'PheCode:008.6'], dtype=object)