### Dataset updates, July 2025 ###

In [1]:
import os
import pandas as pd
import numpy as np
import logging

# Logger for this notebook, named after the current module
logger = logging.getLogger(__name__)

# Load IPython's autoreload extension and switch it to mode 2 before importing
# llmt, so that edits to the llmt source are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2
import llmt

In [3]:
# Files and directories
# The data root comes from the DATA environment variable. os.environ is
# indexed directly so that an unset variable fails right here with
# KeyError: 'DATA', instead of handing None to os.path.join, which raises
# an opaque "expected str, bytes or os.PathLike object" TypeError.
data_dir = os.path.join(os.environ['DATA'], 'hcp')

# Updated training set (Excel workbook)
# NOTE(review): the file name says "testing" although it is loaded as the
# training set further down -- confirm this is the intended file.
trainset_file_name = 'cleaned_testing_set.xlsx'

# Old data set with training data
hcp_dataset_file_name = 'hcp-alldata-250413.parquet'

In [4]:
# Load the new training data
df_train_raw = pd.read_excel(os.path.join(data_dir, trainset_file_name))

# Swap the original label columns for their cleaned counterparts: drop the
# old columns, give the *_cleaned columns the short names, and tag every
# row as belonging to the 'train' set.
stale_label_cols = ['mental_health_ref', 'inpatient_ref', 'outpatient_ref']
cleaned_label_names = {
    'mental_health_ref_cleaned': 'mental_health',
    'inpatient_ref_cleaned': 'inpatient',
    'outpatient_ref_cleaned': 'outpatient',
}
df_train = (
    df_train_raw
    .drop(columns=stale_label_cols)
    .rename(columns=cleaned_label_names)
    .assign(dset='train')
    .reset_index(drop=True)
)

# Preview the frame, then report (rows, columns) and the count of distinct ids
display(df_train.head())
print(df_train.shape)
print(df_train['id'].nunique(dropna=False))

Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1,1,1,train
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1,1,1,train
2,431643-07,Actriv,Provider of healthcare staffing services based...,1,0,0,train
3,310749-31,Alima,Operator of a non-governmental organization in...,0,0,0,train
4,107240-50,Alvarado Parkway Institute,Operator of a psychiatric health care facility...,1,1,1,train


(187, 7)
187


In [5]:
# Open the old data sets and report (rows, columns) and the distinct-id count
df_test_raw = pd.read_parquet(os.path.join(data_dir, hcp_dataset_file_name))
print(df_test_raw.shape)
print(df_test_raw['id'].nunique(dropna=False))

# Remove the training data: keep only rows whose id does not occur in df_train
in_train_set = df_test_raw['id'].isin(df_train['id'].unique())
df_test = df_test_raw.loc[~in_train_set].reset_index(drop=True)

# Preview the remaining rows and repeat the size / distinct-id report
display(df_test.head())
print(df_test.shape)
print(df_test['id'].nunique(dropna=False))

(2025, 7)
2025


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,525519-64,10-4 Medical,Provider focused on whole-person healthcare in...,,,,test
1,111309-13,12 Keys Rehab,Provider of rehabilitation services intended t...,,,,test
2,107097-94,1321 Investors,Provider of healthcare services. The company o...,,,,test
3,167073-22,180 Health Partners,Provider of healthcare support services intend...,,,,test
4,224481-61,181st Street Urgent Care Center,Provider of medical services intended to treat...,,,,test


(1838, 7)
1838


In [6]:
# Save the data sets
train_set_name = 'hcp-train-250701.parquet'
test_set_name = 'hcp-test-250701.parquet'

# Write each frame as a parquet file inside data_dir, train first, then test
for frame, file_name in ((df_train, train_set_name), (df_test, test_set_name)):
    frame.to_parquet(os.path.join(data_dir, file_name))