In [16]:
import pandas as pd

# Load the restructured HD session sheet and discard the combined
# 'BP (mmHg)' column in a single expression.
df = (
    pd.read_excel('../../data_set/hd_sessions/restructured_hd_session_data.xlsx')
    .drop(columns=['BP (mmHg)'])
)

In [22]:
# Columns to interpolate
columns_to_interpolate = [
    'AP (mmHg)', 'AUF (ml)', 'BFR (ml/min)', 'Dry weight (kg)', 'HD duration (h)',
    'PUF (ml)', 'Post HD weight (kg)', 'Pre HD weight (kg)',
    'TMP (mmHg)', 'VP (mmHg)', 'Weight gain (kg)', 'SYS (mmHg)', 'DIA (mmHg)'
]

# Ensure required columns exist. The message names the columns that are
# actually checked and reports which of them are absent.
required_columns = ['Subject_ID', 'Session_No']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise ValueError(
        "The dataset must contain both 'Subject_ID' and 'Session_No' columns; "
        f"missing: {missing_columns}."
    )

# Convert session numbers to integers for sorting.
# expand=False makes str.extract return a Series (one capture group) instead
# of a one-column DataFrame; astype(str) lets purely numeric labels through.
session_digits = df['Session_No'].astype(str).str.extract(r'(\d+)', expand=False)

# Fail with the offending labels rather than an opaque NaN-to-int cast error.
unparsed_mask = session_digits.isna()
if unparsed_mask.any():
    unparsed_labels = df.loc[unparsed_mask, 'Session_No'].unique().tolist()
    raise ValueError(f"Could not extract a session number from: {unparsed_labels}")

df['Session_No_numeric'] = session_digits.astype(int)

In [24]:
import re


def convert_to_hours(val):
    """Convert a duration value to hours as a float.

    Accepted inputs:
      * missing values (None / NaN)            -> None
      * numbers or numeric strings ('4', 3.5)  -> float(val), taken as hours
      * text such as '3h 15min', '40min', '3.5h' -> hours, rounded to 2 decimals

    Text that contains neither an hour nor a minute component returns None so
    that it is treated as missing downstream instead of as a 0-hour session.

    Parameters
    ----------
    val : object
        Raw cell value.

    Returns
    -------
    float or None
        Duration in hours, or None when the value is missing or unparseable.
    """
    if pd.isna(val):
        return None
    try:
        # If already numeric, return as float
        return float(val)
    except (TypeError, ValueError):
        # Not directly numeric: fall through to text parsing below.
        pass

    text = str(val).lower().strip()

    # Allow an optional decimal part so '3.5h' is read as 3.5, not as '5h'.
    number = r'(\d+(?:\.\d+)?)'
    hour_match = re.search(number + r'\s*h', text)
    minute_match = re.search(number + r'\s*min', text)

    if hour_match is None and minute_match is None:
        # Nothing recognisable as a duration.
        return None

    hours = float(hour_match.group(1)) if hour_match else 0.0
    minutes = float(minute_match.group(1)) if minute_match else 0.0

    return round(hours + minutes / 60, 2)

# Normalise the 'HD duration (h)' column: parse every entry to hours, fill
# gaps with the column mean (rounded to 1 decimal), then round everything
# to 1 decimal place.
if 'HD duration (h)' not in df.columns:
    print("Column 'HD duration (h)' not found in the dataset.")
else:
    parsed_hours = df['HD duration (h)'].apply(convert_to_hours)

    # Mean of the parsed values, used as the fill value for missing entries
    mean_val = round(parsed_hours.mean(), 1)

    df['HD duration (h)'] = parsed_hours.fillna(mean_val).round(1)

In [26]:
# Coerce each target column to a numeric dtype; entries that cannot be
# parsed become NaN. Columns absent from the frame are reported and skipped.
for col in columns_to_interpolate:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found in dataset and will be skipped.")
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Order rows by subject, then by numeric session number
sort_keys = ['Subject_ID', 'Session_No_numeric']
df = df.sort_values(by=sort_keys)

df

Unnamed: 0,Subject_ID,Session_No,Date,AP (mmHg),AUF (ml),BFR (ml/min),Dry weight (kg),HD duration (h),PUF (ml),Post HD weight (kg),Pre HD weight (kg),TMP (mmHg),VP (mmHg),Weight gain (kg),SYS (mmHg),DIA (mmHg),Session_No_numeric
0,RHD_THP_001,Session 1,2024-02-01,-143.0,1000.0,200.0,68.5,4.0,1600.0,68.5,70.1,43.0,126.0,1.6,134.0,73.0,1
1,RHD_THP_001,Session 2,2024-05-01,-164.0,3300.0,180.0,68.5,4.0,3300.0,68.4,71.8,34.0,160.0,3.4,109.0,68.0,2
2,RHD_THP_001,Session 3,2024-09-01,-166.0,3000.0,200.0,68.5,4.0,3000.0,68.7,71.5,32.0,123.0,2.8,141.0,76.0,3
3,RHD_THP_001,Session 4,2024-12-01,-142.0,3300.0,250.0,68.5,4.0,3300.0,68.7,71.8,40.0,134.0,3.1,139.0,75.0,4
4,RHD_THP_001,Session 5,2024-01-16,-126.0,2000.0,250.0,68.5,4.0,2000.0,68.5,70.5,42.0,154.0,2.0,113.0,70.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3851,RHD_THP_045,Session 54,2024-12-17,-96.0,1900.0,250.0,85.0,4.0,1900.0,84.8,86.9,48.0,130.0,2.1,201.0,73.0,54
3852,RHD_THP_045,Session 55,2024-12-20,-110.0,1600.0,250.0,85.0,4.0,1600.0,84.0,86.6,42.0,127.0,2.6,220.0,80.0,55
3853,RHD_THP_045,Session 56,2024-12-24,-100.0,2066.7,250.0,84.5,4.0,2900.0,84.2,87.4,65.0,115.0,3.2,170.0,82.0,56
3854,RHD_THP_045,Session 57,2024-12-27,-98.0,2533.3,250.0,84.5,4.0,3100.0,84.4,87.6,44.0,130.0,3.2,203.0,80.0,57


In [None]:
# Interpolate missing values per patient.
# Restrict to columns that exist in df; selecting an absent column from the
# groupby would raise KeyError.
present_columns = [name for name in columns_to_interpolate if name in df.columns]

if present_columns:
    # groupby.transform returns a frame indexed like df, so the assignment
    # lines up row-for-row whatever df's current index order is. Resetting
    # the index of an apply() result would renumber rows 0..n-1 in group
    # order and, because assignment aligns on index labels, could write
    # values to the wrong rows after df has been sorted.
    df[present_columns] = (
        df.groupby('Subject_ID')[present_columns]
        .transform(lambda group: group.interpolate(method='linear', limit_direction='both'))
        .round(1)
    )

# Drop the helper column if no longer needed.
# errors='ignore' keeps this cell re-runnable once the column is gone.
df.drop(columns=['Session_No_numeric'], inplace=True, errors='ignore')

# df.to_excel('data_filled_missing_values.xlsx', index=False)