In [19]:
import pandas as pd

# Load the monthly investigations from CSV (pandas' default NA handling is sufficient)
df = pd.read_csv('../../data_set/monthlyInvestigation/monthly_investigations.csv')

# Discard these investigation columns before any cleaning takes place
df = df.drop(columns=[
    'HCO3 - pre HD (mmol/L)',
    'HCO3 -post HD (mmol/L)',
    'HbA1C (%)',
    'PTH',
    'Serum ferritin',
    'Serum iron',
    'TSAT',
    'Vit D',
    'blood  picture',
])

# Identifier columns that are left untouched by the cleaning steps
excluded_columns = ['Subject_ID', 'Month']

# Every other column is treated as a measurement to clean
columns_to_clean = [name for name in df.columns if name not in excluded_columns]

# Coerce the measurement columns to numbers; unparseable entries become NaN
df[columns_to_clean] = df[columns_to_clean].apply(pd.to_numeric, errors='coerce')

# Keep only rows holding at least one non-zero measurement.
# NaN is counted as zero here, so rows that are entirely NaN/zero are removed.
# Note: the surviving rows keep their original index labels (the index is not reset).
has_nonzero_value = (df[columns_to_clean].fillna(0) != 0).any(axis=1)
df = df[has_nonzero_value]


In [20]:
# Column whose value identifies the patient each row belongs to
id_column = 'Subject_ID'

# Identify columns to interpolate (exclude Subject_ID and Month)
columns_to_interpolate = [col for col in df.columns if col not in ['Subject_ID', 'Month']]

# Interpolate within each patient group.
#
# transform() hands each column of each group to the lambda and returns a frame
# that carries df's own index, so the assignment below lines up row-for-row.
# The earlier apply(...).reset_index(drop=True) renumbered the result 0..n-1;
# because df's index has gaps once rows have been filtered out (and the grouped
# result may be ordered by patient rather than by original position), the
# index-aligned assignment wrote values into the wrong rows and left NaN in others.
#
# NOTE(review): linear interpolation follows the current row order within each
# patient — presumably rows are already in chronological order; confirm.
# NOTE(review): assumes every row has a Subject_ID; rows with a missing ID do not
# belong to any group — verify none exist.
df[columns_to_interpolate] = (
    df.groupby(id_column)[columns_to_interpolate]
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)

In [21]:
# Write the cleaned table to an Excel workbook in the working directory,
# leaving the DataFrame index out of the sheet
df.to_excel(excel_writer='cleaned_monthly_investigations.xlsx', index=False)