In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive/MLA4H/project/notebook

/content/drive/My Drive/MLA4H/project_final/notebook


In [3]:
# Import libraries
import os
import pandas as pd
import numpy as np

# 1. Read Raw CSV file

In [11]:
df = pd.read_csv('../data/raw_Ost_LabEvents.csv')
df.tail(5)

Unnamed: 0,subject_id,avg_UrineCreatinine1,avg_UrineCreatinine2,avg_UrineCreatinine3,avg_UrineCalcium,avg_ParathyroidHormone,gender,anchor_age,Osteoporosis
49742,19824731,,45.0,,12.7,58.428571,F,66,0
49743,13543245,,173.0,,,,F,91,0
49744,13774741,,124.666667,,,,F,91,0
49745,14476240,,35.666667,,,,F,64,0
49746,18539655,,133.0,,,,M,69,0


# 2. Processing Categorical Data

In [12]:
df['gender'] = df['gender'].map({'M': 1, 'F': 0})

# 3. Replacing NaN Values

In [13]:
print(df.isna().sum())

subject_id                    0
avg_UrineCreatinine1      49400
avg_UrineCreatinine2       4675
avg_UrineCreatinine3      49747
avg_UrineCalcium          46754
avg_ParathyroidHormone    35575
gender                        0
anchor_age                    0
Osteoporosis                  0
dtype: int64


In [14]:
# fill nan values with column median
df.fillna(df.median(), inplace=True)
df.dropna(axis=1, how='all', inplace=True)

In [15]:
# Average all the UrineCreatinine columns
df['avg_UrineCreatinine'] = df[['avg_UrineCreatinine1', 'avg_UrineCreatinine2']].mean(axis=1, skipna=True)

# Create CCR column based on UrineCalcium and UrineCreatinine columns
df['CCR'] = np.where(df[['avg_UrineCalcium', 'avg_UrineCreatinine']].isna().any(axis=1), np.nan, df['avg_UrineCalcium'] / df['avg_UrineCreatinine'])

# Rename column names for simplicity
df = df.rename(columns={"avg_ParathyroidHormone": "PTH"})

# Create binary columns based on clinical thresholds for CCR and PTH
df['CCR>0.2'] = np.where(df['CCR']>0.2, 1, 0)
df['PTH>65'] = np.where(df['PTH']>65, 1, 0)

# Final Selection of Columns
df = df[['gender', 'anchor_age', 'CCR','CCR>0.2', 'PTH', 'PTH>65', 'Osteoporosis']]

In [16]:
df.tail()

Unnamed: 0,gender,anchor_age,CCR,CCR>0.2,PTH,PTH>65,Osteoporosis
49742,0,66,0.233028,1,58.428571,0,0
49743,0,91,0.044726,0,61.5,0,0
49744,0,91,0.056184,0,61.5,0,0
49745,0,64,0.106355,0,61.5,0,0
49746,1,69,0.053807,0,61.5,0,0


In [18]:
# Save DataFrame to CSV
df.to_csv('../data/processed_Ost_LabEvents.csv', index=False)