# Predictive Modeling of Hospital Length of Stay and Discharge Type
# [Step 3: Merge laboratory and clinical data]

## 1. Import libraries and load datasets

In [1]:
# Import standard-library helpers and the data manipulation library
import os

import pandas as pd

In [3]:
# Directory that holds the cleaned input datasets and receives the merged output.
# It defaults to the original thesis location; set the THESIS_OUTPUT_DIR
# environment variable to run the notebook elsewhere without editing the code.
output_data_dir = os.environ.get(
    "THESIS_OUTPUT_DIR", "/home/anna/Desktop/Master_thesis/output_data"
)

# Define file paths for cleaned input datasets
lab_data_path = os.path.join(output_data_dir, "cleaned_lab_data")
clinical_data_path = os.path.join(output_data_dir, "cleaned_clinical_data")

# Define output path for merged dataset
merged_data_path = os.path.join(output_data_dir, "merged_data")

In [4]:
# Read the cleaned laboratory dataset from the configured path
lab_data = pd.read_csv(filepath_or_buffer=lab_data_path)

In [5]:
# Preview the lab data (long format: one row per individual test result)
display(lab_data)

Unnamed: 0,patient_id,case_id,test_name,test_abbr,method_number,numeric_result,text_result,unit
0,1,171465,Natrium,Na,1,138.0,138,mmol/L
1,1,171465,Kalium,KA,3,4.6,4.6,mmol/L
2,1,171465,Hämolytisch,H-Se,42,4.0,4,Unknown
3,1,171465,Lipämisch,L-Se,43,3.0,3,Unknown
4,1,171465,Ikterisch,I-Se,44,1.0,1,Unknown
...,...,...,...,...,...,...,...,...
18913455,240990,415184,MCHC,MCHCn,9116,333.0,333,g/L
18913456,240990,415184,RDW,RDWn,9117,14.3,14.3,%
18913457,240990,415184,Thrombozyten,THZn,9119,226.0,226,G/L
18913458,240990,415184,MPV,MPVn,9204,9.0,9.0,fL


In [6]:
# Read the cleaned clinical dataset from the configured path, then preview it
clinical_data = pd.read_csv(filepath_or_buffer=clinical_data_path)
display(clinical_data)

Unnamed: 0,patient_id,case_id,discharge_type,sex,age,length_of_stay_days,diagnosis,diagnosis_category
0,1,171465,Entlassung,f,73,28,A04.70,Infectious diseases
1,1,333396,Entlassung,f,73,34,K57.22,Digestive diseases
2,2,27091,Entlassung,m,51,13,T84.5,Injury & poisoning
3,2,36154,Entl.ext.Instit,m,50,12,I21.4,Circulatory diseases
4,2,142617,Entl. in ex.KH,m,51,1,T84.5,Injury & poisoning
...,...,...,...,...,...,...,...,...
273558,240988,393440,Entlassung,m,79,2,K55.21,Digestive diseases
273559,240988,412516,Entlassung,m,79,10,I11.00,Circulatory diseases
273560,240988,425806,Entl.ext.Instit,m,79,7,M16.7,Musculoskeletal diseases
273561,240989,393141,Entlassung,m,62,1,I49.3,Circulatory diseases


## 2. Filter out lab tests that are missing in 80% or more of cases

In [7]:
# Select the most frequent lab tests based on the number of cases they occur in

# Total number of distinct cases that have at least one lab result
total_cases = lab_data["case_id"].nunique()

# Per lab test: number of distinct cases with that test, the share of cases
# without it (in percent), ordered from most to least frequent test
lab_test_counts = (
    lab_data.groupby("test_abbr", as_index=False)["case_id"]
    .nunique()
    .rename(columns={"case_id": "num_cases"})
    .assign(
        missing_percentage=lambda counts: (1 - counts["num_cases"] / total_cases) * 100
    )
    .sort_values(by="num_cases", ascending=False)
)

# Keep only lab tests that are missing in fewer than 80% of the cases
is_below_missing_limit = lab_test_counts["missing_percentage"] < 80
filtered_lab_tests = lab_test_counts.loc[
    is_below_missing_limit, ["test_abbr", "missing_percentage"]
]

### 2.1 Removal of Specific Lab Tests  

Excluded tests fall into two categories:  

1. **Metadata Entries** (non-result data)  
2. **Highly Variable Urinary Tests** (results fluctuate strongly, which reduces their reliability)

In [8]:
# Abbreviations of lab tests that are excluded regardless of how often they
# were measured (metadata entries and highly variable urinary tests).
# Each abbreviation must appear exactly once.
remove_tests = [
    "EC3-U", "PH4-U", "BI3-U", "LK3-U", "PROT3", "NITR3", "KETO3", "SPEZ3",
    "FARBE3", "TRUEB3", "UST1", "BA-Ux", "LK-Ux", "KRI-Ux", "ERY-Ux", "PI-Ux", "EART",
    "GLUC3", "URO3", "ENTN1n",
]

# Guard against accidental duplicates when the exclusion list is edited
assert len(remove_tests) == len(set(remove_tests)), "duplicate entry in remove_tests"

# Remove specified lab tests from the filtered list
filtered_lab_tests = filtered_lab_tests[~filtered_lab_tests["test_abbr"].isin(remove_tests)]

In [9]:
# Show the lab tests that remain after the frequency filter and the manual exclusions
display(filtered_lab_tests)

# Report how many lab tests (rows) remain in filtered_lab_tests
print(f"Number of tests in filtered_lab_tests: {filtered_lab_tests.shape[0]}")

Unnamed: 0,test_abbr,missing_percentage
1746,KA,14.163277
2007,Leukn,15.379529
1626,Hbn,15.380492
1197,Eryn,15.382097
1639,Hkn,15.38306
2981,THZn,15.384665
2063,MCHCn,15.384986
2065,MCHn,15.385307
2068,MCVn,15.385307
2622,RDWn,15.457852


Number of tests in filtered_lab_tests: 49


In [10]:
# Restrict the long-format lab data to the selected lab tests
selected_test_abbrs = filtered_lab_tests["test_abbr"]
lab_data_filtered = lab_data.loc[lab_data["test_abbr"].isin(selected_test_abbrs)]

# Preview the first rows of the restricted data
lab_data_filtered.head()

Unnamed: 0,patient_id,case_id,test_name,test_abbr,method_number,numeric_result,text_result,unit
0,1,171465,Natrium,Na,1,138.0,138.0,mmol/L
1,1,171465,Kalium,KA,3,4.6,4.6,mmol/L
2,1,171465,Hämolytisch,H-Se,42,4.0,4.0,Unknown
3,1,171465,Lipämisch,L-Se,43,3.0,3.0,Unknown
4,1,171465,Ikterisch,I-Se,44,1.0,1.0,Unknown


In [11]:
# Free up memory: the cells shown below only use lab_data_filtered.
# Note: after this cell, re-running any earlier cell that reads lab_data
# requires loading the lab dataset again first.
del lab_data

## 3. Merge the two datasets

In [12]:
# Reshape the lab data from long to wide format: one row per (patient_id, case_id)
# and one column per lab test abbreviation, filled with numeric_result values.
# When a case has several results for the same test, aggfunc="first" keeps the
# first non-missing value in row order.
# NOTE(review): this is the initial measurement at admission only if the rows are
# in chronological order within each case — no timestamp column is visible here,
# so confirm the ordering produced by the upstream cleaning step.
case_keys = ["patient_id", "case_id"]
lab_data_wide = (
    lab_data_filtered
    .pivot_table(
        values="numeric_result",
        index=case_keys,
        columns="test_abbr",
        aggfunc="first",
    )
    .reset_index()  # turn patient_id and case_id back into regular columns
)

# Preview the first rows of the wide-format data
lab_data_wide.head()

test_abbr,patient_id,case_id,ALAT,AP,ASAT,BASm#n,BIg,CA,CK,CO-HB,...,Quicks,RDWn,THZn,TNThsn,Tbga,UREA,pCO2,pH,pO2,tHb
0,1,171465,10.0,,18.0,,,,,,...,13.2,18.3,165.0,,,,,,,
1,1,333396,19.0,93.0,22.0,0.19,8.0,2.02,,,...,11.6,18.8,93.0,,,,,,,
2,2,27091,,,,,,,,,...,14.0,18.9,164.0,,,,,,,
3,2,36154,38.0,58.0,,,,,347.0,,...,12.7,14.4,131.0,367.0,,4.5,,,,
4,2,142617,15.0,65.0,19.0,0.04,16.0,2.39,,,...,19.2,17.6,159.0,,,5.2,,,,


In [13]:
# Attach the wide-format lab values to the clinical records.
# The inner join keeps only cases that appear in both datasets.
merged_data = clinical_data.merge(
    lab_data_wide,
    how="inner",
    on=["patient_id", "case_id"],
)

In [14]:
# Preview the merged dataset (clinical columns followed by one column per lab test)
display(merged_data)

Unnamed: 0,patient_id,case_id,discharge_type,sex,age,length_of_stay_days,diagnosis,diagnosis_category,ALAT,AP,...,Quicks,RDWn,THZn,TNThsn,Tbga,UREA,pCO2,pH,pO2,tHb
0,1,171465,Entlassung,f,73,28,A04.70,Infectious diseases,10.0,,...,13.2,18.3,165.0,,,,,,,
1,1,333396,Entlassung,f,73,34,K57.22,Digestive diseases,19.0,93.0,...,11.6,18.8,93.0,,,,,,,
2,2,27091,Entlassung,m,51,13,T84.5,Injury & poisoning,,,...,14.0,18.9,164.0,,,,,,,
3,2,36154,Entl.ext.Instit,m,50,12,I21.4,Circulatory diseases,38.0,58.0,...,12.7,14.4,131.0,367.0,,4.5,,,,
4,2,142617,Entl. in ex.KH,m,51,1,T84.5,Injury & poisoning,15.0,65.0,...,19.2,17.6,159.0,,,5.2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268982,240988,393440,Entlassung,m,79,2,K55.21,Digestive diseases,12.0,,...,10.3,13.8,176.0,,,,,,,
268983,240988,412516,Entlassung,m,79,10,I11.00,Circulatory diseases,11.0,,...,11.0,14.3,151.0,,,,,,,
268984,240988,425806,Entl.ext.Instit,m,79,7,M16.7,Musculoskeletal diseases,,,...,,13.2,219.0,,,,,,,
268985,240989,393141,Entlassung,m,62,1,I49.3,Circulatory diseases,,,...,10.9,13.6,269.0,,,,,,,


In [15]:
# Summarize the merged dataset: column dtypes and non-null counts per column
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268987 entries, 0 to 268986
Data columns (total 57 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   patient_id           268987 non-null  int64  
 1   case_id              268987 non-null  int64  
 2   discharge_type       268987 non-null  object 
 3   sex                  268987 non-null  object 
 4   age                  268987 non-null  int64  
 5   length_of_stay_days  268987 non-null  int64  
 6   diagnosis            268987 non-null  object 
 7   diagnosis_category   268987 non-null  object 
 8   ALAT                 109653 non-null  float64
 9   AP                   80227 non-null   float64
 10  ASAT                 111522 non-null  float64
 11  BASm#n               70557 non-null   float64
 12  BIg                  80361 non-null   float64
 13  CA                   85871 non-null   float64
 14  CK                   84380 non-null   float64
 15  CO-HB            

## 4. Save merged dataset

In [16]:
# Write the merged dataset as CSV; index=False leaves the row index out of the file
merged_data.to_csv(path_or_buf=merged_data_path, index=False)