In [3]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Directory that is searched for the study CSV exports.
DATA_DIR = Path(".")  # Current directory

# List all CSV files to verify the expected exports are present.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so sort
# case-insensitively by file name (exact name as tie-breaker) to keep the
# listing stable across runs and operating systems.
csv_files = sorted(DATA_DIR.glob("*.csv"), key=lambda p: (p.name.lower(), p.name))
print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  - {f.name}")

Found 8 CSV files:
  - baseline_outcome_dataset.csv
  - BCVA.csv
  - CST.csv
  - Data.csv
  - Demographics.csv
  - DRSS.csv
  - PrimeDR Biomarker Clinical Data Images.csv
  - PrimeDR Clinical Data Images.csv


# Demographics CSV

In [8]:
# Read Demographics.csv.
# header=[0, 1] parses the first two rows of the file as a two-level column
# header (section label on top, field name underneath).
# The file is resolved against DATA_DIR so that loading uses the same
# directory that was listed when verifying the CSV files.
demo_df = pd.read_csv(DATA_DIR / "Demographics.csv", header=[0, 1])

print("Shape:", demo_df.shape)
print("\nColumn names:")
print(demo_df.columns.tolist())
print("\n" + "="*80)
print("First 5 rows:")
# Last expression of the cell: rendered by the notebook as a rich table.
demo_df.head()

Shape: (40, 20)

Column names:
[('Patient Information', 'Patient \nID'), ('Unnamed: 1', 'Treatment Arm'), ('Unnamed: 2', 'Study\n Eye'), ('Unnamed: 3', 'Age'), ('Unnamed: 4', 'Gender'), ('Unnamed: 5', 'Ethnicity'), ('Unnamed: 6', 'Race'), ('Diabetes', 'Type of\n Diabetes'), ('Unnamed: 8', 'Number of Years with Diabetes'), ('Unnamed: 9', 'Baseline HbA1c'), ('Unnamed: 10', 'W24 HbA1c'), ('Unnamed: 11', 'W52 HbA1c'), ('Unnamed: 12', 'W76 HbA1c'), ('Unnamed: 13', 'W104 HbA1c'), ('Unnamed: 14', 'BMI (kg/m^2)'), ('Baseline', 'ETDRS BCVA'), ('Unnamed: 16', 'CST'), ('Unnamed: 17', 'Injection'), ('Unnamed: 18', 'DRSS'), ('Unnamed: 19', 'Leakage Index')]

First 5 rows:


Unnamed: 0_level_0,Patient Information,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Diabetes,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Baseline,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
Unnamed: 0_level_1,Patient \nID,Treatment Arm,Study\n Eye,Age,Gender,Ethnicity,Race,Type of\n Diabetes,Number of Years with Diabetes,Baseline HbA1c,W24 HbA1c,W52 HbA1c,W76 HbA1c,W104 HbA1c,BMI (kg/m^2),ETDRS BCVA,CST,Injection,DRSS,Leakage Index
0,01-001,2,OS,44,M,N H/L,White,2,20,7.1,8.7,8.4,9.1,8.4,,97,275,Yes,53,1.59
1,01-002,2,OD,56,F,N H/L,White,2,25,11.3,9.1,,,,34.484657,68,238,Yes,47,3.56
2,01-013,2,OD,38,M,H/L,White,1,13,8.0,9.5,9.6,11.7,9.8,25.997929,88,303,Yes,53,5.08
3,01-014,2,OS,55,M,N H/L,White,2,12,10.1,7.2,7.4,7.4,7.4,31.871377,95,256,Yes,53,2.34
4,01-023,2,OD,56,M,H/L,White,2,22,7.1,6.7,5.9,6.9,,35.669938,81,267,Yes,61,4.67


In [13]:
def flatten_header_columns(columns):
    """Collapse two-level header tuples into a flat list of column names.

    For each ``(level0, level1)`` pair the name in ``level1`` is preferred.
    ``level0`` is used when ``level1`` is empty, ``'nan'`` or an ``Unnamed...``
    placeholder. If neither level is usable, the two are joined as
    ``"{level0}_{level1}"``. Only leading/trailing whitespace is stripped, so
    line breaks embedded inside a name are kept as they are.

    Args:
        columns: Iterable of 2-item sequences, e.g. the ``columns`` of a
            DataFrame read with ``header=[0, 1]``.

    Returns:
        list[str]: One flat name per input column, in the original order.
    """
    flat_names = []
    for col in columns:
        level0, level1 = col[0], col[1]

        # Clean surrounding whitespace; non-string labels are stringified
        level0 = level0.strip() if isinstance(level0, str) else str(level0)
        level1 = level1.strip() if isinstance(level1, str) else str(level1)

        # If level1 has actual content (not just whitespace), use it.
        # Otherwise use level0; as a last resort combine both placeholders.
        if level1 and level1 != 'nan' and not level1.startswith('Unnamed'):
            flat_names.append(level1)
        elif level0 and not level0.startswith('Unnamed'):
            flat_names.append(level0)
        else:
            flat_names.append(f"{level0}_{level1}")
    return flat_names


# Read with multi-level headers. The file is re-read here so the cell can be
# re-run safely even after demo_df's columns have already been flattened.
demo_df = pd.read_csv(DATA_DIR / "Demographics.csv", header=[0, 1])

# Flatten the multi-level columns properly
# Strategy: Use level 1 (specific names), but if it's empty, use level 0
new_cols = flatten_header_columns(demo_df.columns)

# Two columns collapsing to the same name would make demo_df[name] ambiguous,
# so stop here rather than carry duplicate labels into later cells.
assert len(set(new_cols)) == len(new_cols), (
    f"Flattened column names are not unique: {new_cols}"
)

demo_df.columns = new_cols

print("Column names after flattening:")
print(demo_df.columns.tolist())
print("\nFirst 3 rows:")
demo_df.head(3)

Column names after flattening:
['Patient \nID', 'Treatment Arm', 'Study\n Eye', 'Age', 'Gender', 'Ethnicity', 'Race', 'Type of\n Diabetes', 'Number of Years with Diabetes', 'Baseline HbA1c', 'W24 HbA1c', 'W52 HbA1c', 'W76 HbA1c', 'W104 HbA1c', 'BMI (kg/m^2)', 'ETDRS BCVA', 'CST', 'Injection', 'DRSS', 'Leakage Index']

First 3 rows:


Unnamed: 0,Patient \nID,Treatment Arm,Study\n Eye,Age,Gender,Ethnicity,Race,Type of\n Diabetes,Number of Years with Diabetes,Baseline HbA1c,W24 HbA1c,W52 HbA1c,W76 HbA1c,W104 HbA1c,BMI (kg/m^2),ETDRS BCVA,CST,Injection,DRSS,Leakage Index
0,01-001,2,OS,44,M,N H/L,White,2,20,7.1,8.7,8.4,9.1,8.4,,97,275,Yes,53,1.59
1,01-002,2,OD,56,F,N H/L,White,2,25,11.3,9.1,,,,34.484657,68,238,Yes,47,3.56
2,01-013,2,OD,38,M,H/L,White,1,13,8.0,9.5,9.6,11.7,9.8,25.997929,88,303,Yes,53,5.08


In [None]:
# Check basic info
print("="*80)
print("DEMOGRAPHICS DATA SUMMARY")
print("="*80)

print(f"\nTotal patients: {len(demo_df)}")
# The first column is treated as the patient identifier.
print(f"\nUnique Patient IDs: {demo_df.iloc[:, 0].nunique()}")
print("\nFirst column values (Patient IDs):")
print(demo_df.iloc[:5, 0].values)

# Check categorical variables and their distributions.
print("\n--- Categorical Variables ---")
# Locate the study-eye column by name, ignoring case and treating any run of
# whitespace/underscores as one space. This matches both 'Study_Eye' and the
# raw 'Study\n Eye' header without relying on the column's position.
eye_col = next(
    (
        col for col in demo_df.columns
        if re.sub(r"[\s_]+", " ", str(col)).strip().lower() == "study eye"
    ),
    None,
)
if eye_col is not None:
    print("\nStudy Eye distribution:")
    print(demo_df[eye_col].value_counts())

# Show numeric columns
print("\n--- Numeric Variables (summary) ---")
numeric_cols = demo_df.select_dtypes(include=[np.number]).columns
print(f"Numeric columns: {list(numeric_cols)}")

# Also list the remaining columns: a measurement column showing up here was
# not parsed as numeric and needs cleaning before it can be analysed.
non_numeric_cols = [col for col in demo_df.columns if col not in numeric_cols]
print(f"Non-numeric columns: {non_numeric_cols}")

DEMOGRAPHICS DATA SUMMARY

Total patients: 40

Unique Patient IDs: 40

First column values (Patient IDs):
['01-001' '01-002' '01-013' '01-014' '01-023']

--- Categorical Variables ---

Study Eye distribution:
Study\n Eye
OD    24
OS    16
Name: count, dtype: int64

--- Numeric Variables (summary) ---
Numeric columns: ['Treatment Arm', 'Age', 'Type of\n Diabetes', 'Number of Years with Diabetes', 'Baseline HbA1c', 'W24 HbA1c', 'W76 HbA1c', 'W104 HbA1c', 'BMI (kg/m^2)', 'ETDRS BCVA', 'CST', 'DRSS', 'Leakage Index']


In [None]:
# Descriptive statistics for the numeric columns of the demographics table
stats_heading = "\nBasic statistics:"
print(stats_heading)
numeric_summary = demo_df.describe()
print(numeric_summary)


Basic statistics:
       Treatment Arm        Age  Type of\n Diabetes  \
count       40.00000  40.000000           40.000000   
mean         1.50000  50.825000            1.875000   
std          0.50637  10.876669            0.334932   
min          1.00000  25.000000            1.000000   
25%          1.00000  45.000000            2.000000   
50%          1.50000  51.500000            2.000000   
75%          2.00000  56.500000            2.000000   
max          2.00000  72.000000            2.000000   

       Number of Years with Diabetes  Baseline HbA1c  W24 HbA1c  W76 HbA1c  \
count                      40.000000         40.0000  30.000000  20.000000   
mean                       15.825000          9.1150   8.503333   9.215000   
std                         8.421241          2.2147   1.788562   2.324305   
min                         0.000000          5.1000   4.600000   6.500000   
25%                        11.000000          7.5500   7.400000   7.325000   
50%              

In [18]:
# Report how many values are missing in each demographics column
print("\n--- Missing Data ---")
missing_per_column = demo_df.isna().sum()
print(missing_per_column)


--- Missing Data ---
Patient \nID                      0
Treatment Arm                     0
Study\n Eye                       0
Age                               0
Gender                            0
Ethnicity                         0
Race                              0
Type of\n Diabetes                0
Number of Years with Diabetes     0
Baseline HbA1c                    0
W24 HbA1c                        10
W52 HbA1c                        11
W76 HbA1c                        20
W104 HbA1c                       19
BMI (kg/m^2)                      1
ETDRS BCVA                        0
CST                               0
Injection                         0
DRSS                              0
Leakage Index                     0
dtype: int64


Key findings:

✅ 40 eyes (24 OD, 16 OS) - this matches your thesis document
✅ Age range: 25-72 years (mean ~51)
    Years with diabetes: 0-33 years
    Baseline HbA1c: 5.1-14.7
✅ Baseline BCVA: 83.3 ± 7.9 letters
✅ Baseline CST: 270 ± 24 μm
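⚠️ Missing HbA1c at follow-up visits (W24: 10, W52: 11, W76: 20, W104: 19 of 40 patients); baseline HbA1c is complete. BMI is missing for 1 patient.
⚠️ W52 HbA1c does not appear in the numeric columns list, so it was not parsed as a number and needs cleaning before analysis.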

# PrimeDR Clinical Data Images.csv

In [19]:
# Load PrimeDR Clinical Data Images.csv.
# The file is resolved against DATA_DIR so that loading uses the same
# directory that was listed when verifying the CSV files.
clinical_df = pd.read_csv(DATA_DIR / "PrimeDR Clinical Data Images.csv")

print("="*80)
print("CLINICAL DATA IMAGES (32K+ images with labels)")
print("="*80)

print(f"\nShape: {clinical_df.shape}")
print(f"\nColumns: {clinical_df.columns.tolist()}")
print("\nFirst 5 rows:")
print(clinical_df.head())

# The summary below indexes these columns directly; name any that are absent
# up front instead of failing on the first missing lookup.
missing_cols = {"File_Path", "Eye_ID", "Patient_ID"} - set(clinical_df.columns)
if missing_cols:
    raise KeyError(
        f"Clinical data is missing expected columns: {sorted(missing_cols)}"
    )

print("\n--- Data Summary ---")
print(f"Total image records: {len(clinical_df)}")
print(f"Unique patients: {clinical_df['Patient_ID'].nunique()}")
print(f"Unique eyes: {clinical_df['Eye_ID'].nunique()}")

print("\n--- File Path Examples (first 3) ---")
for path in clinical_df['File_Path'].head(3):
    print(f"  {path}")

CLINICAL DATA IMAGES (32K+ images with labels)

Shape: (32337, 5)

Columns: ['File_Path', 'BCVA', 'CST', 'Eye_ID', 'Patient_ID']

First 5 rows:
                         File_Path  BCVA    CST  Eye_ID  Patient_ID
0  /Prime_FULL/01-001/W0/OS/27.png  97.0  275.0      58          58
1   /Prime_FULL/01-001/W0/OS/0.png  97.0  275.0      58          58
2   /Prime_FULL/01-001/W0/OS/1.png  97.0  275.0      58          58
3  /Prime_FULL/01-001/W0/OS/10.png  97.0  275.0      58          58
4  /Prime_FULL/01-001/W0/OS/11.png  97.0  275.0      58          58

--- Data Summary ---
Total image records: 32337
Unique patients: 40
Unique eyes: 40

--- File Path Examples (first 3) ---
  /Prime_FULL/01-001/W0/OS/27.png
  /Prime_FULL/01-001/W0/OS/0.png
  /Prime_FULL/01-001/W0/OS/1.png


PRIME DATASET PREPROCESSING - PHASE 1: INSPECTION


### STEP 1: INSPECTING CSV STRUCTURES ###


INSPECTING: Demographics

Shape: 5 rows × 20 columns

Column names:
['Patient Information', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Diabetes', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Baseline', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19']

--- First 3 rows (showing first 10 columns) ---
  Patient Information     Unnamed: 1   Unnamed: 2 Unnamed: 3 Unnamed: 4  \
0        Patient \nID  Treatment Arm  Study\n Eye        Age     Gender   
1              01-001              2           OS         44          M   
2              01-002              2           OD         56          F   

  Unnamed: 5 Unnamed: 6            Diabetes                     Unnamed: 8  \
0  Ethnicity       Race  Type of\n Diabetes  Number of Years with Diabetes   
1      N H/L      White            