In [248]:
import sys
import importlib
import os
import pandas as pd
import logging
import numpy as np
import re

# Remove old imports to avoid cache issues
if 'ingest_data' in sys.modules:
    del sys.modules['ingest_data']

# Add the correct folder to sys.path
sys.path.append("/Users/youngeddieb/PycharmProjects/BI-Analytics/BI-Analytics/Pipeline")

import ingest_data
importlib.reload(ingest_data)

print(ingest_data.__file__)
print(dir(ingest_data))


/Users/youngeddieb/PycharmProjects/BI-Analytics/BI-Analytics/Pipeline/ingest_data.py
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'appointments_path', 'creating_logger', 'doctors_path', 'extract_data', 'logging', 'os', 'pd']


In [249]:
doctors_df, appointments_df = ingest_data.extract_data(ingest_data.doctors_path,ingest_data.appointments_path)
doctors_df.head()
appointments_df.head()


2025-11-02 14:12:33,813 | INFO | === START: INGEST ===
2025-11-02 14:12:33,859 | INFO | Reading doctors file from: /Users/youngeddieb/PycharmProjects/BI-Analytics/BI-Analytics/Pipeline/datasets/Data Enginner's Doctors Excel - VIP Medical Group.xlsx
2025-11-02 14:12:33,896 | INFO | Doctors file loaded: 5 rows
2025-11-02 14:12:33,896 | INFO | Reading appointments file from: /Users/youngeddieb/PycharmProjects/BI-Analytics/BI-Analytics/Pipeline/datasets/Data Engineer's Appointments Excel - VIP Medical Group.xlsx
2025-11-02 14:12:33,926 | INFO | Appointments file loaded: 1025 rows
2025-11-02 14:12:33,927 | INFO | === END: INGEST ===


Unnamed: 0,booking_id,patient_id,doctor_id,booking_date,status
0,1,10.0,100.0,10/20/2025,confirmed
1,2,11.0,101.0,10/21/2025,cancelled
2,3,12.0,100.0,10/20/2025,confirmed
3,4,13.0,101.0,10/20/2025,confirmed
4,5,14.0,102.0,10/20/2025,cancelled


In [250]:
logger = logging.getLogger("transform")
if not logger.handlers:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

In [251]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize column names: lowercase, remove spaces/symbols."""
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("-", "_")
    )
    return df

In [252]:
def transform_doctors(doctors_df: pd.DataFrame) -> pd.DataFrame:
    """Clean and standardize doctors data."""
    logger.info("Transforming doctors dataset...")
    df = doctors_df.copy()
    df = normalize_columns(df)

    # --- Rename columns ---
    rename_map = {
        "doctor_id": "doctor_id",
        "name": "doctor_name",
        "specialty": "specialty",
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    # Remove duplicates based on doctor_id if available
    if "doctor_id" in df.columns:
        df = df.drop_duplicates(subset=["doctor_id"], keep="last")

    # Trim string columns
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].str.strip()

    # Add ingestion timestamp
    df["ingested_at"] = pd.Timestamp.utcnow()

    logger.info(f"Doctors cleaned: {len(df)} rows")
    return df


In [253]:
def transform_appointments(appointments_df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize appointments data.
    """
    logger.info("Transforming appointments dataset...")
    df = appointments_df.copy()
    df = normalize_columns(df)

    # --- Rename columns ---
    rename_map = {
        "booking_id": "appointment_id",
        "patientid": "patient_id",
        "doctorid": "doctor_id",
        "booking_date": "appointment_date",
        "date": "appointment_date",
        "datetime": "appointment_date",
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    # --- Clean appointment_id (remove non-numeric like 'X') ---
    if "appointment_id" in df.columns:
        df["appointment_id"] = (
            df["appointment_id"].astype(str)
            .str.replace(r"[^0-9]", "", regex=True)
        )
        df["appointment_id"] = pd.to_numeric(df["appointment_id"], errors="coerce").fillna(0).astype(int)

    # --- Convert patient_id and doctor_id to numeric ---
    for col in ["patient_id", "doctor_id"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    # --- Fix dates: normalize formats and correct invalid years ---
    if "appointment_date" in df.columns:
        import re

        def fix_date(date_str):
            date_str = str(date_str).strip()
            # Match possible formats
            match = (
                re.match(r'(\d{1,2})/(\d{1,2})/(\d{4})', date_str) or   # MM/DD/YYYY
                re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', date_str) or   # YYYY-MM-DD
                re.match(r'(\d{4})/(\d{1,2})/(\d{1,2})', date_str)      # YYYY/MM/DD
            )

            if match:
                groups = match.groups()
                if '/' in date_str and date_str.index('/') < 3:  # MM/DD/YYYY
                    month, day, year = groups
                else:  # YYYY-MM-DD or YYYY/MM/DD
                    year, month, day = groups

                year = '2025' if int(year) > 2025 else year
                return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

            return date_str  # leave untouched if no pattern matches

        df["appointment_date"] = df["appointment_date"].apply(fix_date)
        logger.info("Dates normalized to YYYY-MM-DD format")

    # --- Standardize status values ---
    if "status" in df.columns:
        df["status"] = (
            df["status"]
            .astype(str)
            .str.lower()
            .str.strip()
            .str.rstrip(".")
            .replace({"canceled": "cancelled"})
        )
        logger.info("Status values standardized")

    # --- Add ingestion timestamp ---
    df["ingested_at"] = pd.Timestamp.utcnow()

    return df


In [None]:
if __name__ == "__main__":
    # assume: sys, logger, extract_data, transform_doctors, transform_appointments already available

    # Keep your existing path append & import
    sys.path.append("/Users/youngeddieb/PyCharmProjects/BI-Analytics/BI-Analytics/Pipeline")
    from ingest_data import extract_data

    # Paths
    project_root = "/Users/youngeddieb/PyCharmProjects/BI-Analytics/BI-Analytics/Pipeline"
    doctors_path = f"{project_root}/datasets/Data Enginner's Doctors Excel - VIP Medical Group.xlsx"
    appointments_path = f"{project_root}/datasets/Data Engineer's Appointments Excel - VIP Medical Group.xlsx"

    # 1) Extract
    doctors_df, appointments_df = extract_data(doctors_path, appointments_path)

    # 2) Transform (kept only in memory)
    doctors_clean = transform_doctors(doctors_df)
    appointments_clean = transform_appointments(appointments_df)

    # 3) Print previews (no CSV writes)
    logger.info("Transformations complete. Holding DataFrames in memory.")

    print("\n=== Doctors (shape: {} rows x {} cols) ===".format(*doctors_clean.shape))
    print(doctors_clean.head())

    print("\n=== Appointments (shape: {} rows x {} cols) ===".format(*appointments_clean.shape))
    print(appointments_clean.head())

    # 4) Optional: Save output DataFrames as CSVs
    output_dir = f"{project_root}/output"
    os.makedirs(output_dir, exist_ok=True)

    doctors_output_path = f"{output_dir}/doctors_clean.csv"
    appointments_output_path = f"{output_dir}/appointments_clean.csv"

    doctors_clean.to_csv(doctors_output_path, index=False)
    appointments_clean.to_csv(appointments_output_path, index=False)

    logger.info(f"Saved transformed datasets to: {output_dir}")
    print(f"\nSaved files:\n- {doctors_output_path}\n- {appointments_output_path}")


2025-11-02 14:12:34,014 | INFO | === START: INGEST ===
2025-11-02 14:12:34,015 | INFO | Reading doctors file from: /Users/youngeddieb/PyCharmProjects/BI-Analytics/BI-Analytics/Pipeline/datasets/Data Enginner's Doctors Excel - VIP Medical Group.xlsx
2025-11-02 14:12:34,027 | INFO | Doctors file loaded: 5 rows
2025-11-02 14:12:34,028 | INFO | Reading appointments file from: /Users/youngeddieb/PyCharmProjects/BI-Analytics/BI-Analytics/Pipeline/datasets/Data Engineer's Appointments Excel - VIP Medical Group.xlsx
2025-11-02 14:12:34,057 | INFO | Appointments file loaded: 1025 rows
2025-11-02 14:12:34,058 | INFO | === END: INGEST ===
2025-11-02 14:12:34,058 | INFO | Transforming doctors dataset...
2025-11-02 14:12:34,059 | INFO | Doctors cleaned: 5 rows
2025-11-02 14:12:34,060 | INFO | Transforming appointments dataset...
2025-11-02 14:12:34,062 | INFO | Dates normalized to YYYY-MM-DD format
2025-11-02 14:12:34,063 | INFO | Status values standardized
2025-11-02 14:12:34,064 | INFO | Transfor


=== Doctors (shape: 5 rows x 4 cols) ===
   doctor_id    doctor_name specialty                      ingested_at
0        100      Dr. PÃ©rez      Vein 2025-11-02 19:12:34.059573+00:00
1        101      Dr. GÃ³mez      Pain 2025-11-02 19:12:34.059573+00:00
2        102    Dr. SÃ¡nchez      Vein 2025-11-02 19:12:34.059573+00:00
3        103  Dr. RodrÃ­guez      Pain 2025-11-02 19:12:34.059573+00:00
4        104   Dr. MartÃ­nez      Vein 2025-11-02 19:12:34.059573+00:00

=== Appointments (shape: 1025 rows x 6 cols) ===
   appointment_id  patient_id  doctor_id appointment_date     status  \
0               1          10        100       2025-10-20  confirmed   
1               2          11        101       2025-10-21  cancelled   
2               3          12        100       2025-10-20  confirmed   
3               4          13        101       2025-10-20  confirmed   
4               5          14        102       2025-10-20  cancelled   

                       ingested_at  
0 2025-