## Provider-Level RTT Data Loader

This script ingests Referral to Treatment (RTT) data from Excel files at the *Provider* level and loads it into the `rtt_data` table in a PostgreSQL database. It supports multiple RTT pathways including `Incomplete`, `Admitted`, `Non-Admitted`, `New`, and `DTA`.

### How It Works:
- **User Input**: Set the `FILE_PATH`, `YEAR`, `MONTH`, and `PATHWAY_TYPE` for the data file.
- **Sheet Selection**: Automatically chooses the relevant Excel sheet (e.g., "Provider" or "Provider with DTA").
- **Cleaning**: Skips metadata rows, parses percentage columns, and handles edge cases (e.g., zeros and hyphens).
- **Transformation**: Melts the data into a long format with standard column names.
- **Validation**: Handles type conversions and missing values.
- **Loading**: Writes data safely into the `rtt_data` table using SQLAlchemy transactions.

Ensure that the pathway type matches the structure of the input Excel sheet and that the database connection string is correctly configured.


In [None]:
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path
from sqlalchemy.exc import SQLAlchemyError


In [None]:
# === USER INPUT ===
# Workbook to ingest, plus the reporting period and RTT pathway it describes.
FILE_PATH = '../data/providers/NewPeriods_apr2023_march2024/New-Periods-Provider-12-Mar24.xlsx'
YEAR = 2024
MONTH = 3
# Must be one of: 'Incomplete', 'Admitted', 'Non-Admitted', 'New', 'DTA'
PATHWAY_TYPE = 'New'

# === METRIC LOOKUP TABLE ===
# 'Admitted' and 'Non-Admitted' report the same set of metric columns, so the
# list is written once and copied into each entry.
_COMPLETED_PATHWAY_METRICS = (
    'Total number of completed pathways (all)',
    'Average (median) waiting time (in weeks)',
    '95th percentile waiting time (in weeks)',
    'Total 52 plus weeks',
)

# Pathway type -> names of the metric columns recorded for that pathway.
METRICS_LOOKUP = {
    'Incomplete': [
        'Total number of incomplete pathways',
        '% within 18 weeks',
        'Average (median) waiting time (in weeks)',
        '92nd percentile waiting time (in weeks)',
        'Total 52 plus weeks',
    ],
    'Admitted': list(_COMPLETED_PATHWAY_METRICS),
    'Non-Admitted': list(_COMPLETED_PATHWAY_METRICS),
    'New': [
        'Number of new RTT clock starts during the month',
    ],
    'DTA': [
        'Total number of incomplete pathways with a decision to admit for treatment',
        '% of incomplete pathways with a decision to admit for treatment',
    ],
}

# === VALIDATION ===
# Stop immediately on a pathway type that has no metric definition.
if PATHWAY_TYPE not in METRICS_LOOKUP.keys():
    raise ValueError(f"Unsupported pathway type: {PATHWAY_TYPE}")

# === DB CONNECTION ===
# Engine for the `nhs_dashboard` PostgreSQL database on localhost.
# NOTE: `<password>` is a placeholder and must be replaced before running.
engine = create_engine(
    "postgresql://postgres:<password>@localhost:5432/nhs_dashboard"
)

# === LOAD + TRANSFORM ===
# DTA figures are read from their own worksheet; every other pathway type
# is read from the plain "Provider" sheet.
if PATHWAY_TYPE == "DTA":
    sheet_name = "Provider with DTA"
else:
    sheet_name = "Provider"

# Skip the 13 leading rows so the next row is used as the column header
# (presumably workbook metadata/title rows — confirm against the file).
df = pd.read_excel(FILE_PATH, skiprows=13, sheet_name=sheet_name)

# Convert percentage strings to float (remove % symbol)
# Percentage-valued metric columns, keyed by the pathway type they belong to.
PERCENT_COLUMNS = {
    'DTA': ['% of incomplete pathways with a decision to admit for treatment'],
    'Incomplete': ['% within 18 weeks']
}

for col in PERCENT_COLUMNS.get(PATHWAY_TYPE, []):
    # Tolerate workbooks in which the percentage column is absent.
    if col not in df.columns:
        continue

    as_text = df[col].astype(str).str.strip()

    # A lone hyphen means "no value". Blank it with mask() instead of
    # Series.replace('-', None): before pandas 1.4 an explicit value=None was
    # ignored and the cell was forward-filled from the previous row, silently
    # copying another row's percentage. mask() yields a missing value on
    # every pandas version.
    as_text = as_text.mask(as_text == '-')

    # Drop the trailing '%' and convert; missing entries become NaN, and any
    # other non-numeric text still raises ValueError as before.
    df[col] = as_text.str.rstrip('%').astype(float)



# Identifier columns that are kept as-is on every melted row.
expected_id_vars = [
    'Provider Code',
    'Provider Name',
    'Region Code',
    'Treatment Function Code',
    'Treatment Function',
]

# If total pathways is 0, set % within 18 weeks to NaN
# (a share of zero pathways carries no information).
if PATHWAY_TYPE == 'Incomplete':
    mask = df['Total number of incomplete pathways'].eq(0)
    df.loc[mask, '% within 18 weeks'] = None

# Reshape wide -> long: one row per identifier combination and metric,
# with the metric's column name in `metric` and its cell in `value`.
melted = pd.melt(
    df,
    id_vars=expected_id_vars,
    value_vars=METRICS_LOOKUP[PATHWAY_TYPE],
    var_name='metric',
    value_name='value',
)

# Column renames from the workbook headers to the names used for the DB load.
_DB_COLUMN_NAMES = {
    'Provider Code': 'org_code',
    'Provider Name': 'org_name',
    'Region Code': 'region_code',
    'Treatment Function Code': 'treatment_function_code',
    'Treatment Function': 'treatment_function',
}

melted = (
    melted
    .assign(
        # Convert value column to numeric, force bad values to NaN
        value=pd.to_numeric(melted['value'], errors='coerce'),
        # Add metadata (the same constant on every row)
        year=YEAR,
        month=MONTH,
        pathway_type=PATHWAY_TYPE,
        geo_level='Provider',
    )
    # Rename for DB
    .rename(columns=_DB_COLUMN_NAMES)
)

# === LOAD TO POSTGRES ===
# Append the melted rows to `rtt_data` inside a single transaction.
try:
    # engine.begin() commits when the block exits normally and rolls back if
    # an exception escapes it, so a failed load leaves no partial rows.
    with engine.begin() as connection:
        melted.to_sql('rtt_data', con=connection, if_exists='append', index=False)
except SQLAlchemyError as e:
    # Report the failure and carry on without raising, as before.
    print(f"ERROR: Data load failed for {FILE_PATH}")
    print(str(e))
else:
    # Report success once, and only when the transaction committed. The
    # former unconditional print also ran after a failed load.
    print(f"Loaded {len(melted)} rows from {FILE_PATH}")
