In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import joblib

# =============================================================================
# 1. Data Loading and Cleaning
# This section reproduces the successful cleaning steps from your notebook.
# =============================================================================
print("Loading and cleaning raw data...")

# Load the raw data
# Make sure the path to your raw data is correct
df = pd.read_csv('/content/Medicare_IP_Hospitals_by_Provider_and_Service_2023.csv', encoding='latin1')

# Rename columns for clarity
df = df.rename(columns={
    "ï»¿Rndrng_Prvdr_CCN": "provider_id",
    "Rndrng_Prvdr_Org_Name": "provider_name",
    "Rndrng_Prvdr_City": "city",
    "Rndrng_Prvdr_St": "state",
    "Rndrng_Prvdr_State_FIPS": "state_fips",
    "Rndrng_Prvdr_Zip5": "zip_code",
    "Rndrng_Prvdr_State_Abrvtn": "state_abbr",
    "Rndrng_Prvdr_RUCA": "ruca_code",
    "Rndrng_Prvdr_RUCA_Desc": "ruca_desc",
    "DRG_Cd": "drg_code",
    "DRG_Desc": "drg_desc",
    "Tot_Dschrgs": "total_discharges",
    "Avg_Submtd_Cvrd_Chrg": "avg_submitted_charge",
    "Avg_Tot_Pymt_Amt": "avg_total_payment",
    "Avg_Mdcr_Pymt_Amt": "avg_medicare_payment"
})

# Correct data types for money and discharge columns
numeric_columns = ['avg_submitted_charge', 'avg_total_payment', 'avg_medicare_payment', 'total_discharges']
for col in numeric_columns:
    if df[col].dtype == 'object':
        df[col] = df[col].replace({'\$': '', ',': ''}, regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows with missing values
df.dropna(inplace=True)

# =============================================================================
# 2. Feature Engineering
# This section reproduces your successful feature engineering steps.
# =============================================================================
print("Engineering new features...")

def categorize_procedure(drg_desc):
    drg_desc = drg_desc.lower()
    if 'transplant' in drg_desc or 'ecmo' in drg_desc or 'tracheostomy' in drg_desc:
        return 'Major Surgery/Intensive Care'
    elif 'cardiac' in drg_desc or 'heart' in drg_desc or 'valve' in drg_desc:
        return 'Cardiology'
    elif 'joint' in drg_desc or 'spinal fusion' in drg_desc or 'back & neck' in drg_desc:
        return 'Orthopedics'
    elif 'nervous system' in drg_desc or 'craniotomy' in drg_desc:
        return 'Neurology'
    elif 'pulmonary' in drg_desc or 'respiratory' in drg_desc:
        return 'Pulmonary'
    elif 'septicemia' in drg_desc:
        return 'Infections'
    else:
        return 'Other'

df['procedure_category'] = df['drg_desc'].apply(categorize_procedure)

# Apply the log transform to the target variable
df['avg_total_payment_log'] = np.log1p(df['avg_total_payment'])


# =============================================================================
# 3. Final Model Preparation
# =============================================================================
print("Preparing data for the final model...")

# Define final features and target
features = ['state_abbr', 'ruca_code', 'total_discharges', 'procedure_category']
target = 'avg_total_payment_log'

X = df[features]
y = df[target]

# We don't need a train-test split here, as we train the final pipeline on all available data.
# This is a common practice for creating a production model.

# Define the preprocessing steps
categorical_features = ['state_abbr', 'procedure_category']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ],
    remainder='passthrough'
)

# Use the best hyperparameters you found during tuning
best_params = {
    'num_leaves': 31,
    'n_estimators': 300,
    'min_child_samples': 30,
    'max_depth': 7,
    'learning_rate': 0.1,
    'random_state': 42
}

# =============================================================================
# 4. Build and Train the Final Pipeline
# =============================================================================

# Create the full pipeline object
final_pipeline_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(**best_params))
])

print("Training the final V2 pipeline on all data...")
# Train the pipeline on the entire dataset
final_pipeline_v2.fit(X, y)
print("Training complete.")


# =============================================================================
# 5. Save the Final Pipeline
# This is your final, production-ready model artifact for Phase 2.
# =============================================================================
joblib.dump(final_pipeline_v2, 'insurance_pipeline_v2.joblib')
print("\n✅ V2 pipeline saved successfully as 'insurance_pipeline_v2.joblib'!")

  df[col] = df[col].replace({'\$': '', ',': ''}, regex=True)


Loading and cleaning raw data...
Engineering new features...
Preparing data for the final model...
Training the final V2 pipeline on all data...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002640 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 385
[LightGBM] [Info] Number of data points in the train set: 146423, number of used features: 58
[LightGBM] [Info] Start training from score 9.560788
Training complete.

✅ V2 pipeline saved successfully as 'insurance_pipeline_v2.joblib'!
