In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import joblib

# --- 1. DATA PREPARATION ---
# Read the raw CSV and discard exact duplicate rows in one chained expression,
# so `df` is bound once and never mutated in place.
df = pd.read_csv('/content/insurance.csv').drop_duplicates()

# Target column first, then the feature matrix (every column except the target).
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows BEFORE anything is fitted, so nothing learned from
# the test rows can reach the preprocessing or the model (no data leakage).
# The fixed seed makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 2. DEFINE THE PREPROCESSING PIPELINE ---

# Columns that hold category labels and therefore need one-hot encoding.
categorical_features = ['sex', 'smoker', 'region']
# You could also have numerical features to scale here if needed

# Build the 'preprocessor': one-hot encode the categorical columns and pass
# every remaining column through unchanged.
#
# `drop='first'` is deliberately NOT combined with `handle_unknown='ignore'`:
#   * with 'ignore', a category unseen during fit is encoded as all zeros,
#     which is exactly the encoding of the dropped first level, so an unknown
#     value would silently be treated as that baseline category;
#   * scikit-learn releases before 1.0 reject the combination with a ValueError.
# Dropping a level is only useful for estimators that are sensitive to
# perfectly collinear inputs; the GradientBoostingRegressor this notebook
# imports is tree-based and does not need it.
preprocessor = ColumnTransformer(
    transformers=[
        # Full one-hot encoding: unknown categories become an all-zero row
        # that is distinct from every category seen during fit.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Keep the non-categorical columns as they are
)

# --- 3. DEFINE THE FULL MODEL PIPELINE ---

# Hyperparameters reported as the best result of tuning
# (the tuning run itself is not part of this cell).
best_params = dict(learning_rate=0.1, max_depth=2, n_estimators=150)

# Two-step pipeline: the preprocessing transformer feeds the gradient-boosting
# regressor, so both are fitted and applied together as one estimator.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42, **best_params)),
])

# --- 4. TRAIN THE ENTIRE PIPELINE ---
# One fit call on the training split; the pipeline runs its preprocessing step
# and then trains the regressor internally.
# NOTE(review): X_test / y_test are not used anywhere in this cell. Consider
# scoring the hold-out split before relying on the saved artifact.
print("Training the full pipeline...")
model_pipeline.fit(X=X_train, y=y_train)
print("Pipeline training complete.")

# --- 5. SAVE THE PIPELINE ---
# Persist the whole fitted pipeline (preprocessing + regressor) as a single
# object, rather than the bare model, so raw rows can be fed to it after loading.
joblib.dump(value=model_pipeline, filename='insurance_pipeline_v1.joblib')
print("\nFull pipeline saved successfully as 'insurance_pipeline_v1.joblib'")

Training the full pipeline...
Pipeline training complete.

Full pipeline saved successfully as 'insurance_pipeline_v1.joblib'
