In [0]:
%pip install catboost

In [0]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [0]:
# Load the RFM table through the Spark session and materialise it as a pandas
# DataFrame for modelling. toPandas() pulls the whole table into driver memory.
SOURCE_TABLE = "retail_catalog.retail_schema.rfm_table"
df = spark.read.table(SOURCE_TABLE)
pd_model_df = df.toPandas()
pd_model_df.head()

In [0]:
# Convert categorical fields to numerical.
# One-hot encoding of the categorical columns ('shopping_mall', 'payment_method', 'category', 'gender') -- currently disabled (commented out):
#pd_model_df = pd.get_dummies(pd_model_df, columns=['shopping_mall','payment_method','category','gender'])

In [0]:
# Preview the first five rows of the modelling frame.
pd_model_df.head(n=5)

In [0]:
# --- 2. Define features (X) and target (y) ---
# The model predicts 'total_revenue' from the columns listed in `features`.
# NOTE(review): if total_revenue is derived from quantity and price, those two
# inputs leak the target -- confirm against how rfm_table is built.
target = 'total_revenue'
features = [
    'gender',
    'age',
    'category',
    'quantity',
    'price',
    'rfm_score',
    'payment_method',
    'shopping_mall',
]

y = pd_model_df[target]
X = pd_model_df[features]

# --- 3. Identify categorical features for CatBoost ---
# These columns are handed to CatBoost as categorical features at fit time
# instead of being encoded by hand.
cat_features = ['gender', 'category', 'payment_method', 'shopping_mall']

# --- 4. Split data into training and test sets ---
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical across runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training on {len(X_train)} rows, testing on {len(X_test)} rows.")

In [0]:
# --- 5. Initialize and train the CatBoost model ---
# Hyper-parameters are collected in one mapping and unpacked into the
# constructor, so they can be read in a single place.
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'loss_function': 'MAE',  # optimise Mean Absolute Error
    'verbose': 0,            # suppress training output
}
cat_model = CatBoostRegressor(**catboost_params)

print("\nTraining CatBoost model...")
# The categorical columns are declared when fitting.
cat_model.fit(X_train, y_train, cat_features=cat_features)

In [0]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

In [0]:
# --- 6. Make predictions on the test set ---
predictions = cat_model.predict(X_test)

# --- 7. Evaluate the model's performance ---
# Three complementary metrics on the held-out rows; RMSE is obtained as the
# square root of the mean squared error.
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)

print("\nPrediction complete.")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# Side-by-side table of the true target and the prediction rounded to 2 decimals
# (indexed like y_test).
print("\nSample Predictions vs. Actual values:")
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': np.round(predictions, 2)})
print(comparison_df)

summary_lines = [
    "Evaluation Metrics:",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R²): {r2:.2f}",
]
for summary_line in summary_lines:
    print(summary_line)

In [0]:
from mlflow.models import infer_signature

In [0]:
# --- 8. Track the training run with MLflow ---
# Fits the model inside an MLflow run, logs its hyper-parameters and test-set
# metrics, then logs the fitted model and registers it in the Model Registry.
with mlflow.start_run() as run:
    # --- Log model parameters ---
    # Read the hyper-parameters back from the estimator so the logged values are
    # exactly the ones the model was constructed with. Re-declaring them here
    # with trailing commas (`iterations=100,`) would create one-element tuples
    # and log values such as "(100,)" instead of "100".
    logged_param_names = ("iterations", "learning_rate", "loss_function", "verbose")
    model_params = cat_model.get_params()
    mlflow.log_params(
        {name: model_params[name] for name in logged_param_names if name in model_params}
    )

    # --- Train inside the run ---
    cat_model.fit(X_train, y_train, cat_features=cat_features)

    # --- Evaluate the freshly fitted model and log metrics ---
    # Predictions are recomputed after fit() so the logged metrics describe the
    # model that is actually being logged, not an earlier fit.
    predictions = cat_model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # --- Signature and input example ---
    # The signature is inferred from the training features and the model's
    # predictions on them. Only a handful of rows are stored as the input
    # example; passing the whole training frame would persist the full
    # training set alongside the model artifact.
    signature = infer_signature(X_train, cat_model.predict(X_train))
    input_example = X_train.head(5)

    # --- 9. Log the model and register it in the Model Registry ---
    # Passing 'registered_model_name' registers the logged model under that name.
    # NOTE(review): the registry name says "Customer_Segmentation" while the model
    # is a revenue regressor -- left unchanged so existing consumers keep working.
    mlflow.catboost.log_model(
        cb_model=cat_model,
        artifact_path="CatBoostRegressor_model",
        registered_model_name="Customer_Segmentation_Model",
        signature=signature,
        input_example=input_example,
    )

    print(f"MLflow Run ID: {run.info.run_id}")
    print("Model logged and registered under: Customer_Segmentation_Model")