In [193]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import joblib
import os

In [194]:
import xgboost
print(xgboost.__version__)


3.0.5


In [195]:

xgb_model = xgb.XGBRegressor()
print(type(xgb_model))


<class 'xgboost.sklearn.XGBRegressor'>


In [196]:
# Load synthetic data
df = pd.read_csv("../data/sales_data.csv", parse_dates=['date'])


for each product_id, it looks at the last 7 days of sales (units_sold).
It computes a rolling average → “on average, how many units were sold in the last week?”
Helps smooth out fluctuations and capture short-term demand trends.

In [197]:
df['moving_avg_demand'] = df.groupby('product_id')['units_sold'].transform(
    lambda x: x.rolling(7, min_periods=1).mean()
)

Price elasticity measures how sensitive demand is to price changes.
Formula: % change in demand ÷ % change in price.
Example: If price goes up 10% and demand drops 20%, elasticity = -2 (demand is very sensitive).
This feature helps the model understand if lowering the price will significantly increase sales.

In [198]:
df['price_elasticity'] = (df['units_sold'].pct_change() / df['historical_price'].pct_change()).fillna(0)

Similar to moving average but with a 30-day window.
Captures long-term sales trends (seasonality, steady growth/decline).

In [199]:
df['trend_factor'] = df.groupby('product_id')['units_sold'].transform(lambda x: x.rolling(30, min_periods=1).mean())

The model will use these features to predict the optimal price.
Key features:
Demand-related: units_sold, moving_avg_demand, trend_factor
Competition-related: competitor_price
Inventory-related: stock_level
Behavioral signals: views (how many people looked at the product)
Contextual factors: day_of_week, holiday_flag
Elasticity: how demand reacts to price changes
Target variable = historical_price → the model tries to learn pricing patterns from past data.

In [200]:
features = ['units_sold', 'competitor_price', 'stock_level', 'day_of_week', 'holiday_flag',
            'views', 'moving_avg_demand', 'price_elasticity', 'trend_factor']
target = 'historical_price'

In [201]:
# Feature engineering
df['moving_avg_demand'] = df.groupby('product_id')['units_sold'].transform(lambda x: x.rolling(7, min_periods=1).mean())
df['price_elasticity'] = (df['units_sold'].pct_change() / df['historical_price'].pct_change()).fillna(0)
df['trend_factor'] = df.groupby('product_id')['units_sold'].transform(lambda x: x.rolling(30, min_periods=1).mean())

features = ['units_sold', 'competitor_price', 'stock_level', 'day_of_week', 'holiday_flag',
            'views', 'moving_avg_demand', 'price_elasticity', 'trend_factor']
target = 'historical_price'

In [202]:
X = df[features]
y = df[target]

In [203]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [204]:
os.makedirs("df", exist_ok=True) 

In [205]:
joblib.dump(X_test, "df/X_test.pkl")
joblib.dump(y_test, "df/y_test.pkl")

['df/y_test.pkl']

In [206]:
import numpy as np
import pandas as pd

print(np.isinf(X_train).sum())   # Count of infinite values
print(np.isnan(X_train).sum())   # Count of NaN values


units_sold                0
competitor_price          0
stock_level               0
day_of_week               0
holiday_flag              0
views                     0
moving_avg_demand         0
price_elasticity     116584
trend_factor              0
dtype: int64
units_sold           0
competitor_price     0
stock_level          0
day_of_week          0
holiday_flag         0
views                0
moving_avg_demand    0
price_elasticity     0
trend_factor         0
dtype: int64


In [207]:
import numpy as np

# Replace inf/-inf with NaN in both train and test sets
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Fill specific column 'price_elasticity' NaNs with 0
X_train['price_elasticity'] = X_train['price_elasticity'].fillna(0)
X_test['price_elasticity'] = X_test['price_elasticity'].fillna(0)

# Fill any remaining NaNs in all columns with 0
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Optional: Verify
print("Train set missing values:\n", X_train.isnull().sum())
print("Test set missing values:\n", X_test.isnull().sum())


Train set missing values:
 units_sold           0
competitor_price     0
stock_level          0
day_of_week          0
holiday_flag         0
views                0
moving_avg_demand    0
price_elasticity     0
trend_factor         0
dtype: int64
Test set missing values:
 units_sold           0
competitor_price     0
stock_level          0
day_of_week          0
holiday_flag         0
views                0
moving_avg_demand    0
price_elasticity     0
trend_factor         0
dtype: int64


Key parameters:
n_estimators=1000 → maximum number of boosting iterations (trees).
learning_rate=0.05 → controls step size (smaller = slower but more accurate learning).
max_depth=7 → limits how deep trees can grow (prevents overfitting).
random_state=42 → ensures reproducibility.

In [208]:
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    random_state=42
)

Training on (X_train, y_train).
Evaluating performance on a validation set (X_test, y_test).
Early Stopping: If the model doesn’t improve for 50 rounds, training stops early to avoid overfitting.
Log Evaluation: Prints evaluation metrics every 50 iterations so you can monitor progress.

In [209]:
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)
    ]
)
joblib.dump(lgb_model, '../models/lgb_model.pkl')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 146400, number of used features: 9
[LightGBM] [Info] Start training from score 275.740827
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l2: 194.668
[100]	valid_0's l2: 98.0308
[150]	valid_0's l2: 97.3593
Early stopping, best iteration is:
[149]	valid_0's l2: 97.3554


['../models/lgb_model.pkl']

XGBoost uses its own optimized data structure (DMatrix) for training.
This makes training faster and memory-efficient.

In [210]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [211]:
import xgboost as xgb
from xgboost.callback import EarlyStopping

Similar to your LightGBM setup.
reg:squarederror → standard regression objective.
Using MAE as the metric since it’s interpretable in pricing.

In [212]:
params = {
    "objective": "reg:squarederror",  # regression problem
    "learning_rate": 0.05,            # step size
    "max_depth": 7,                   # tree depth
    "eval_metric": "mae"              # evaluation metric = Mean Absolute Error
}

Trains up to 1000 boosting rounds (trees).
Uses early stopping: stops if no improvement for 10 rounds.
save_best=True → keeps the best-performing model.

In [213]:
xgb_model= xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, "validation")],
    callbacks=[EarlyStopping(rounds=10, save_best=True)]
)

[0]	validation-mae:103.39464
[1]	validation-mae:98.25597
[2]	validation-mae:93.37557
[3]	validation-mae:88.74124


[4]	validation-mae:84.33767
[5]	validation-mae:80.15652
[6]	validation-mae:76.18469
[7]	validation-mae:72.41474
[8]	validation-mae:68.83322
[9]	validation-mae:65.43345
[10]	validation-mae:62.20584
[11]	validation-mae:59.14166
[12]	validation-mae:56.23220
[13]	validation-mae:53.47088
[14]	validation-mae:50.84958
[15]	validation-mae:48.36273
[16]	validation-mae:46.00198
[17]	validation-mae:43.76359
[18]	validation-mae:41.63969
[19]	validation-mae:39.62636
[20]	validation-mae:37.71585
[21]	validation-mae:35.90444
[22]	validation-mae:34.18773
[23]	validation-mae:32.56018
[24]	validation-mae:31.01853
[25]	validation-mae:29.55871
[26]	validation-mae:28.17604
[27]	validation-mae:26.86707
[28]	validation-mae:25.62770
[29]	validation-mae:24.45620
[30]	validation-mae:23.34763
[31]	validation-mae:22.30014
[32]	validation-mae:21.31081
[33]	validation-mae:20.37618
[34]	validation-mae:19.49488
[35]	validation-mae:18.66526
[36]	validation-mae:17.88461
[37]	validation-mae:17.15046
[38]	validation-mae:

In [214]:
print("Best iteration:", xgb_model.best_iteration)
print("Best score:", xgb_model.best_score)

Best iteration: 150
Best score: 8.125601074031142


In [215]:
model_dir = r"C:\Users\KIIT\Desktop\dynamic_pricing_\models"
os.makedirs(model_dir, exist_ok=True)  # creates 'models' if it doesn't exist

xgb_model.save_model(os.path.join(model_dir, "xgb_model.json"))

print("XGBoost model saved successfully!")

XGBoost model saved successfully!


Inputs:
y_true: The actual prices (y_test).
y_pred: The predicted prices from the model.
model_name: A string ("LightGBM" or "XGBoost") just for labeling results.
Metrics:

MAE (Mean Absolute Error) → average absolute difference between predicted and true prices.
→ “On average, how far off were we?”

RMSE (Root Mean Squared Error) → square root of mean squared error.
→ Similar to MAE but penalizes larger errors more (important in pricing where big mistakes hurt).

Output: Prints a neat summary and returns (mae, rmse).

In [216]:
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # manual RMSE
    print(f"{model_name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}")
    return mae, rmse

# LightGBM predictions
y_pred_lgb = lgb_model.predict(X_test)
evaluate_model(y_test, y_pred_lgb, "LightGBM")

# XGBoost predictions
y_pred_xgb = xgb_model.predict(dtest)
evaluate_model(y_test, y_pred_xgb, "XGBoost")

LightGBM - MAE: 8.1385, RMSE: 9.8669
XGBoost - MAE: 8.1256, RMSE: 9.8560


(8.125600814819336, np.float64(9.856024171205085))

Creating two separate runs in the same experiment:
One for LightGBM
One for XGBoost

Each run logs:
Model artifact (lgb_model, xgb_model)
Metrics (MAE, RMSE)

In the MLflow UI (mlflow ui):
there -two runs side by side under the dynamic_pricing experiment.

filter/sort runs by MAE, RMSE to decide the winner.yeye