In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline



In [8]:
df = pd.read_csv("dprice/synthetic_dynamic_pricing_data.csv")
df.head()

Unnamed: 0,timestamp,hour,day_of_week,region,weather,demand,inventory,competitor_price,actual_price
0,2024-01-01 00:00:00,0,0,TX,Cloudy,6,41,26.14,26.06
1,2024-01-01 01:00:00,1,0,NY,Rainy,3,49,26.19,26.93
2,2024-01-01 02:00:00,2,0,TX,Rainy,7,45,23.84,25.38
3,2024-01-01 03:00:00,3,0,TX,Sunny,7,43,27.19,27.97
4,2024-01-01 04:00:00,4,0,NY,Rainy,4,41,29.86,32.6


In [11]:
from sklearn.preprocessing import LabelEncoder


In [12]:
df.drop(columns=['timestamp'], inplace=True)


In [99]:
df.head()

Unnamed: 0,hour,day_of_week,region,weather,demand,inventory,competitor_price,actual_price
0,0,0,TX,Cloudy,6,41,26.14,26.06
1,1,0,NY,Rainy,3,49,26.19,26.93
2,2,0,TX,Rainy,7,45,23.84,25.38
3,3,0,TX,Sunny,7,43,27.19,27.97
4,4,0,NY,Rainy,4,41,29.86,32.6


In [13]:
region_encoder = LabelEncoder()
weather_encoder = LabelEncoder()

df['region'] = region_encoder.fit_transform(df['region'])
df['weather'] = weather_encoder.fit_transform(df['weather'])

# Create demand to inventory ratio
df['demand_inventory_ratio'] = df['demand'] / (df['inventory'] + 1e-5)  # Add small value to avoid division by 0

# Flag for peak hour (e.g., 7–9 AM and 5–7 PM)
df['is_peak_hour'] = df['hour'].apply(lambda x: 1 if x in [7, 8, 17, 18] else 0)

# Flag for weekend (Saturday=5, Sunday=6)
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x in [5, 6] else 0)

In [14]:
columns_order = [
    'hour', 'day_of_week', 'region', 'weather',
    'demand', 'inventory', 'competitor_price',
    'demand_inventory_ratio', 'is_peak_hour', 'is_weekend',
    'actual_price'  # Target
]

df = df[columns_order]

In [15]:
df

Unnamed: 0,hour,day_of_week,region,weather,demand,inventory,competitor_price,demand_inventory_ratio,is_peak_hour,is_weekend,actual_price
0,0,0,2,0,6,41,26.14,0.146341,0,0,26.06
1,1,0,1,1,3,49,26.19,0.061224,0,0,26.93
2,2,0,2,1,7,45,23.84,0.155556,0,0,25.38
3,3,0,2,2,7,43,27.19,0.162791,0,0,27.97
4,4,0,1,1,4,41,29.86,0.097561,0,0,32.60
...,...,...,...,...,...,...,...,...,...,...,...
9995,11,3,2,2,3,46,26.04,0.065217,0,0,26.56
9996,12,3,1,0,5,48,29.04,0.104167,0,0,29.40
9997,13,3,0,2,6,39,27.30,0.153846,0,0,27.32
9998,14,3,0,1,6,46,29.73,0.130435,0,0,31.44


In [16]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_features))
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :',len(continuous_features))

Num of Numerical Features : 11
Num of Categorical Features : 0
Num of Discrete Features : 6
Num of Continuous Features : 5


In [17]:
X = df.drop(columns=['actual_price'])
y = df[['actual_price']]

In [18]:
from sklearn.preprocessing import StandardScaler
cols_to_scale = ['demand', 'inventory', 'competitor_price', 'demand_inventory_ratio']
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[cols_to_scale] = scaler.fit_transform(X[cols_to_scale])


In [19]:
X = X_scaled

In [20]:
pd.DataFrame(X)

Unnamed: 0,hour,day_of_week,region,weather,demand,inventory,competitor_price,demand_inventory_ratio,is_peak_hour,is_weekend
0,0,0,2,0,-0.461180,0.202068,-0.608140,-0.477166,0,0
1,1,0,1,1,-0.759081,0.965669,-0.588551,-0.625353,0,0
2,2,0,2,1,-0.361880,0.583868,-1.509229,-0.461125,0,0
3,3,0,2,2,-0.361880,0.392968,-0.196773,-0.448528,0,0
4,4,0,1,1,-0.659780,0.202068,0.849274,-0.562092,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,11,3,2,2,-0.759081,0.679319,-0.647317,-0.618401,0,0
9996,12,3,1,0,-0.560480,0.870219,0.528016,-0.550592,0,0
9997,13,3,0,2,-0.461180,0.011168,-0.153677,-0.464101,0,0
9998,14,3,0,1,-0.461180,0.679319,0.798343,-0.504859,0,0


In [21]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,   
    random_state=42
)

In [22]:
X_train


Unnamed: 0,hour,day_of_week,region,weather,demand,inventory,competitor_price,demand_inventory_ratio,is_peak_hour,is_weekend
9254,14,0,2,1,-0.461180,0.774769,-0.851042,-0.509691,0,0
1561,1,2,1,2,-0.659780,0.488418,1.930581,-0.573673,0,0
1670,14,6,1,2,-0.858381,0.965669,0.441825,-0.660883,0,1
6087,15,1,0,0,-0.759081,0.870219,-0.294717,-0.623132,0,0
6669,21,4,0,2,-0.063979,0.202068,0.386976,-0.307315,0,0
...,...,...,...,...,...,...,...,...,...,...
5734,22,0,0,2,-0.461180,0.392968,0.171498,-0.489016,0,0
5191,7,6,1,1,1.922025,-1.897835,0.904123,2.016967,1,1
5390,14,0,1,0,-0.659780,0.870219,-0.036144,-0.586862,0,0
860,20,0,0,0,-0.361880,0.774769,-0.408333,-0.472649,0,0


In [70]:
y_train.shape, y_test.shape

((8000, 1), (2000, 1))

In [71]:
y

Unnamed: 0,actual_price
0,26.06
1,26.93
2,25.38
3,27.97
4,32.60
...,...
9995,26.56
9996,29.40
9997,27.32
9998,31.44


In [23]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [24]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [25]:
models = {
    "Xgboost Regressor":XGBRegressor()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Xgboost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.3460
- Mean Absolute Error: 0.2704
- R2 Score: 0.9903
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5572
- Mean Absolute Error: 0.4437
- R2 Score: 0.9744




In [26]:
xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}

In [27]:
randomcv_models = [
                   ("XGboost",XGBRegressor(),xgboost_params)
                   
                   ]

In [28]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for XGboost -------------------
{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1}


In [29]:
models = {
     "Xgboost Regressor":XGBRegressor(n_estimators= 100,learning_rate=0.1,
                                     max_depth=5,colsample_bytree=1)
    
}

In [30]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Xgboost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.4632
- Mean Absolute Error: 0.3700
- R2 Score: 0.9826
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5194
- Mean Absolute Error: 0.4167
- R2 Score: 0.9777




In [33]:
import joblib

# Save model and preprocessing objects
joblib.dump(model, "xgb_model.pkl")
print("Model saved as xgb_model.pkl")

joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as scaler.pkl")

joblib.dump(region_encoder, "region_encoder.pkl")
print("Region encoder saved as region_encoder.pkl")

joblib.dump(weather_encoder, "weather_encoder.pkl")
print("Weather encoder saved as weather_encoder.pkl")




Model saved as xgb_model.pkl
Scaler saved as scaler.pkl
Region encoder saved as region_encoder.pkl
Weather encoder saved as weather_encoder.pkl


In [34]:
import joblib
import pandas as pd
import numpy as np

# Load model and encoders
model = joblib.load("xgb_model.pkl")
scaler = joblib.load("scaler.pkl")
region_encoder = joblib.load("region_encoder.pkl")
weather_encoder = joblib.load("weather_encoder.pkl")

In [35]:
def preprocess_input(data: pd.DataFrame) -> pd.DataFrame:
    # Drop timestamp if exists
    if 'timestamp' in data.columns:
        data = data.drop(columns=['timestamp'])

    # Encode categorical variables
    data['region'] = region_encoder.transform(data['region'])
    data['weather'] = weather_encoder.transform(data['weather'])

    # Create derived features
    data['demand_inventory_ratio'] = data['demand'] / (data['inventory'] + 1e-5)
    data['is_peak_hour'] = data['hour'].apply(lambda x: 1 if x in [7, 8, 17, 18] else 0)
    data['is_weekend'] = data['day_of_week'].apply(lambda x: 1 if x in [5, 6] else 0)

    # Reorder columns (excluding actual_price if included)
    feature_columns = [
        'hour', 'day_of_week', 'region', 'weather',
        'demand', 'inventory', 'competitor_price',
        'demand_inventory_ratio', 'is_peak_hour', 'is_weekend'
    ]
    data = data[feature_columns]

    # Scale selected features
    scaled_data = data.copy()
    scaled_features = ['demand', 'inventory', 'competitor_price', 'demand_inventory_ratio']
    scaled_data[scaled_features] = scaler.transform(scaled_data[scaled_features])

    return scaled_data


In [36]:
def predict_price(new_data_df: pd.DataFrame) -> np.ndarray:
    processed = preprocess_input(new_data_df)
    predictions = model.predict(processed)
    return predictions


In [41]:
new_data = pd.DataFrame([{
    'timestamp': '2025-05-22 04:26:25.504453',
    'hour': 4,
    'day_of_week': 3,
    'region': 'TX',
    'weather': 'Cloudy',
    'demand': 4,
    'inventory': 46,
    'competitor_price': 24.46
}])

predicted_price = predict_price(new_data)
print("Predicted Price:", predicted_price[0])


Predicted Price: 25.115547
