In [2]:
import numpy as np
import pandas as pd
import random
import math

The booking starts 30 days ahead of jouney date. The minimum and maximum price of the seat is set as below. There are total 250 seats available to be filled. We are simulating for 50 different journeys. Similarly we decide beforehand how much slots wastage we want to limit. And number of days before the journey that we want to optimize.

In [3]:
days = 30
min_price = 4000
max_price = 14000
total_slots = 250
journey = 50
train_val_prop = 0.75
lst_days = list(range(1,days+1))
wastage_pct_min_range = 15
wastage_pct_max_range = 20
emptiness_threshold = 0.02
optimisation_day_bfr_jouney = 10

The below code block simulates the data for 50 different journeys. The number of slots filled on a day is directly proportional to the day of journey, as more number of people will book the tickets as the journey gets nearer. Also the booking is inversely proportional of the price as people tend not to book it due to high price. After calculating weights and creating slots according to it, we then append all the journeys into a single data frame. 

In [4]:
lst_df = []
for j in range(journey):
    lst_price = []
    prev_price = min_price
    for i in range(days):
        cur_price = min(prev_price + random.randint(0, 500), max_price)
        prev_price = cur_price
        lst_price.append(cur_price)
        
    slots_filled = round((1 - random.randint(wastage_pct_min_range, wastage_pct_max_range) / 100)*total_slots, 0)
    lst_slots = []
    weights = []
    
    for time_, p in enumerate(lst_price):
        weights.append(((time_ + 1)*random.uniform(1, 1.2))/math.pow(p, 5))
    
    msum = sum(weights)
    weights = [w/msum for w in weights]
    
    for w in weights:
        lst_slots.append(round(w*slots_filled))
    
    lst_df.append(pd.DataFrame({"journey_id" : [j + 1]*days, "day" : lst_days, "price" : lst_price, "slots" : lst_slots}))
    
df = pd.concat(lst_df).reset_index()

In [5]:
df.head()

Unnamed: 0,index,journey_id,day,price,slots
0,0,1,1,4448,6
1,1,1,2,4496,10
2,2,1,3,4497,17
3,3,1,4,4973,12
4,4,1,5,5080,15


In [6]:
df_train = df[df["journey_id"] <= round(journey*train_val_prop)]
df_train

Unnamed: 0,index,journey_id,day,price,slots
0,0,1,1,4448,6
1,1,1,2,4496,10
2,2,1,3,4497,17
3,3,1,4,4973,12
4,4,1,5,5080,15
...,...,...,...,...,...
1135,25,38,26,9407,4
1136,26,38,27,9763,3
1137,27,38,28,10167,2
1138,28,38,29,10663,2


In [7]:
df_test = df[df["journey_id"] > round(journey*train_val_prop)]
df_test

Unnamed: 0,index,journey_id,day,price,slots
1140,0,39,1,4479,9
1141,1,39,2,4706,12
1142,2,39,3,4721,19
1143,3,39,4,5155,17
1144,4,39,5,5490,16
...,...,...,...,...,...
1495,25,50,26,10846,2
1496,26,50,27,11113,2
1497,27,50,28,11305,2
1498,28,50,29,11536,2


Now we train the model. We prdict the slots filled on a particular day based on the day and price.

In [8]:
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(df_train[["price", "day"]], df_train["slots"])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(df_train["slots"], model.predict(df_train[["price", "day"]]))
print("train mae", mae)
rmse = np.sqrt(mean_squared_error(df_train["slots"], model.predict(df_train[["price", "day"]])))
print("train rmse", rmse)

train mae 0.36746902967754164
train rmse 0.5137653659216114


In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(df_test["slots"], model.predict(df_test[["price", "day"]]))
print("test mae", mae)
rmse = np.sqrt(mean_squared_error(df_test["slots"], model.predict(df_test[["price", "day"]])))
print("test rmse", rmse)

test mae 1.0722295390235053
test rmse 1.644486788802486


We store the predictions in a dictionary for faster lookup, to be used in the optimization function

In [12]:
mp = {}
lst = []

for p in range(min_price, max_price+1):
    for d in range(days - optimisation_day_bfr_jouney, days+1):
        lst.append([p, d])

pred = model.predict(np.array(lst))

for i in range(len(lst)):
    mp[lst[i][0], lst[i][1]] = int(pred[i])


The below m_feasible function checks whether the price and the slots predicted are feasible or not. This is being used in the optimization layer

In [13]:
def m_feasible(price_points, available_slots, emptiness_threshold):
    slots_predicted = [mp[p] for p in price_points]
    total_slots_predicted = sum(slots_predicted)
    if (total_slots_predicted <= available_slots) and (total_slots * emptiness_threshold <= available_slots - total_slots_predicted):
        return True
    return False

The m_revenue function will take in the list of prices and calculates the revenue generated in the optimization days using the slots predicted

In [14]:
def m_revenue(price_points):
    slots_predicted = [mp[p] for p in price_points]
    revenue = 0
    for i in range(len(price_points)):
        revenue = revenue + price_points[i][0] * slots_predicted[i]
    return revenue, slots_predicted


The below optimization function kicks off some specific days before the date of journey. It creates a list of price points and using the model trained before predicts the price for it. We iterate 50000 times to find the best list of feasible price points which can give the highest revenue for us

In [18]:
def optimize(df, journey_id):
    df_tmp = df[(df["journey_id"] == journey_id) & (df["day"] > (days - optimisation_day_bfr_jouney))].reset_index(drop=True)
    
    slots_filled = df[(df["journey_id"] == journey_id) & (df["day"] < days - optimisation_day_bfr_jouney)]["slots"].sum()
    
    available_slots = total_slots - slots_filled
    
    times = 50000
    ans = 0
    solution = []
    for j in range(times):
        price_points = []
        prev_price = min_price
        for i in range(optimisation_day_bfr_jouney):
            cur_price = random.randint(prev_price, int(prev_price * 1.15))
            if cur_price > max_price:
                break
            prev_price = cur_price
            price_points.append((cur_price, (days + i + 1 - optimisation_day_bfr_jouney)))
        if m_feasible(price_points, available_slots, emptiness_threshold):
            if m_revenue(price_points)[0] > ans:
                ans, slots = m_revenue(price_points)
                solution = [p[0] for p in price_points]
                
    df_tmp["proposed_price"] = solution
    df_tmp["forecasted_slots"] = slots
    
    orig_rev = (df_tmp["price"] * df_tmp["slots"]).sum()
    proposed_rev = (df_tmp["proposed_price"] * df_tmp["forecasted_slots"]).sum()
    
    rev_gain = proposed_rev - orig_rev
    rev_gain_pct = round((rev_gain / orig_rev)*100, 2)
    
    slots_gain = df_tmp["forecasted_slots"].sum() - df_tmp["slots"].sum()
    slots_gain_pct = round(slots_gain / df_tmp["slots"].sum() * 100, 2)
    
    print(f"""
        Previous Revenue: {orig_rev}
        New Revenue: {proposed_rev}
        Revenue Gain: {rev_gain}
        Revenue Gain %: {rev_gain_pct}
        Available Slots: {available_slots}
        Previously filled: {df_tmp["slots"].sum()}
        Filled after dynamic Pricing: {df_tmp["forecasted_slots"].sum()}
        Slots gain: {slots_gain}
        Slots gain %: {slots_gain_pct}
    """"")
                

In [16]:
optimize(df, 1)


        Previous Revenue: 231262
        New Revenue: 406011
        Revenue Gain: 174749
        Revenue Gain %: 75.56
        Available Slots: 68
        Previously filled: 23
        Filled after dynamic Pricing: 63
        Slots gain: 40
        Slots gain %: 173.91
    


In [17]:
optimize(df, 2)


        Previous Revenue: 330978
        New Revenue: 506558
        Revenue Gain: 175580
        Revenue Gain %: 53.05
        Available Slots: 90
        Previously filled: 40
        Filled after dynamic Pricing: 85
        Slots gain: 45
        Slots gain %: 112.5
    
