## Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load the dataset

In [None]:
df= pd.read_csv(r"C:\Users\sanoj\Downloads\dynamic_pricing (1).csv")
df.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422


In [None]:
df.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


# Ratio Based Approach

## Calculating Demand_Supply_Ratio and picking demand_supply_threshold = 2.3 arround the mean of Demand_Supply_Ratio
* ### Higher Demand = when 'Demand_Supply_Ratio' > demand_supply_threshold (2.3) else Low-demand
* ### Higher supply = when 'Demand_Supply_Ratio' < demand_supply_threshold (2.3) else Low-supply



In [5]:
df['Demand_Supply_Ratio'] = df['Number_of_Riders'] / df['Number_of_Drivers']
demand_supply_threshold = 2.3

df['Demand_class'] = np.where(df['Demand_Supply_Ratio'] > demand_supply_threshold, "Higher_demand", "Lower_demand")
df['Supply_class'] = np.where(df['Demand_Supply_Ratio'] < demand_supply_threshold, "Higher_supply", "Lower_supply")

df.iloc[:,[0,1,10,11,12]].sample(10)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Demand_Supply_Ratio,Demand_class,Supply_class
458,96,27,3.555556,Higher_demand,Lower_supply
933,96,36,2.666667,Higher_demand,Lower_supply
13,36,24,1.5,Lower_demand,Higher_supply
599,80,67,1.19403,Lower_demand,Higher_supply
124,26,15,1.733333,Lower_demand,Higher_supply
618,96,48,2.0,Lower_demand,Higher_supply
917,87,41,2.121951,Lower_demand,Higher_supply
427,84,21,4.0,Higher_demand,Lower_supply
75,47,9,5.222222,Higher_demand,Lower_supply
33,35,24,1.458333,Lower_demand,Higher_supply


## calulation Base Price and Surge_charge based on supply demand ratio and demand_supply_factor
* ### 1. Calculate base historical cost based on expected_Ride_duration
* ### 2. Calculate rider-to-driver ratio
* ### 3. Calculate demand-supply factor
* ### 4. Defining a methode to Calculate supply_demand_surge and Apply the dynamic pricing formula

In [None]:

constant_rate = 3.5  # Define the base rate per unit of duration, this is arround mean of ratio of ('Historical_Cost_of_Ride'/'Expected_Ride_Duration')
demand_hike = 0.35  # This is how much demand increase the pricing

# Calculate base historical cost based on expected_Ride_duration
df['base_cost'] = df['Expected_Ride_Duration'] * constant_rate

# Calculate rider-to-driver ratio
df['rider_driver_ratio'] = df['Number_of_Riders'] / df['Number_of_Drivers']

# Calculate demand-supply factor
df['demand_supply_factor'] = df['rider_driver_ratio'] - 1
df['demand_supply_factor'] = df['demand_supply_factor'].apply(lambda x: min(x, 6))

# defining a methode to Calculate supply_demand_surge and Apply the dynamic pricing formula
def apply_surge(df):
    SD_surge_charge=0
    if (df['Demand_class']=='Higher_demand') & (df['Supply_class']=='Lower_supply'):
        SD_surge_charge = df['base_cost'] * (demand_hike * df['demand_supply_factor'])
    return SD_surge_charge

df['S/D_surge_charge'] = df.apply(apply_surge,axis=1)
df.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Demand_Supply_Ratio,Demand_class,Supply_class,base_cost,rider_driver_ratio,demand_supply_factor,S/D_surge_charge
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273,2.0,Lower_demand,Higher_supply,315.0,2.0,1.0,0.0
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753,1.487179,Lower_demand,Higher_supply,150.5,1.487179,0.487179,0.0
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469,1.354839,Lower_demand,Higher_supply,266.0,1.354839,0.354839,0.0
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232,3.178571,Higher_demand,Lower_supply,469.0,3.178571,2.178571,357.6125
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422,3.545455,Higher_demand,Lower_supply,521.5,3.545455,2.545455,464.609091


## Conditional Surge based on Vehical_Type and Time_of_booking && Location_Category Condition


In [7]:

def cal_surge_charge(df):
    surge_charge = 0
    if df['Vehicle_Type'] == 'Premium':
        if (df['Location_Category'] in ('Urban', 'Suburban')) & (df['Time_of_Booking'] in ('Evening', 'Night')):
            surge_charge = df['base_cost'] * 0.05 + df['base_cost'] * 0.02
    else:
        if (df['Location_Category'] in ('Urban', 'Suburban')) & (df['Time_of_Booking'] in ('Evening', 'Night')):
            surge_charge = df['base_cost'] * 0.025 + df['base_cost'] * 0.01
    return surge_charge

df['Surge_charge'] = df.apply(cal_surge_charge, axis=1)


In [8]:
df.sample(10)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Demand_Supply_Ratio,Demand_class,Supply_class,base_cost,rider_driver_ratio,demand_supply_factor,S/D_surge_charge,Surge_charge
723,76,35,Suburban,Regular,8,4.89,Afternoon,Premium,138,406.589911,2.171429,Lower_demand,Higher_supply,483.0,2.171429,1.171429,0.0,0.0
701,23,5,Urban,Gold,92,3.71,Night,Economy,105,363.444776,4.6,Higher_demand,Lower_supply,367.5,4.6,3.6,463.05,12.8625
241,88,55,Rural,Regular,56,4.0,Night,Premium,26,159.606919,1.6,Lower_demand,Higher_supply,91.0,1.6,0.6,0.0,0.0
633,91,53,Suburban,Gold,67,4.56,Night,Economy,50,158.106723,1.716981,Lower_demand,Higher_supply,175.0,1.716981,0.716981,0.0,6.125
109,40,15,Urban,Regular,90,3.62,Night,Premium,42,211.527729,2.666667,Higher_demand,Lower_supply,147.0,2.666667,1.666667,85.75,10.29
415,94,27,Suburban,Silver,100,3.6,Night,Premium,172,547.981462,3.481481,Higher_demand,Lower_supply,602.0,3.481481,2.481481,522.848148,42.14
429,25,11,Urban,Silver,92,3.91,Night,Economy,115,310.154616,2.272727,Lower_demand,Higher_supply,402.5,2.272727,1.272727,0.0,14.0875
992,63,31,Suburban,Silver,2,3.81,Evening,Premium,160,632.560142,2.032258,Lower_demand,Higher_supply,560.0,2.032258,1.032258,0.0,39.2
387,71,7,Urban,Gold,31,4.83,Morning,Premium,60,292.74683,10.142857,Higher_demand,Lower_supply,210.0,10.142857,6.0,441.0,0.0
214,49,30,Rural,Silver,86,3.62,Afternoon,Premium,69,328.635369,1.633333,Lower_demand,Higher_supply,241.5,1.633333,0.633333,0.0,0.0


## Calculating Total cost

In [9]:
df['New_cost']= df['base_cost'] + df['S/D_surge_charge'] + df['Surge_charge']
df.iloc[:,[0,1,9,10,11,12,13,16,17,18]].sample(10)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Historical_Cost_of_Ride,Demand_Supply_Ratio,Demand_class,Supply_class,base_cost,S/D_surge_charge,Surge_charge,New_cost
917,87,41,158.89198,2.121951,Lower_demand,Higher_supply,129.5,0.0,0.0,129.5
472,85,65,770.070891,1.307692,Lower_demand,Higher_supply,577.5,0.0,0.0,577.5
462,44,11,104.75201,4.0,Higher_demand,Lower_supply,63.0,66.15,0.0,129.15
908,27,12,279.2687,2.25,Lower_demand,Higher_supply,192.5,0.0,0.0,192.5
256,92,37,434.571051,2.486486,Higher_demand,Lower_supply,360.5,187.557432,0.0,548.057432
916,61,39,230.143145,1.564103,Lower_demand,Higher_supply,161.0,0.0,0.0,161.0
339,49,14,633.496622,3.5,Higher_demand,Lower_supply,458.5,401.1875,0.0,859.6875
729,71,41,614.636713,1.731707,Lower_demand,Higher_supply,490.0,0.0,0.0,490.0
583,97,25,332.229376,3.88,Higher_demand,Lower_supply,353.5,356.328,24.745,734.573
638,65,51,588.13677,1.27451,Lower_demand,Higher_supply,504.0,0.0,35.28,539.28


## Revenue Before and after

In [10]:
print("Revenue before applying Dynamic_pricing -->",round(sum(df['Historical_Cost_of_Ride']),2))
print("Revenue after applying Dynamic_pricing-->",round(sum(df['New_cost']),2))

Revenue before applying Dynamic_pricing --> 372502.62
Revenue after applying Dynamic_pricing--> 552298.01


In [11]:
diff=sum(df['New_cost'])-sum(df['Historical_Cost_of_Ride'])
print("Diffrenece of Revenue--> ", diff)
print("Revenue Percentage --> ", diff/sum(df['Historical_Cost_of_Ride'])*100)

Diffrenece of Revenue-->  179795.39090132003
Revenue Percentage -->  48.26687911203326


## Conclusion
* ### Diffrenece of Revenue-->  179795.39
* ### Revenue Percentage -->  48.26

In [12]:
filter=df['Demand_Supply_Ratio']>10
df[filter].head(10)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Demand_Supply_Ratio,Demand_class,Supply_class,base_cost,rider_driver_ratio,demand_supply_factor,S/D_surge_charge,Surge_charge,New_cost
49,67,6,Rural,Gold,15,3.53,Night,Economy,123,420.623911,11.166667,Higher_demand,Lower_supply,430.5,11.166667,6.0,904.05,0.0,1334.55
88,66,6,Rural,Regular,23,4.2,Evening,Economy,45,173.157754,11.0,Higher_demand,Lower_supply,157.5,11.0,6.0,330.75,0.0,488.25
94,95,7,Rural,Gold,40,4.68,Evening,Economy,95,283.466443,13.571429,Higher_demand,Lower_supply,332.5,13.571429,6.0,698.25,0.0,1030.75
153,51,5,Urban,Gold,0,4.59,Afternoon,Premium,92,320.857622,10.2,Higher_demand,Lower_supply,322.0,10.2,6.0,676.2,0.0,998.2
170,76,7,Urban,Gold,76,4.35,Morning,Economy,72,245.893571,10.857143,Higher_demand,Lower_supply,252.0,10.857143,6.0,529.2,0.0,781.2
197,75,7,Suburban,Gold,100,4.13,Morning,Economy,134,453.376949,10.714286,Higher_demand,Lower_supply,469.0,10.714286,6.0,984.9,0.0,1453.9
216,88,5,Urban,Silver,89,3.59,Night,Economy,27,70.203803,17.6,Higher_demand,Lower_supply,94.5,17.6,6.0,198.45,3.3075,296.2575
218,65,5,Rural,Silver,24,3.54,Night,Economy,119,301.403927,13.0,Higher_demand,Lower_supply,416.5,13.0,6.0,874.65,0.0,1291.15
232,87,5,Urban,Silver,59,4.32,Night,Economy,42,151.359301,17.4,Higher_demand,Lower_supply,147.0,17.4,6.0,308.7,5.145,460.845
250,97,7,Urban,Silver,22,3.74,Afternoon,Premium,147,441.746701,13.857143,Higher_demand,Lower_supply,514.5,13.857143,6.0,1080.45,0.0,1594.95


# Data Spliting and Model training

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_percentage_error,mean_squared_error

In [14]:
x=df[['Demand_Supply_Ratio','Location_Category','Time_of_Booking','Vehicle_Type','Expected_Ride_Duration']] #Train column
y=df['New_cost'] #Target column

In [15]:
X_train_val, X_test, y_train_val, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

In [None]:
X_train

Unnamed: 0,Demand_Supply_Ratio,Location_Category,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration
876,1.126437,Suburban,Afternoon,Economy,13
326,1.700000,Urban,Morning,Premium,158
381,9.142857,Rural,Evening,Economy,64
853,1.282051,Urban,Morning,Premium,135
311,2.266667,Urban,Night,Premium,134
...,...,...,...,...,...
118,4.043478,Suburban,Evening,Premium,125
334,3.700000,Urban,Afternoon,Economy,44
409,3.142857,Urban,Morning,Premium,98
225,7.571429,Urban,Night,Premium,10


In [17]:
y_train

876     45.500000
326    553.000000
381    694.400000
853    472.500000
311    501.830000
          ...    
118    934.157609
334    299.530000
409    600.250000
225    110.950000
482    472.546974
Name: New_cost, Length: 600, dtype: float64

In [18]:
ohe = OneHotEncoder()
ohe.fit(x[['Location_Category','Time_of_Booking','Vehicle_Type']])

In [19]:
cat=ohe.categories_

In [20]:
column_trans = make_column_transformer((OneHotEncoder(categories=cat),
                                        ['Location_Category','Time_of_Booking','Vehicle_Type']),
                                        remainder='passthrough')                                        

In [21]:
rf=RandomForestRegressor()                                    
pipe=make_pipeline(column_trans,rf)
pipe.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [22]:
y_pred_rf=pipe.predict(X_test)
print(y_pred_rf)

[ 915.4596156   657.08323333 1033.74770036  317.28935     166.70119776
  634.99466688  413.4604587   155.839425    810.96457616  230.39685252
  334.8758      216.23        135.1618678   976.60725276  394.3723
  375.64905     612.067925    693.45123837  154.28985066  455.96070383
  672.51753769  124.73459356  330.72421726  376.98675     356.74703961
  612.42125     159.701325   1352.75130189  196.14336875 1156.25918333
  336.08855      88.5206      411.6616      148.009225    613.17265633
  338.551325    132.68338125  357.43554904  618.24598476  339.920525
  915.3576401   839.92581349 1401.7639684   972.45887696 1829.04145278
  398.126225    530.75895623 1069.54291682  306.68715     638.36966459
  535.50504982  127.57955     515.45445     207.7355      161.73359028
  549.97825973  365.369025   1445.66540386  884.67301088  516.2213
  706.94834857  592.71386111  482.649125    403.02605     897.0479656
  535.51818942  402.91850278  398.58471532  529.73398318  217.69895
  720.56004364  161.

### Model Evaluation & Check prediction 

In [23]:
# 1. Model evaluation on validation Data
y_val_pred_rf = pipe.predict(X_val)
mape = mean_absolute_percentage_error(y_val,y_val_pred_rf)
r2 = r2_score(y_val,y_val_pred_rf)
print("Error of RandomForest Regression Model = %.2f"%(mape*100),'%')
print("Accuracy of RandomForest Regression Model = %.2f"%((1 - mape)*100),'%')
print("R2 score of RandomForest Regression = %.2f"%(r2))

Error of RandomForest Regression Model = 3.93 %
Accuracy of RandomForest Regression Model = 96.07 %
R2 score of RandomForest Regression = 0.99


In [29]:
# 2. Model evaluation on Test Data
y_test_pred_rf = pipe.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred_rf)
test_r2 = r2_score(y_test, y_test_pred_rf)
print("Mean square error :", test_mse)
print("R2 score of RandomForestr Regression on test data:", test_r2)

Mean square error : 25804.010772131278
R2 score of RandomForestr Regression on test data: 0.8308779931645754


In [25]:
#using Linearregression
lr=LinearRegression()                                    
pipe=make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)

y_pred_lr=pipe.predict(X_test)
print(y_pred_lr)

[ 725.49985318  508.1060693   988.29705641  393.26549241  318.97656193
  531.98143912  423.53447011  119.30788357  693.75509236  169.27977397
  398.63716732  210.36750737  193.75497913  889.54894577  497.90395848
  500.94705098  880.32441075  625.99665575  116.33528796  451.19424188
  614.82082844  291.05957571  297.44714832  465.99765218  291.73430176
  914.86394215   99.02837815 1400.26007905  203.81191271 1115.29983168
  377.38545155   24.37875791  543.65000518  111.74637726  528.84416851
  374.28285041  121.44603573  297.59227745  515.23845114  462.21432185
  876.56109658  815.45941023 1008.81681601  949.42843104 1314.13789469
  516.99502246  422.34182859  973.97749499  335.00744433  556.77384055
  450.69908446   84.22266332  662.45194721  249.6198604   134.94326023
  451.84849959  422.47184824 1032.20481351  875.95829501  721.90047544
  646.89450429  550.647363    636.85153405  582.24481034  739.26236395
  489.74335188  513.95178678  373.04786679  488.90856772  236.05905468
  635.

### Model Evaluation & Check prediction 

In [26]:
# 1. Model evaluation on validation Data
y_val_pred_lr = pipe.predict(X_val)
mape = mean_absolute_percentage_error(y_val,y_val_pred_lr)
r2 = r2_score(y_val,y_val_pred_lr)
print("Error of Linear Regression Model = %.2f"%(mape*100),'%')
print("Accuracy of Linear Regression Model = %.2f"%((1 - mape)*100),'%')
print("R2 score of Linear Regression = %.2f"%(r2))

Error of Linear Regression Model = 29.48 %
Accuracy of Linear Regression Model = 70.52 %
R2 score of Linear Regression = 0.81


In [27]:
# 2. Model evaluation on Test Data
y_test_pred_lr = pipe.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred_lr)
test_r2 = r2_score(y_test, y_test_pred_lr)
print("Mean square error :", test_mse)
print("R2 score of Linear Regression on test data:", test_r2)

Mean square error : 25804.010772131278
R2 score of Linear Regression on test data: 83.08779931645755


In [28]:
results_df = pd.DataFrame(X_test, columns=x.columns)
results_df['Actual'] = y_test
results_df['Predicted_RandomForest'] = y_pred_rf
results_df['Predicted_LinearRegression'] = y_pred_lr
results_df['Error_RF'] = y_pred_rf-y_test
results_df['Error_LR'] = y_pred_lr-y_test
results_df.to_csv("test_results.csv", index=False)