<a href="https://colab.research.google.com/github/aravind2225/NullClass-Internship/blob/main/Dynamic_Pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [None]:
data=pd.read_csv("/content/dynamic_pricing.csv")
data.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422


In [None]:
def data_preprocessing_pipeline(data):
    #Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    #Handle missing values in numeric features
    data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

    #Detect and handle outliers in numeric features using IQR
    for feature in numeric_features:
        Q1 = data[feature].quantile(0.25)
        Q3 = data[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)
        data[feature] = np.where((data[feature] < lower_bound) | (data[feature] > upper_bound),
                                 data[feature].mean(), data[feature])

    #Handle missing values in categorical features
    data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])

    return data

The above cell is strictly only for data preprocessing (Mainly Cleaning)

In [None]:
data.isnull().sum()

Unnamed: 0,0
Number_of_Riders,0
Number_of_Drivers,0
Location_Category,0
Customer_Loyalty_Status,0
Number_of_Past_Rides,0
Average_Ratings,0
Time_of_Booking,0
Vehicle_Type,0
Expected_Ride_Duration,0
Historical_Cost_of_Ride,0


In [None]:
data.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


In [None]:
print(data.shape)
data.info()

(1000, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.3+ KB


In [None]:
px.scatter(data,
           x='Expected_Ride_Duration',
           y='Historical_Cost_of_Ride',
           trendline='ols',
           trendline_color_override='red',
           title='Relationship between Expected Ride Duration and Historical Cost of Ride')


In [None]:
px.box(
    data,
    x='Vehicle_Type',
    y='Historical_Cost_of_Ride',
    title='Box Plot of Historical Cost of Ride by Vehicle Type'
)

In [None]:
px.box(
    data,
    x='Time_of_Booking',
    y='Historical_Cost_of_Ride',
    title='Box Plot of Historical Cost of Ride by Vehicle Type'
)

In [None]:
corr_matrix=data.select_dtypes(include=['int64','float64']).corr()
px.imshow(
    corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    color_continuous_scale='RdBu',
    title='Correlation Heatmap'
)

The data provided by the company states that the company uses a pricing model that only takes the expected ride duration as a factor to determine the price for a ride. Now, we will implement a dynamic pricing strategy aiming to adjust the ride costs dynamically based on the demand and supply levels observed in the data. It will capture high-demand periods and low-supply scenarios to increase prices, while low-demand periods and high-supply situations will lead to price reductions.

In [None]:
high_demand_percentile=75
low_demand_percentile=25

data['Demand_Multiplier']=np.where(
    data['Number_of_Riders']>np.percentile(data['Number_of_Riders'],high_demand_percentile),
    data['Number_of_Riders']/np.percentile(data['Number_of_Riders'],high_demand_percentile),
    data['Number_of_Riders']/np.percentile(data['Number_of_Riders'],low_demand_percentile)
)

high_supply_percentile = 75
low_supply_percentile = 25

data['Supply_Multiplier'] = np.where(
    data['Number_of_Drivers']>np.percentile(data['Number_of_Drivers'],high_supply_percentile),
    data['Number_of_Drivers']/np.percentile(data['Number_of_Drivers'],high_supply_percentile),
    data['Number_of_Drivers']/np.percentile(data['Number_of_Drivers'],low_supply_percentile)
)


print(np.percentile(data['Number_of_Riders'],high_demand_percentile))
print(np.percentile(data['Number_of_Riders'],low_demand_percentile))
data

81.0
40.0


Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Demand_Multiplier,Supply_Multiplier
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273,1.111111,1.184211
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753,1.450000,1.026316
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469,1.050000,2.818182
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232,1.098765,2.545455
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422,1.950000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
995,33,23,Urban,Gold,24,4.21,Morning,Premium,11,91.389526,0.825000,2.090909
996,84,29,Urban,Regular,92,4.55,Morning,Premium,94,424.155987,1.037037,2.636364
997,44,6,Suburban,Gold,80,4.13,Night,Premium,40,157.364830,1.100000,0.545455
998,53,27,Suburban,Regular,78,3.63,Night,Premium,58,279.095048,1.325000,2.454545


In [None]:
# Define price adjustment factors for high and low demand/supply
demand_threshold_high = 1.2  # Higher demand threshold
demand_threshold_low = 0.8  # Lower demand threshold
supply_threshold_high = 0.8  # Higher supply threshold
supply_threshold_low = 1.2  # Lower supply threshold

# Calculate adjusted_ride_cost for dynamic pricing
data['adjusted_ride_cost'] = data['Historical_Cost_of_Ride'] * (
    np.maximum(data['Demand_Multiplier'], demand_threshold_low) *
    np.maximum(data['Supply_Multiplier'], supply_threshold_high)
)
data

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Demand_Multiplier,Supply_Multiplier,adjusted_ride_cost
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273,1.111111,1.184211,374.022728
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753,1.450000,1.026316,258.753086
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469,1.050000,2.818182,975.894774
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232,1.098765,2.545455,1315.085824
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422,1.950000,2.000000,2260.757547
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,33,23,Urban,Gold,24,4.21,Morning,Premium,11,91.389526,0.825000,2.090909,157.646932
996,84,29,Urban,Regular,92,4.55,Morning,Premium,94,424.155987,1.037037,2.636364,1159.645324
997,44,6,Suburban,Gold,80,4.13,Night,Premium,40,157.364830,1.100000,0.545455,138.481051
998,53,27,Suburban,Regular,78,3.63,Night,Premium,58,279.095048,1.325000,2.454545,907.693214


In the above code, we first calculated the demand multiplier by comparing the number of riders to percentiles representing high and low demand levels. If the number of riders exceeds the percentile for high demand, the demand multiplier is set as the number of riders divided by the high-demand percentile. Otherwise, if the number of riders falls below the percentile for low demand, the demand multiplier is set as the number of riders divided by the low-demand percentile.

Next, we calculated the supply multiplier by comparing the number of drivers to percentiles representing high and low supply levels. If the number of drivers exceeds the low-supply percentile, the supply multiplier is set as the high-supply percentile divided by the number of drivers. On the other hand, if the number of drivers is below the low-supply percentile, the supply multiplier is set as the low-supply percentile divided by the number of drivers.

Finally, we calculated the adjusted ride cost for dynamic pricing. It multiplies the historical cost of the ride by the maximum of the demand multiplier and a lower threshold (demand_threshold_low), and also by the maximum of the supply multiplier and an upper threshold (supply_threshold_high). This multiplication ensures that the adjusted ride cost captures the combined effect of demand and supply multipliers, with the thresholds serving as caps or floors to control the price adjustments.

Now let’s calculate the profit percentage we got after implementing this dynamic pricing strategy:

In [None]:
data['Profit_Percentage']=(data['adjusted_ride_cost']-data['Historical_Cost_of_Ride'])/data['Historical_Cost_of_Ride']*100
Profitable_rides=data[data['Profit_Percentage']>0]
Loss_rides=data[data['Profit_Percentage']<0]

px.pie(
    values=[len(Profitable_rides),len(Loss_rides)],
    hole=0.5,
    names=['Profit','Loss']
)

In [None]:
#Now we have to train the model with data set,
#first lets convert the Vehicle type to from category to numeric
data['Vehicle_Type']=data['Vehicle_Type'].map(({"Premium": 1,
                                           "Economy": 0}))

In [None]:
#Train Test Splitting the data
x=np.array(data[["Number_of_Riders", "Number_of_Drivers", "Vehicle_Type", "Expected_Ride_Duration"]])
y=np.array(data['adjusted_ride_cost'])
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42)

In [None]:
#Building a Model Now.
from xgboost import XGBRegressor
model=XGBRegressor(
    n_estimators=300
    )
model.fit(x_train,y_train)

In [None]:
y_pred=model.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)*100

91.85549950029515

In [None]:
model.predict([[50,25,0,30]])

array([318.5739], dtype=float32)

In [None]:
def get_vehicle_type_numeric(vehicle_type):
    vehicle_type_mapping = {
        "Premium": 1,
        "Economy": 0
    }
    vehicle_type_numeric = vehicle_type_mapping.get(vehicle_type)
    return vehicle_type_numeric

# Predicting using user input values
def predict_price(number_of_riders, number_of_drivers, vehicle_type, Expected_Ride_Duration):
    vehicle_type_numeric = get_vehicle_type_numeric(vehicle_type)
    if vehicle_type_numeric is None:
        raise ValueError("Invalid vehicle type")

    input_data = np.array([[number_of_riders, number_of_drivers, vehicle_type_numeric, Expected_Ride_Duration]])
    predicted_price = model.predict(input_data)
    return predicted_price

In [None]:
user_number_of_riders = 50
user_number_of_drivers = 25
user_vehicle_type = "Economy"
Expected_Ride_Duration = 30
predicted_Price=predict_price(user_number_of_riders,user_number_of_drivers,user_vehicle_type,Expected_Ride_Duration)
print(f"Predicted Price: {predicted_Price}")

Predicted Price: [318.7244]


In [None]:
px.scatter(
    x=y_test,
    y=y_pred,
    labels={'x': 'Actual', 'y': 'Predicted'}
)


In [None]:
import pickle

# Save the model to a pickle file
filename = 'xgboost_model.pkl'
pickle.dump(model, open(filename, 'wb'))