In [132]:
import os
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

In [133]:
# Number of synthetic rides to generate.
n = 200_002

# Seed both random generators used in this notebook so every run yields the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [134]:
# Baseline congestion label per area, grouped by label.
# Insertion order matters: area_names is later paired positionally with sampling weights.
areas = {
    **dict.fromkeys(["Shyambazar", "Esplanade", "Park Street", "Sealdah"], "high"),
    **dict.fromkeys(["Dum Dum", "Bagbazar", "Tollygunge", "Jadavpur", "Garia"], "medium"),
    **dict.fromkeys(["Salt Lake", "New Town", "Rajarhat"], "low"),
}
area_names = list(areas)


In [135]:
start_date = datetime(2024, 1, 1)

# Length of start_date's calendar year, computed instead of hard-coded:
# 2024 is a leap year (366 days), so a fixed 0..364 offset never reached 31 December.
days_in_year = (datetime(start_date.year + 1, 1, 1) - start_date).days

timestamps = []

for i in range(n):
    # 80% of requests land in hours 7-23, the remaining 20% in hours 0-6.
    if random.random() < 0.80:
        hour = random.randint(7, 23)
    else:
        hour = random.randint(0, 6)

    # randint is inclusive on both ends, hence the "- 1".
    day_offset = random.randint(0, days_in_year - 1)
    minute = random.randint(0, 59)

    timestamps.append(start_date + timedelta(days=day_offset, hours=hour, minutes=minute))


In [136]:
# Sampling weight of each area, listed in the same order as area_names.
probs = [
    0.12, 0.11, 0.10, 0.09,        # Shyambazar, Esplanade, Park Street, Sealdah
    0.08, 0.08, 0.10, 0.08, 0.07,  # Dum Dum, Bagbazar, Tollygunge, Jadavpur, Garia
    0.07, 0.05, 0.05,              # Salt Lake, New Town, Rajarhat
]

# Pickup and drop are drawn independently (pickup first), so a ride may start
# and end in the same area.
ride_ids = np.arange(1, n + 1)
pickups = np.random.choice(area_names, size=n, p=probs)
drops = np.random.choice(area_names, size=n, p=probs)

df = pd.DataFrame({
    "ride_id": ride_ids,
    "pickup_location": pickups,
    "drop_location": drops,
    "request_timestamp": timestamps,
})

# Calendar features derived from the request time.
request_dt = df["request_timestamp"].dt
df["ride_hour"] = request_dt.hour
df["day_of_week"] = request_dt.day_name()
df["month"] = request_dt.month

In [137]:
# Baseline congestion label of each ride's pickup area.
pickup_baseline = df["pickup_location"].map(areas)
df["normal_traffic"] = pickup_baseline

In [138]:

def real_traffic(row):
    """Return the effective traffic level ("low", "medium" or "high") for one ride.

    Reads ``row["normal_traffic"]`` (the pickup area's baseline label) and
    ``row["ride_hour"]``.

    * Hours 0-7 are always "low", whatever the baseline.
    * During the windows 8-11 and 17-21, a "low" baseline becomes "medium"
      and a "medium" baseline becomes "high".
    * Outside those windows, "low" and "medium" baselines are returned unchanged.
    * Any other baseline yields "high".
    """
    baseline = row["normal_traffic"]
    hr = row["ride_hour"]

    # Early-hours override takes precedence over everything else.
    if 0 <= hr <= 7:
        return "low"

    busy_window = (8 <= hr <= 11) or (17 <= hr <= 21)

    if baseline == "low":
        return "medium" if busy_window else "low"
    if baseline == "medium":
        return "high" if busy_window else "medium"
    return "high"

In [139]:
# Row-wise: combine each pickup area's baseline label with the request hour.
df["traffic_level"] = df.apply(real_traffic, axis="columns")
df.head(5)

Unnamed: 0,ride_id,pickup_location,drop_location,request_timestamp,ride_hour,day_of_week,month,normal_traffic,traffic_level
0,1,Sealdah,Shyambazar,2024-05-20 07:15:00,7,Monday,5,high,low
1,2,Rajarhat,Dum Dum,2024-12-12 10:47:00,10,Thursday,12,low,medium
2,3,Jadavpur,Esplanade,2024-10-29 00:27:00,0,Tuesday,10,medium,low
3,4,Tollygunge,Park Street,2024-04-21 09:14:00,9,Sunday,4,medium,high
4,5,Esplanade,Dum Dum,2024-10-14 07:12:00,7,Monday,10,high,low


In [140]:
# Remove the helper column now that traffic_level has been derived from it.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns="normal_traffic", errors="ignore")
df.head()

Unnamed: 0,ride_id,pickup_location,drop_location,request_timestamp,ride_hour,day_of_week,month,traffic_level
0,1,Sealdah,Shyambazar,2024-05-20 07:15:00,7,Monday,5,low
1,2,Rajarhat,Dum Dum,2024-12-12 10:47:00,10,Thursday,12,medium
2,3,Jadavpur,Esplanade,2024-10-29 00:27:00,0,Tuesday,10,low
3,4,Tollygunge,Park Street,2024-04-21 09:14:00,9,Sunday,4,high
4,5,Esplanade,Dum Dum,2024-10-14 07:12:00,7,Monday,10,low


In [141]:
def sample_weather(month):
    """Draw one weather label ("clear", "rain" or "heavy_rain") for a month number.

    Months 6-9 use the rainier weights 0.6 / 0.3 / 0.1; every other month uses
    0.8 / 0.18 / 0.02. Each call consumes one draw from NumPy's global generator.
    """
    conditions = ["clear", "rain", "heavy_rain"]
    wet_season = month in (6, 7, 8, 9)
    weights = [0.6, 0.3, 0.1] if wet_season else [0.8, 0.18, 0.02]
    return np.random.choice(conditions, p=weights)

# One independent draw per ride, keyed on the ride's month.
weather_by_ride = df["month"].apply(sample_weather)
df["weather"] = weather_by_ride
df.head(5)

Unnamed: 0,ride_id,pickup_location,drop_location,request_timestamp,ride_hour,day_of_week,month,traffic_level,weather
0,1,Sealdah,Shyambazar,2024-05-20 07:15:00,7,Monday,5,low,clear
1,2,Rajarhat,Dum Dum,2024-12-12 10:47:00,10,Thursday,12,medium,clear
2,3,Jadavpur,Esplanade,2024-10-29 00:27:00,0,Tuesday,10,low,clear
3,4,Tollygunge,Park Street,2024-04-21 09:14:00,9,Sunday,4,high,clear
4,5,Esplanade,Dum Dum,2024-10-14 07:12:00,7,Monday,10,low,clear


In [142]:
# Vehicle category -> probability of being assigned to a ride.
vehicle_mix = {
    "auto": 0.30,
    "yellow_taxi": 0.25,
    "sedan": 0.20,
    "suv": 0.10,
    "bike": 0.15,
}
vehicle_types = list(vehicle_mix)
vehicle_probs = list(vehicle_mix.values())

df["vehicle_type"] = np.random.choice(vehicle_types, size=n, p=vehicle_probs)


In [143]:
# Pairwise distance lookup between areas: distance_matrix[pickup][drop].
# Values feed the distance_km column, so they are in kilometres.
# Every area key of `areas` appears as both an outer and an inner key; the table is
# symmetric (A->B equals B->A) and the diagonal (same pickup and drop) is 0.
distance_matrix = {
    "Shyambazar": {
        "Shyambazar": 0, "Esplanade": 6, "Park Street": 7, "Sealdah": 5, 
        "Dum Dum": 5, "Bagbazar": 2, "Tollygunge": 14, "Jadavpur": 16,
        "Garia": 18, "Salt Lake": 9, "New Town": 14, "Rajarhat": 15
    },

    "Esplanade": {
        "Shyambazar": 6, "Esplanade": 0, "Park Street": 2, "Sealdah": 3, 
        "Dum Dum": 10, "Bagbazar": 5, "Tollygunge": 10, "Jadavpur": 11,
        "Garia": 13, "Salt Lake": 8, "New Town": 13, "Rajarhat": 14
    },

    "Park Street": {
        "Shyambazar": 7, "Esplanade": 2, "Park Street": 0, "Sealdah": 4, 
        "Dum Dum": 11, "Bagbazar": 6, "Tollygunge": 9, "Jadavpur": 10,
        "Garia": 12, "Salt Lake": 7, "New Town": 12, "Rajarhat": 13
    },

    "Sealdah": {
        "Shyambazar": 5, "Esplanade": 3, "Park Street": 4, "Sealdah": 0,
        "Dum Dum": 9, "Bagbazar": 4, "Tollygunge": 11, "Jadavpur": 12,
        "Garia": 14, "Salt Lake": 6, "New Town": 11, "Rajarhat": 12
    },

    "Dum Dum": {
        "Shyambazar": 5, "Esplanade": 10, "Park Street": 11, "Sealdah": 9,
        "Dum Dum": 0, "Bagbazar": 6, "Tollygunge": 15, "Jadavpur": 16,
        "Garia": 18, "Salt Lake": 9, "New Town": 10, "Rajarhat": 11
    },

    "Bagbazar": {
        "Shyambazar": 2, "Esplanade": 5, "Park Street": 6, "Sealdah": 4,
        "Dum Dum": 6, "Bagbazar": 0, "Tollygunge": 13, "Jadavpur": 15,
        "Garia": 17, "Salt Lake": 8, "New Town": 13, "Rajarhat": 14
    },

    "Tollygunge": {
        "Shyambazar": 14, "Esplanade": 10, "Park Street": 9, "Sealdah": 11,
        "Dum Dum": 15, "Bagbazar": 13, "Tollygunge": 0, "Jadavpur": 3,
        "Garia": 5, "Salt Lake": 14, "New Town": 18, "Rajarhat": 19
    },

    "Jadavpur": {
        "Shyambazar": 16, "Esplanade": 11, "Park Street": 10, "Sealdah": 12,
        "Dum Dum": 16, "Bagbazar": 15, "Tollygunge": 3, "Jadavpur": 0,
        "Garia": 4, "Salt Lake": 13, "New Town": 17, "Rajarhat": 18
    },

    "Garia": {
        "Shyambazar": 18, "Esplanade": 13, "Park Street": 12, "Sealdah": 14,
        "Dum Dum": 18, "Bagbazar": 17, "Tollygunge": 5, "Jadavpur": 4,
        "Garia": 0, "Salt Lake": 16, "New Town": 20, "Rajarhat": 21
    },

    "Salt Lake": {
        "Shyambazar": 9, "Esplanade": 8, "Park Street": 7, "Sealdah": 6,
        "Dum Dum": 9, "Bagbazar": 8, "Tollygunge": 14, "Jadavpur": 13,
        "Garia": 16, "Salt Lake": 0, "New Town": 6, "Rajarhat": 7
    },

    "New Town": {
        "Shyambazar": 14, "Esplanade": 13, "Park Street": 12, "Sealdah": 11,
        "Dum Dum": 10, "Bagbazar": 13, "Tollygunge": 18, "Jadavpur": 17,
        "Garia": 20, "Salt Lake": 6, "New Town": 0, "Rajarhat": 4
    },

    "Rajarhat": {
        "Shyambazar": 15, "Esplanade": 14, "Park Street": 13, "Sealdah": 12,
        "Dum Dum": 11, "Bagbazar": 14, "Tollygunge": 19, "Jadavpur": 18,
        "Garia": 21, "Salt Lake": 7, "New Town": 4, "Rajarhat": 0
    }
}


In [144]:
def _trip_distance(ride):
    """Look up the pickup -> drop distance for one ride row in distance_matrix."""
    return distance_matrix[ride["pickup_location"]][ride["drop_location"]]


df["distance_km"] = df.apply(_trip_distance, axis="columns")

In [145]:
# Preview the first rows, now including vehicle_type and distance_km.
df.head(5)

Unnamed: 0,ride_id,pickup_location,drop_location,request_timestamp,ride_hour,day_of_week,month,traffic_level,weather,vehicle_type,distance_km
0,1,Sealdah,Shyambazar,2024-05-20 07:15:00,7,Monday,5,low,clear,auto,5
1,2,Rajarhat,Dum Dum,2024-12-12 10:47:00,10,Thursday,12,medium,clear,yellow_taxi,11
2,3,Jadavpur,Esplanade,2024-10-29 00:27:00,0,Tuesday,10,low,clear,bike,11
3,4,Tollygunge,Park Street,2024-04-21 09:14:00,9,Sunday,4,high,clear,sedan,9
4,5,Esplanade,Dum Dum,2024-10-14 07:12:00,7,Monday,10,low,clear,sedan,10


In [146]:
# Base speed per vehicle type. compute_duration divides kilometres by this value
# to get hours, so the unit is km/h.
vehicle_speed = dict(auto=30, yellow_taxi=40, sedan=45, suv=45, bike=40)

# Multiplier applied to the base speed for each traffic level.
traffic_speed_factor = dict(low=1.0, medium=0.8, high=0.55)

# Multiplier applied to the base speed for each weather condition.
weather_speed_factor = dict(clear=1.0, rain=0.85, heavy_rain=0.7)

In [147]:
def compute_duration(x):
    """Estimate the trip duration in minutes for one ride row.

    The vehicle's base speed is scaled by the traffic and weather factors,
    the distance is converted to minutes at that speed, and uniform noise in
    [-3, 5) minutes is added. The result is floored at 3 minutes and rounded
    to one decimal place.

    Reads ``vehicle_type``, ``traffic_level``, ``weather`` and ``distance_km``
    from ``x`` and consumes one draw from NumPy's global generator.
    """
    effective_speed = (
        vehicle_speed[x["vehicle_type"]]
        * traffic_speed_factor[x["traffic_level"]]
        * weather_speed_factor[x["weather"]]
    )
    minutes = x["distance_km"] / effective_speed * 60
    minutes += np.random.uniform(-3, 5)
    # A zero-distance ride yields only noise here, so the floor sets a minimum.
    return round(max(minutes, 3), 1)

In [148]:
# Row-wise because each ride draws its own noise term inside compute_duration.
df["duration_min"] = df.apply(compute_duration, axis="columns")

In [149]:
# Preview the first rows, now including duration_min.
df.head(5)

Unnamed: 0,ride_id,pickup_location,drop_location,request_timestamp,ride_hour,day_of_week,month,traffic_level,weather,vehicle_type,distance_km,duration_min
0,1,Sealdah,Shyambazar,2024-05-20 07:15:00,7,Monday,5,low,clear,auto,5,7.5
1,2,Rajarhat,Dum Dum,2024-12-12 10:47:00,10,Thursday,12,medium,clear,yellow_taxi,11,24.1
2,3,Jadavpur,Esplanade,2024-10-29 00:27:00,0,Tuesday,10,low,clear,bike,11,14.3
3,4,Tollygunge,Park Street,2024-04-21 09:14:00,9,Sunday,4,high,clear,sedan,9,22.8
4,5,Esplanade,Dum Dum,2024-10-14 07:12:00,7,Monday,10,low,clear,sedan,10,16.5


In [150]:
def compute_surge(x):
    """Return the surge multiplier for one ride row, rounded to two decimals.

    Reads ``ride_hour``, ``day_of_week`` and ``traffic_level`` from ``x``.

    * Hours 8-11 and 17-21 draw a uniform multiplier whose range depends on
      traffic: high 1.3-1.5, medium 1.1-1.3, anything else 1.0-1.2.
    * Saturday/Sunday rides in hours 18-23 draw a second uniform value in
      1.2-1.5; the larger of the two is kept.
    * Otherwise the multiplier stays at 1.0.

    The result is clamped to [1.0, 2.2]. Each branch taken consumes one draw
    from NumPy's global generator.
    """
    hour = x["ride_hour"]
    weekday = x["day_of_week"]
    congestion = x["traffic_level"]

    multiplier = 1.0

    rush_hour = (8 <= hour <= 11) or (17 <= hour <= 21)
    if rush_hour:
        if congestion == "high":
            low, high = 1.3, 1.5
        elif congestion == "medium":
            low, high = 1.1, 1.3
        else:
            low, high = 1.0, 1.2
        multiplier = np.random.uniform(low, high)

    weekend_evening = weekday in ("Saturday", "Sunday") and 18 <= hour <= 23
    if weekend_evening:
        multiplier = max(multiplier, np.random.uniform(1.2, 1.5))

    clamped = min(max(multiplier, 1.0), 2.2)
    return round(clamped, 2)

# Row-wise because compute_surge draws random values per ride.
df["surge_multiplier"] = df.apply(compute_surge, axis="columns")


In [151]:
def simple_driver_rating():
    """Draw one driver rating, rounded to one decimal place.

    A first uniform draw picks the band: 70% of the time 3.6-4.3,
    20% of the time 4.4-5.0, and 10% of the time 2.5-3.5. A second uniform
    draw picks the value inside that band. Both draws use NumPy's global
    generator.
    """
    roll = np.random.rand()

    if roll < 0.70:
        low, high = 3.6, 4.3
    elif roll < 0.90:
        low, high = 4.4, 5.0
    else:
        low, high = 2.5, 3.5

    return round(np.random.uniform(low, high), 1)

# One independent rating per ride; ratings do not depend on any other column.
df["driver_rating"] = [simple_driver_rating() for _ in df.index]

In [152]:
# Flat amount added to every fare, per vehicle type.
min_base_fare = dict(auto=10, yellow_taxi=30, sedan=50, suv=70, bike=15)

# Amount charged per kilometre, per vehicle type.
rate_per_km = dict(auto=12, yellow_taxi=15, sedan=20, suv=26, bike=8)

# Flat surcharges added for the ride's traffic level and weather condition.
traffic_extra = dict(low=0, medium=5, high=12)
weather_extra = dict(clear=0, rain=5, heavy_rain=12)


def compute_fare(x):
    """Compute the fare for one ride row, rounded to two decimals.

    The subtotal is the vehicle's base fare plus distance times its per-km
    rate, plus flat surcharges for traffic and weather, plus 15 when the ride
    hour is 22 or later or 5 or earlier. The subtotal is multiplied by
    ``surge_multiplier``, uniform noise in [-5, 15) is added, and the result
    is floored at 15.

    Reads ``vehicle_type``, ``distance_km``, ``traffic_level``, ``weather``,
    ``ride_hour`` and ``surge_multiplier`` from ``x`` and consumes one draw
    from NumPy's global generator.
    """
    vehicle = x["vehicle_type"]

    flat_fee = min_base_fare[vehicle]
    per_km_charge = x["distance_km"] * rate_per_km[vehicle]
    congestion_fee = traffic_extra[x["traffic_level"]]
    rain_fee = weather_extra[x["weather"]]

    if x["ride_hour"] >= 22 or x["ride_hour"] <= 5:
        late_fee = 15
    else:
        late_fee = 0

    subtotal = flat_fee + per_km_charge + congestion_fee + rain_fee + late_fee
    total = subtotal * x["surge_multiplier"]
    total = total + np.random.uniform(-5, 15)

    return round(max(total, 15), 2)


# Row-wise because compute_fare adds a random noise term per ride.
df["fare"] = df.apply(compute_fare, axis="columns")


In [153]:
# Preview the finished table, including surge_multiplier, driver_rating and fare.
df.head(5)

Unnamed: 0,ride_id,pickup_location,drop_location,request_timestamp,ride_hour,day_of_week,month,traffic_level,weather,vehicle_type,distance_km,duration_min,surge_multiplier,driver_rating,fare
0,1,Sealdah,Shyambazar,2024-05-20 07:15:00,7,Monday,5,low,clear,auto,5,7.5,1.0,4.2,69.09
1,2,Rajarhat,Dum Dum,2024-12-12 10:47:00,10,Thursday,12,medium,clear,yellow_taxi,11,24.1,1.26,3.8,264.95
2,3,Jadavpur,Esplanade,2024-10-29 00:27:00,0,Tuesday,10,low,clear,bike,11,14.3,1.0,4.4,121.85
3,4,Tollygunge,Park Street,2024-04-21 09:14:00,9,Sunday,4,high,clear,sedan,9,22.8,1.37,4.1,331.66
4,5,Esplanade,Dum Dum,2024-10-14 07:12:00,7,Monday,10,low,clear,sedan,10,16.5,1.0,4.8,252.49


In [154]:
# Where the generated dataset is written. The default is the original location;
# set the RIDE_DATA_CSV environment variable to write elsewhere without editing
# the notebook (the default path only exists on the original author's machine).
output_csv = Path(
    os.environ.get(
        "RIDE_DATA_CSV",
        r"C:\Users\nasim\Desktop\Namma Yatri\data\ride_data.csv",
    )
)
# to_csv does not create missing directories, so make sure the target folder exists.
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)