In [2]:
import pandas as pd 
import numpy as np 
from datetime import datetime , timedelta
import random

In [3]:
# Seed both random sources used by the generators in this notebook
# (Python's `random` module and NumPy's global RNG) so that a fresh
# top-to-bottom run reproduces the same dataset.
random.seed(42)
np.random.seed(42)

In [4]:
# Localities used both as restaurant locations and as delivery destinations.
# Order matters for reproducibility: random.choice() indexes into this list.
Mumbai_localities = [
    'Kalyan East',
    'Kalyan West',
    'Colaba',
    'Borivali East',
    'Borivali West',
    'Dadar East',
    'Dadar West',
    'CSMT',
    'Andheri East',
    'Andheri West',
    'Juhu',
    'Goregaon East',
    'Goregaon West',
    'Bhayandar East',
    'Bhayandar West',
    'Santacruz',
    'Bandra East',
    'Bandra West',
    'kandivali East',
    'kandivali West',
    'Vile Parle East',
    'Vile Parle West',
    'Worli',
    'Lower Parel',
    'Matunga',
    'Ghatkopar East',
    'Ghatkopar West',
    'Vikhroli',
    'Khar West',
    'Versova',
    'Jogeshwari',
    'Kurla East',
    'Kurla West',
    'Malad East',
    'Malad West',
    'Mulund East',
    'Mulund West',
    'Powai',
    'Marine lines',
]

In [5]:
# Cuisine labels assigned to restaurants.
# One entry per line with trailing commas: a missing comma between two string
# literals is silently concatenated by Python (previously this produced the
# single bogus entry 'Street foodSea food').
cuisine_type = [
    'North indian',
    'South indian',
    'Asian',
    'Japanese',
    'Chinese',
    'Thai',
    'Italian',
    'Continental',
    'Desserts',
    'Beverages',
    'Fast food',
    'Street food',
    'Sea food',
    'Tibetan',
    'Korean',
    'American',
    'Lebanese',
    'Mughlai',
]

In [6]:
# Weather labels. The order is significant: generate_orders() supplies
# positional weights that line up with these entries.
Weather_conditions = [
    'Cloudy',
    'Rainy',
    'Clear',
    'Light rains',
    'Heavy rains',
]

In [7]:
# Vehicle kinds a delivery partner can be assigned.
vehicle_type = [
    'Bike',
    'Scooty',
    'Bicycle',
]

In [8]:
# Restaurant table generator
def generate_restaurants(n_restaurants=150):
    """Build a table of synthetic restaurants.

    Each row gets a sequential id (``REST_0001``, ...), a placeholder name,
    a locality drawn from ``Mumbai_localities``, a cuisine drawn from
    ``cuisine_type``, an average prep time in ``[15, 30)`` minutes, a rating
    in ``[3.0, 5.0]`` rounded to one decimal, and a random premium flag.

    Values come from the global ``random`` and ``np.random`` generators, so
    results depend on their current state.

    Args:
        n_restaurants: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per restaurant.
    """
    rows = [
        {
            'restaurant_id': f'REST_{seq_no:04d}',
            'restaurant_name': f'Restaurant_{seq_no}',
            'locality': random.choice(Mumbai_localities),
            'cuisine_type': random.choice(cuisine_type),
            'avg_prep_time_min': np.random.randint(15, 30),
            'restaurant_rating': round(np.random.uniform(3.0, 5.0), 1),
            'is_premium': random.choice([True, False]),
        }
        for seq_no in range(1, n_restaurants + 1)
    ]
    return pd.DataFrame(rows)

In [9]:
# Quick look at the generator output on a small sample.
# Note: this call draws from the seeded global `random` / `np.random` state,
# so running (or skipping) it changes the values later cells produce.
df = generate_restaurants(10)
print(df.head())

  restaurant_id restaurant_name        locality  cuisine_type  \
0     REST_0001    Restaurant_1            CSMT  North indian   
1     REST_0002    Restaurant_2       Santacruz   Continental   
2     REST_0003    Restaurant_3      Dadar West         Asian   
3     REST_0004    Restaurant_4          Colaba  North indian   
4     REST_0005    Restaurant_5  Bhayandar East   Continental   

   avg_prep_time_min  restaurant_rating  is_premium  
0                 21                4.6       False  
1                 29                4.5        True  
2                 27                4.2       False  
3                 24                3.3        True  
4                 25                3.9        True  


In [10]:
# Delivery partner table generator
def generate_delivery_partners(n_partners=300):
    """Build a table of synthetic delivery partners.

    Each row gets a sequential id (``DEL_0001``, ...), a placeholder name,
    a vehicle drawn from ``vehicle_type``, an experience value in ``[1, 60)``
    months, and an average rating in ``[3.5, 5.0]`` rounded to one decimal.

    Values come from the global ``random`` and ``np.random`` generators, so
    results depend on their current state.

    Args:
        n_partners: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per partner.
    """
    roster = []
    for seq_no in range(1, n_partners + 1):
        record = {
            'partner_id': f'DEL_{seq_no:04d}',
            'partner_name': f'Partner_{seq_no}',
            'vehicle_type': random.choice(vehicle_type),
            'experience_months': np.random.randint(1, 60),
            'avg_rating': round(np.random.uniform(3.5, 5.0), 1),
        }
        roster.append(record)
    return pd.DataFrame(roster)

In [29]:
# Quick look at the partner generator on a small sample.
# Note: reuses the name `df` from the restaurant preview cell, and advances
# the seeded global RNG state like any other generator call.
df = generate_delivery_partners(10)
print(df.head())

  partner_id partner_name Vehicle_type  experience_months  avg_rating
0   DEL_0001    Partner_1         Bike                  6         3.8
1   DEL_0002    Partner_2         Bike                  4         3.8
2   DEL_0003    Partner_3         Bike                 18         4.7
3   DEL_0004    Partner_4       Scooty                 34         4.1
4   DEL_0005    Partner_5       Scooty                 36         4.4


In [11]:
# Simulated distance between two localities
def get_distance(locality1, locality2):
    """Return a randomly drawn distance between two localities.

    No real geography is involved: the value is sampled uniformly from
    ``[0.5, 2.0]`` when both names are equal (short hop inside one locality)
    and from ``[3, 15]`` otherwise. Callers store the result as kilometres.

    Both branches are rounded to two decimals so the resulting column has a
    consistent precision (previously the same-locality branch was unrounded).

    Args:
        locality1: Name of the origin locality.
        locality2: Name of the destination locality.

    Returns:
        The sampled distance, rounded to 2 decimal places.
    """
    if locality1 == locality2:
        low, high = 0.5, 2.0   # same locality, hence a shorter distance
    else:
        low, high = 3, 15
    return round(np.random.uniform(low, high), 2)

In [17]:
# Order table generator
def generate_orders(n_orders=5000, start_date='2024-01-01', end_date='2024-12-31',
                    restaurants_df=None, partners_df=None):
    """Simulate food-delivery orders with estimated and actual delivery times.

    For every order a random timestamp, restaurant, delivery locality,
    delivery partner and weather condition are drawn. An estimated delivery
    time is derived from the restaurant's prep time and the distance, and an
    "actual" delivery time is derived by applying random delay factors
    (peak hour, rain, weekend, partner experience, bicycle on longer trips,
    and an occasional random delay).

    Randomness comes from the global ``random`` and ``np.random`` generators
    (``DataFrame.sample`` is called without ``random_state``), so results
    depend on their current state.

    Args:
        n_orders: Number of orders to generate.
        start_date: First possible order day, formatted ``YYYY-MM-DD``.
        end_date: Last possible order day (inclusive), formatted ``YYYY-MM-DD``.
        restaurants_df: Optional restaurant table to sample from. It must
            provide the columns ``restaurant_id``, ``locality``,
            ``cuisine_type`` and ``avg_prep_time_min``. When omitted, a fresh
            table is created with ``generate_restaurants()``.
        partners_df: Optional partner table to sample from. It must provide
            the columns ``partner_id``, ``vehicle_type`` and
            ``experience_months``. When omitted, a fresh table is created
            with ``generate_delivery_partners()``.

    Returns:
        pandas.DataFrame with one row per order.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date`` or either
            date does not match ``YYYY-MM-DD``.
    """
    # Passing the tables in keeps the orders consistent with the restaurant /
    # partner tables the caller saves; generating them here (the previous and
    # still default behaviour) yields tables the caller never sees.
    if restaurants_df is None:
        restaurants_df = generate_restaurants()
    if partners_df is None:
        partners_df = generate_delivery_partners()

    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    span_days = (end - start).days
    if span_days < 0:
        raise ValueError(
            f"end_date {end_date!r} is earlier than start_date {start_date!r}"
        )

    # Relative likelihood of an order per hour of day (index 0 = midnight);
    # the largest weights sit around 12:00 and 19:00-20:00.
    hour_weights = [1, 1, 1, 1, 1, 2, 3, 5, 8, 10, 12, 15,
                    20, 18, 10, 8, 10, 15, 20, 25, 22, 15, 8, 5]
    peak_hours = (12, 13, 19, 20, 21)
    weekend_days = (5, 6)            # datetime.weekday(): Saturday=5, Sunday=6
    monsoon_months = (6, 7, 8, 9)

    # Weights are positional and line up with the entries of Weather_conditions.
    # NOTE(review): outside the monsoon months the last entry gets weight 10
    # versus 5 during the monsoon, and the third entry only 5 - this looks
    # inverted for 'Heavy rains' / 'Clear'; values kept as-is, please confirm.
    monsoon_weather_weights = [20, 30, 25, 20, 5]
    other_weather_weights = [50, 10, 5, 25, 10]

    orders = []

    for i in range(n_orders):
        # --- when -----------------------------------------------------------
        order_day = start + timedelta(days=random.randint(0, span_days))
        hour = random.choices(range(24), weights=hour_weights, k=1)[0]
        minute = random.randint(0, 59)
        order_time = order_day.replace(hour=hour, minute=minute, second=0)
        is_weekend = order_time.weekday() in weekend_days
        is_peak_hour = hour in peak_hours

        # --- who / where ----------------------------------------------------
        restaurant = restaurants_df.sample(1).iloc[0]
        delivery_locality = random.choice(Mumbai_localities)
        distance_km = get_distance(restaurant['locality'], delivery_locality)
        partner = partners_df.sample(1).iloc[0]

        # --- weather (different weights during the monsoon months) -----------
        if order_time.month in monsoon_months:
            weather_weights = monsoon_weather_weights
        else:
            weather_weights = other_weather_weights
        weather = random.choices(Weather_conditions, weights=weather_weights, k=1)[0]

        # --- estimated delivery time ----------------------------------------
        prep_time = restaurant['avg_prep_time_min']
        travel_time_base = distance_km * 3           # 3 minutes per km
        # 10% buffer on top of prep + travel, truncated to whole minutes
        estimated_delivery_time = int((prep_time + travel_time_base) * 1.1)

        # --- actual delivery time -------------------------------------------
        actual_prep_time = prep_time + np.random.randint(-5, 10)
        actual_travel_time = travel_time_base

        # rush hour slows travel
        if is_peak_hour:
            actual_travel_time *= random.uniform(1.2, 1.8)

        # rain slows travel
        # NOTE(review): 'Rainy' has no multiplier here although both
        # 'Light rains' and 'Heavy rains' do - confirm that is intended.
        if weather == 'Heavy rains':
            actual_travel_time *= random.uniform(1.5, 2.5)
        elif weather == 'Light rains':
            actual_travel_time *= random.uniform(1.1, 1.4)

        # weekends are slightly faster
        if is_weekend:
            actual_travel_time *= random.uniform(0.9, 1.0)

        # partner experience: under 6 months is slower, over 24 slightly faster
        if partner['experience_months'] < 6:
            actual_travel_time *= random.uniform(1.1, 1.3)
        elif partner['experience_months'] > 24:
            actual_travel_time *= random.uniform(0.9, 1.0)

        # bicycles lose time on trips longer than 5 km
        if partner['vehicle_type'] == 'Bicycle' and distance_km > 5:
            actual_travel_time += random.uniform(5, 20)

        # 15% chance of an additional random delay
        if random.random() < 0.15:
            actual_travel_time += random.uniform(5, 20)

        actual_delivery_time = int(actual_prep_time + actual_travel_time)

        # --- timestamps and delay -------------------------------------------
        estimated_delivery_datetime = order_time + timedelta(minutes=estimated_delivery_time)
        actual_delivery_datetime = order_time + timedelta(minutes=actual_delivery_time)
        delay_minutes = (
            actual_delivery_datetime - estimated_delivery_datetime
        ).total_seconds() / 60

        if delay_minutes <= 0:
            delay_category = 'On Time'
        elif delay_minutes <= 10:
            delay_category = 'Slightly Delayed'
        elif delay_minutes <= 20:
            delay_category = 'Moderately Delayed'
        else:
            delay_category = 'Severely Delayed'

        order_value = round(np.random.uniform(200, 1500), 2)

        orders.append({
            'order_id': f'ORD_{i+1:06d}',
            'order_datetime': order_time,
            'day_of_week': order_time.strftime('%A'),
            'hour_of_day': hour,
            'is_weekend': is_weekend,
            'is_peak_hour': is_peak_hour,
            'month': order_time.month,
            'restaurant_id': restaurant['restaurant_id'],
            'restaurant_locality': restaurant['locality'],
            'cuisine_type': restaurant['cuisine_type'],
            'delivery_locality': delivery_locality,
            'distance_km': distance_km,
            'partner_id': partner['partner_id'],
            'vehicle_type': partner['vehicle_type'],
            'partner_experience_months': partner['experience_months'],
            'weather_condition': weather,
            'order_value_inr': order_value,
            'estimated_prep_time_min': prep_time,
            'estimated_delivery_time_min': estimated_delivery_time,
            'actual_delivery_time_min': actual_delivery_time,
            'estimated_delivery_datetime': estimated_delivery_datetime,
            'actual_delivery_datetime': actual_delivery_datetime,
            'delay_minutes': round(delay_minutes, 2),
            'is_delayed': delay_minutes > 0,
            'delay_category': delay_category,
        })
    return pd.DataFrame(orders)

In [18]:
# Build the three tables and report their sizes.
print("Generating Mumbai food delivery dataset..")
print("=" * 60)

restuarants_df = generate_restaurants(150)
print(f"Generated {len(restuarants_df)} restuarants")

partners_df = generate_delivery_partners(300)
print(f"Generated {len(partners_df)} delivery partners")

# NOTE(review): generate_orders() is called without the two tables above, so
# it samples restaurants and partners from tables it generates itself.
# restuarants_df / partners_df are therefore not the tables these orders were
# drawn from (ids overlap, but attributes such as locality may differ).
orders_df = generate_orders(5000)
print(f"Generated {len(orders_df)} orders")

Generating Mumbai food delivery dataset..
Generated 150 restuarants
Generated 300 delivery partners
Generated 5000 orders


In [19]:
# NOTE(review): repeats the order generation from the previous cell and
# overwrites orders_df with a fresh draw (the global RNG state has moved on,
# so the rows differ from the first run). Likely a leftover duplicate cell.
orders_df = generate_orders(5000)
print(f"Generated {len(orders_df)} orders")


Generated 5000 orders


In [21]:
# Sanity check: list the column names of the restaurant table.
print(restuarants_df.columns)

Index(['restaurant_id', 'restaurant_name', 'locality', 'cuisine_type',
       'avg_prep_time_min', 'restaurant_rating', 'is_premium'],
      dtype='object')


In [25]:
# Save the three tables as CSV files in the current working directory
# (index=False omits the DataFrame index column).
# The "restuarants" spelling in the first file name is kept as-is; renaming
# it would change the output path.
restuarants_df.to_csv('mumbai_restuarants.csv',index=False)
partners_df.to_csv('mumbai_delivery_partners.csv',index=False)
orders_df.to_csv('mumbai_food_orders.csv',index=False)

In [24]:
# Closing banner: a blank line, then the message framed by two 60-character rules.
print("\n"+ "=" * 60)
print("Dataset Generation completed")
print("=" * 60)


Dataset Generation completed


In [27]:
# Sanity check: show the working directory and its contents to confirm the
# CSV files were written.
# NOTE(review): the printed absolute path exposes the local user directory;
# consider clearing this output before sharing the notebook.
import os 
print(os.getcwd())
print(os.listdir())

c:\Users\apurv\OneDrive\Desktop\food delivery project
['dataset.ipynb', 'mumbai_delivery_partners.csv', 'mumbai_food_orders.csv', 'mumbai_restuarants.csv']
