<a href="https://colab.research.google.com/github/abhinandan6123/AutonoPros/blob/Advanced-Predictive-Analytics-for-Real-Time-Ride-and-Delivery-Pricing/DataSet_Generator_PDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install faker # Install the faker library

import pandas as pd
import numpy as np
from faker import Faker # Now this line should work without error
import random



fake = Faker()

# Settings
num_records = 10000  # Number of records to generate
random_seed = 42  # Fixed seed so that re-running the notebook reproduces the same dataset

# The generators below draw from three independent random sources (Faker for
# coordinates, the stdlib `random` module, and NumPy's global generator).
# Seed all of them; otherwise every run produces a different dataset.
Faker.seed(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)

# Helper Functions
def generate_location():
    """Return a random ``(latitude, longitude)`` tuple drawn from Faker."""
    latitude = fake.latitude()
    longitude = fake.longitude()
    return latitude, longitude

def generate_weather():
    """Pick one weather condition label uniformly at random."""
    conditions = ['Sunny', 'Rainy', 'Snowy', 'Cloudy', 'Windy']
    return random.choice(conditions)

def generate_vehicle_type():
    """Pick one ride vehicle tier uniformly at random."""
    tiers = ['Standard', 'Premium', 'Luxury']
    return random.choice(tiers)

def generate_delivery_type():
    """Pick one delivery service level uniformly at random."""
    service_levels = ['Standard', 'Express']
    return random.choice(service_levels)

def generate_time_of_day():
    """Draw a random hour (0-23) and return the part of day it falls in.

    Hours 5-11 map to 'Morning', 12-16 to 'Afternoon', 17-20 to 'Evening',
    and everything else (21-23 and 0-4) to 'Night'.
    """
    drawn_hour = random.randint(0, 23)

    # Night wraps around midnight, so handle both ends of the day first.
    if drawn_hour < 5 or drawn_hour >= 21:
        return 'Night'
    if drawn_hour < 12:
        return 'Morning'
    if drawn_hour < 17:
        return 'Afternoon'
    return 'Evening'

def generate_day_of_week():
    """Pick one weekday name (Monday through Sunday) uniformly at random."""
    weekdays = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    ]
    return random.choice(weekdays)

def generate_surge_multiplier():
    """Return a price multiplier: usually 1.0, occasionally a surge value.

    With probability 0.2 a surge applies and the multiplier is drawn
    uniformly from [1, 3] and rounded to two decimals; otherwise it is 1.0.
    """
    surge_active = random.random() < 0.2  # 20% chance of surge
    if not surge_active:
        return 1.0
    return round(random.uniform(1, 3), 2)

# Generate Ride Pricing Data
# Draw each coordinate pair with a single generate_location() call instead of
# calling it once per column and throwing half of every result away.
ride_pickups = [generate_location() for _ in range(num_records)]
ride_dropoffs = [generate_location() for _ in range(num_records)]

ride_data = {
    'Trip_ID': range(1, num_records + 1),
    # Faker returns Decimal coordinates; cast to float so the columns get a
    # numeric dtype rather than `object`.
    'Pickup_Latitude': [float(lat) for lat, _ in ride_pickups],
    'Pickup_Longitude': [float(lon) for _, lon in ride_pickups],
    'Dropoff_Latitude': [float(lat) for lat, _ in ride_dropoffs],
    'Dropoff_Longitude': [float(lon) for _, lon in ride_dropoffs],
    'Trip_Distance_km': np.round(np.random.uniform(1, 30, num_records), 2),
    'Trip_Duration_min': np.random.randint(5, 120, num_records),  # upper bound exclusive: 5..119
    'Base_Fare': np.round(np.random.uniform(2.5, 5.0, num_records), 2),
    'Surge_Multiplier': [generate_surge_multiplier() for _ in range(num_records)],
    'Time_of_Day': [generate_time_of_day() for _ in range(num_records)],
    'Day_of_Week': [generate_day_of_week() for _ in range(num_records)],
    'Weather_Condition': [generate_weather() for _ in range(num_records)],
    'Vehicle_Type': [generate_vehicle_type() for _ in range(num_records)],
    'Traffic_Index': np.round(np.random.uniform(0.5, 2.0, num_records), 2),
    'Pickup_Location_Type': random.choices(['Airport', 'Urban', 'Suburban'], k=num_records),
    'Dropoff_Location_Type': random.choices(['Urban', 'Suburban'], k=num_records),
    'Tolls': np.round(np.random.uniform(0, 10, num_records), 2),
    # About 5% of rows carry a cancellation fee between 5 and 20; the rest are 0.
    'Cancellation_Fee': np.where(np.random.rand(num_records) < 0.05, np.round(np.random.uniform(5, 20, num_records), 2), 0),
    'Passenger_Count': np.random.randint(1, 5, num_records),  # upper bound exclusive: 1..4
}

ride_df = pd.DataFrame(ride_data)
# Total fare = (base + distance charge + time charge + tolls + cancellation fee)
# scaled by the surge multiplier, rounded to 2 decimals because it is a
# currency amount and the float arithmetic otherwise leaves noise digits.
ride_df['Total_Fare'] = ((ride_df['Base_Fare'] +
                          (ride_df['Trip_Distance_km'] * 1.5) +  # Per km rate
                          (ride_df['Trip_Duration_min'] * 0.5) +  # Per minute rate
                          ride_df['Tolls'] +
                          ride_df['Cancellation_Fee']) * ride_df['Surge_Multiplier']).round(2)

# Generate Delivery Pricing Data
# Draw each coordinate pair with a single generate_location() call instead of
# calling it once per column and throwing half of every result away.
delivery_pickups = [generate_location() for _ in range(num_records)]
delivery_dropoffs = [generate_location() for _ in range(num_records)]

delivery_data = {
    'Delivery_ID': range(1, num_records + 1),
    # Faker returns Decimal coordinates; cast to float so the columns get a
    # numeric dtype rather than `object`.
    'Pickup_Latitude': [float(lat) for lat, _ in delivery_pickups],
    'Pickup_Longitude': [float(lon) for _, lon in delivery_pickups],
    'Dropoff_Latitude': [float(lat) for lat, _ in delivery_dropoffs],
    'Dropoff_Longitude': [float(lon) for _, lon in delivery_dropoffs],
    'Delivery_Distance_km': np.round(np.random.uniform(1, 50, num_records), 2),
    'Package_Weight_kg': np.round(np.random.uniform(0.1, 20, num_records), 2),
    'Delivery_Time_Est_min': np.random.randint(10, 240, num_records),  # upper bound exclusive: 10..239
    'Order_Value': np.round(np.random.uniform(5, 500, num_records), 2),
    'Platform_Fees': np.round(np.random.uniform(0.5, 5.0, num_records), 2),
    'Time_of_Day': [generate_time_of_day() for _ in range(num_records)],
    'Day_of_Week': [generate_day_of_week() for _ in range(num_records)],
    'Weather_Condition': [generate_weather() for _ in range(num_records)],
    'Delivery_Type': [generate_delivery_type() for _ in range(num_records)],
    'Traffic_Index': np.round(np.random.uniform(0.5, 2.0, num_records), 2),
    'Delivery_Location_Type': random.choices(['Urban', 'Suburban', 'Remote'], k=num_records),
    # About 30% of rows get a discount between 1 and 20; the rest are 0.
    'Discount_Applied': np.where(np.random.rand(num_records) < 0.3, np.round(np.random.uniform(1, 20, num_records), 2), 0),
    'Surge_Multiplier': [generate_surge_multiplier() for _ in range(num_records)],
    'Fuel_Cost': np.round(np.random.uniform(1, 10, num_records), 2),
}

delivery_df = pd.DataFrame(delivery_data)
# Fee = (distance charge + weight charge + platform fees + fuel cost) scaled by
# the surge multiplier, minus the discount. The discount (up to 20) can exceed
# the smallest possible pre-discount fee (about 2.35), so clip at 0 to avoid
# negative fees, then round to 2 decimals because it is a currency amount.
delivery_df['Total_Delivery_Fee'] = (((delivery_df['Delivery_Distance_km'] * 0.8) +  # Per km rate
                                      (delivery_df['Package_Weight_kg'] * 0.5) +  # Per kg rate
                                      delivery_df['Platform_Fees'] +
                                      delivery_df['Fuel_Cost']) * delivery_df['Surge_Multiplier']
                                     - delivery_df['Discount_Applied']).clip(lower=0).round(2)

# Combine into a single Excel file with two sheets
# (sheet name, DataFrame) pairs, in the order the sheets are written.
sheet_frames = (
    ('Ride_Pricing', ride_df),
    ('Delivery_Pricing', delivery_df),
)
with pd.ExcelWriter('Predictive_Analytics_Dataset.xlsx') as writer:
    for sheet_label, frame in sheet_frames:
        # index=False keeps the DataFrame's row index out of the sheet.
        frame.to_excel(writer, sheet_name=sheet_label, index=False)

print("Synthetic dataset 'Predictive_Analytics_Dataset.xlsx' has been generated.")


Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 45.8 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-33.0.0
Synthetic dataset 'Predictive_Analytics_Dataset.xlsx' has been generated.


In [4]:
# `google.colab` is only importable when this notebook runs inside Google Colab.
from google.colab import files
# Hand the workbook written by the generation cell to the browser as a download.
files.download('Predictive_Analytics_Dataset.xlsx')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>