In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [None]:
# Set random seed for reproducibility
np.random.seed(42)

In [None]:
# Number of samples to generate
n_samples = 1000

In [9]:

# Generate service types (auto, bike)
service_types = np.random.choice(['Auto', 'Bike'], n_samples, p=[0.7, 0.3])

In [4]:
# Generate driver IDs (50 unique drivers)
driver_ids = [f"D{str(i).zfill(3)}" for i in range(1, 51)]

In [10]:
# Generate data
data = {
    'driver_id': np.random.choice(driver_ids, n_samples),
    'timestamp': [(datetime(2024, 1, 1) + timedelta(hours=np.random.randint(0, 24*30))).strftime("%Y-%m-%d %H:%M:%S") for _ in range(n_samples)],
    'service_type': service_types,
    'historical_acceptance_rate': np.random.beta(7, 3, n_samples),
    'distance_to_pickup_km': np.random.exponential(2, n_samples),
    'estimated_trip_distance_km': np.random.lognormal(2, 0.5, n_samples),
    'estimated_trip_time_min': np.zeros(n_samples),  # Will calculate based on distance and traffic
    'weather_condition': np.random.choice(['Clear', 'Cloudy', 'Light Rain', 'Heavy Rain', 'Snow'], n_samples, p=[0.5, 0.2, 0.15, 0.1, 0.05]),
    'traffic_congestion_level': np.random.choice(['Low', 'Medium', 'High', 'Severe'], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
    'is_peak_hour': np.zeros(n_samples, dtype=bool),  # Will fill based on time
    'hours_already_worked': np.random.gamma(3, 1, n_samples),
    'accepted_ride': np.zeros(n_samples, dtype=bool)  # Will fill this based on other features
}

In [11]:
# Create DataFrame
df = pd.DataFrame(data)

In [12]:

# Extract time features
df['hour_of_day'] = df['timestamp'].apply(lambda x: int(x.split()[1].split(':')[0]))
df['day_of_week'] = df['timestamp'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday())


In [13]:
# Determine peak hours (weekdays 7-10 AM and 5-8 PM, weekends 6-9 PM)
def is_peak_hour(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    
    if day < 5:  # Weekday
        return (7 <= hour <= 10) or (17 <= hour <= 20)
    else:  # Weekend
        return 18 <= hour <= 21

df['is_peak_hour'] = df.apply(is_peak_hour, axis=1)


In [14]:
# Calculate estimated trip time based on distance and traffic
def estimate_trip_time(row):
    distance = row['estimated_trip_distance_km']
    
    # Base speed in km/min (converted from km/h)
    if row['traffic_congestion_level'] == 'Low':
        speed = 0.5  # 30 km/h
    elif row['traffic_congestion_level'] == 'Medium':
        speed = 0.4  # 24 km/h
    elif row['traffic_congestion_level'] == 'High':
        speed = 0.3  # 18 km/h
    else:  # Severe
        speed = 0.2  # 12 km/h
    
    # Adjust speed based on weather
    if row['weather_condition'] in ['Heavy Rain', 'Snow']:
        speed *= 0.8
    elif row['weather_condition'] == 'Light Rain':
        speed *= 0.9
    
    # Adjust speed based on service type (bikes are faster in traffic)
    if row['service_type'] == 'Bike' and row['traffic_congestion_level'] in ['High', 'Severe']:
        speed *= 1.3
    
    # Calculate time with a minimum of 5 minutes
    return max(5, distance / speed)

df['estimated_trip_time_min'] = df.apply(estimate_trip_time, axis=1)


In [15]:

# Calculate fare amount based on Namma Yatri pricing strategy
def calculate_namma_yatri_fare(row):
    distance = row['estimated_trip_distance_km']
    time = row['estimated_trip_time_min']
    service = row['service_type']
    
    # Set pricing based on service type for Namma Yatri
    if service == 'Auto':
        base_fare = np.random.uniform(30, 40)
        per_km_rate = np.random.uniform(12, 15)
        per_min_rate = np.random.uniform(1, 2)
    else:  # Bike
        base_fare = 30
        per_km_rate = np.random.uniform(10, 12)
        per_min_rate = 1
    
    # Calculate fare components
    distance_fare = per_km_rate * distance
    time_fare = per_min_rate * time
    
    # Total fare (Namma Yatri has no surge)
    total_fare = base_fare + distance_fare + time_fare
    
    # Round to nearest 5 rupees
    return round(total_fare / 5) * 5


In [18]:
df['fare_amount'] = df.apply(calculate_namma_yatri_fare, axis=1)

In [19]:
# For Namma Yatri, driver earnings equal fare amount (0% commission)
df['driver_earnings'] = df['fare_amount']

In [20]:
# Define acceptance probability based on features
def calculate_acceptance_probability(row):
    prob = 0.8  # Base probability
    
    # Adjust based on historical acceptance rate (strong factor)
    prob += 0.15 * row['historical_acceptance_rate']
    
    # Adjust based on distance to pickup (negative factor)
    prob -= 0.05 * min(row['distance_to_pickup_km'], 10) / 2
    
    # Adjust based on earnings (positive factor)
    earnings_factor = min(row['driver_earnings'] / 100, 1)  # Cap at 1
    prob += 0.15 * earnings_factor
    
    # Adjust based on trip distance (slight negative for very long trips)
    if row['estimated_trip_distance_km'] > 20:
        prob -= 0.05
    
    # Adjust based on weather (negative for bad weather)
    if row['weather_condition'] == 'Heavy Rain':
        prob -= 0.1
    elif row['weather_condition'] == 'Snow':
        prob -= 0.15
    
    # Adjust based on traffic (negative for high traffic)
    if row['traffic_congestion_level'] == 'High':
        prob -= 0.05
    elif row['traffic_congestion_level'] == 'Severe':
        prob -= 0.1
    
    # Adjust based on hours already worked (negative if worked many hours)
    if row['hours_already_worked'] > 8:
        prob -= 0.2 * min((row['hours_already_worked'] - 8) / 4, 1)
    
    # Adjust based on time of day (higher during peak hours, though no surge pricing)
    if row['is_peak_hour']:
        prob += 0.05
    elif row['hour_of_day'] in [2, 3, 4, 5]:  # Late night/early morning
        prob -= 0.1
    
    # Adjust based on day of week (higher on weekends)
    if row['day_of_week'] >= 5:  # Weekend
        prob += 0.05
    
    # Ensure probability is between 0 and 1
    return max(0.01, min(0.99, prob))


In [21]:

# Calculate acceptance probability and determine acceptance
df['acceptance_probability'] = df.apply(calculate_acceptance_probability, axis=1)
df['accepted_ride'] = df['acceptance_probability'].apply(lambda x: np.random.random() < x)

# Convert boolean to int for easier analysis
df['accepted_ride'] = df['accepted_ride'].astype(int)

In [22]:
# Reorder columns for clarity
column_order = [
    'driver_id', 'timestamp', 'hour_of_day', 'day_of_week', 'is_peak_hour',
    'service_type', 'historical_acceptance_rate', 
    'distance_to_pickup_km', 'estimated_trip_distance_km', 
    'estimated_trip_time_min', 'fare_amount', 'driver_earnings',
    'weather_condition', 'traffic_congestion_level', 'hours_already_worked', 
    'acceptance_probability', 'accepted_ride'
]

In [23]:
df = df[column_order]

In [25]:
print(df.head())

  driver_id            timestamp  hour_of_day  day_of_week  is_peak_hour  \
0      D027  2024-01-09 09:00:00            9            1          True   
1      D022  2024-01-08 06:00:00            6            0         False   
2      D037  2024-01-11 03:00:00            3            3         False   
3      D019  2024-01-12 04:00:00            4            4         False   
4      D039  2024-01-19 13:00:00           13            4         False   

  service_type  historical_acceptance_rate  distance_to_pickup_km  \
0         Auto                    0.829146               0.593980   
1         Bike                    0.820821               1.868333   
2         Bike                    0.737274               3.407446   
3         Auto                    0.830282               0.196544   
4         Auto                    0.843371               0.645289   

   estimated_trip_distance_km  estimated_trip_time_min  fare_amount  \
0                    3.136819                 7.842048   

In [26]:

# Save to CSV
df.to_csv('namma_yatri_driver_acceptance_dataset.csv', index=False)


#### Generate more!!

In [27]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(124)  # Different seed for more variation

# Number of samples to generate
n_samples = 2500  # Increased sample size

# Generate driver IDs (100 unique drivers)
driver_ids = [f"D{str(i).zfill(3)}" for i in range(1, 101)]  # More unique drivers

# Generate service types (auto, bike) for Namma Yatri
service_types = np.random.choice(['Auto', 'Bike'], n_samples, p=[0.7, 0.3])

# Generate data with different time period (May-August 2024)
data = {
    'driver_id': np.random.choice(driver_ids, n_samples),
    'timestamp': [(datetime(2024, 5, 1) + timedelta(days=np.random.randint(0, 120), 
                                                   hours=np.random.randint(0, 24),
                                                   minutes=np.random.randint(0, 60))).strftime("%Y-%m-%d %H:%M:%S") 
                 for _ in range(n_samples)],
    'service_type': service_types,
    'historical_acceptance_rate': np.random.beta(7, 3, n_samples),
    'distance_to_pickup_km': np.random.exponential(2, n_samples),
    'estimated_trip_distance_km': np.random.lognormal(2, 0.5, n_samples),
    'estimated_trip_time_min': np.zeros(n_samples),  # Will calculate based on distance and traffic
    'weather_condition': np.zeros(n_samples, dtype=object),  # Will fill based on month and random factors
    'traffic_congestion_level': np.zeros(n_samples, dtype=object),  # Will fill based on time and day
    'is_peak_hour': np.zeros(n_samples, dtype=bool),  # Will fill based on time
    'hours_already_worked': np.random.gamma(3, 1, n_samples),
    'accepted_ride': np.zeros(n_samples, dtype=bool)  # Will fill this based on other features
}

# Create DataFrame
df = pd.DataFrame(data)

# Extract time features
df['datetime'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day

# Generate seasonal weather patterns
def generate_weather(row):
    month = row['month']
    random_factor = np.random.random()
    
    # May-June: Mostly clear, some light rain
    if month in [5, 6]:
        if random_factor < 0.6:
            return 'Clear'
        elif random_factor < 0.8:
            return 'Cloudy'
        else:
            return 'Light Rain'
    
    # July-August: Monsoon season, more rain
    elif month in [7, 8]:
        if random_factor < 0.3:
            return 'Clear'
        elif random_factor < 0.5:
            return 'Cloudy'
        elif random_factor < 0.8:
            return 'Light Rain'
        else:
            return 'Heavy Rain'
    
    return 'Clear'  # Default

df['weather_condition'] = df.apply(generate_weather, axis=1)

# Generate traffic patterns based on time and day
def generate_traffic(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    random_factor = np.random.random()
    
    # Rush hour weekday traffic
    if day < 5:  # Weekday
        if (8 <= hour <= 10) or (17 <= hour <= 19):  # Morning/evening rush
            if random_factor < 0.4:
                return 'High'
            elif random_factor < 0.7:
                return 'Severe'
            else:
                return 'Medium'
        elif (7 <= hour <= 11) or (16 <= hour <= 20):  # Extended rush periods
            if random_factor < 0.4:
                return 'Medium'
            elif random_factor < 0.7:
                return 'High'
            else:
                return 'Low'
    
    # Weekend traffic patterns
    if day >= 5:  # Weekend
        if 11 <= hour <= 20:  # Daytime shopping/leisure
            if random_factor < 0.5:
                return 'Medium'
            elif random_factor < 0.8:
                return 'High'
            else:
                return 'Low'
    
    # Late night traffic usually light
    if 22 <= hour or hour <= 5:
        if random_factor < 0.8:
            return 'Low'
        else:
            return 'Medium'
    
    # Default times - mixed
    if random_factor < 0.4:
        return 'Low'
    elif random_factor < 0.8:
        return 'Medium'
    else:
        return 'High'

df['traffic_congestion_level'] = df.apply(generate_traffic, axis=1)

# Determine peak hours (weekdays 7-10 AM and 5-8 PM, weekends 11 AM-8 PM)
def is_peak_hour(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    
    if day < 5:  # Weekday
        return (7 <= hour <= 10) or (17 <= hour <= 20)
    else:  # Weekend
        return 11 <= hour <= 20

df['is_peak_hour'] = df.apply(is_peak_hour, axis=1)

# Calculate estimated trip time based on distance and traffic
def estimate_trip_time(row):
    distance = row['estimated_trip_distance_km']
    
    # Base speed in km/min (converted from km/h)
    if row['traffic_congestion_level'] == 'Low':
        speed = 0.5  # 30 km/h
    elif row['traffic_congestion_level'] == 'Medium':
        speed = 0.4  # 24 km/h
    elif row['traffic_congestion_level'] == 'High':
        speed = 0.3  # 18 km/h
    else:  # Severe
        speed = 0.2  # 12 km/h
    
    # Adjust speed based on weather
    if row['weather_condition'] == 'Heavy Rain':
        speed *= 0.8
    elif row['weather_condition'] == 'Light Rain':
        speed *= 0.9
    
    # Adjust speed based on service type (bikes are faster in traffic)
    if row['service_type'] == 'Bike' and row['traffic_congestion_level'] in ['High', 'Severe']:
        speed *= 1.3
    
    # Calculate time with a minimum of 5 minutes
    return max(5, distance / speed)

df['estimated_trip_time_min'] = df.apply(estimate_trip_time, axis=1)

# Calculate fare amount based on Namma Yatri pricing strategy with some seasonal adjustments
def calculate_namma_yatri_fare(row):
    distance = row['estimated_trip_distance_km']
    time = row['estimated_trip_time_min']
    service = row['service_type']
    month = row['month']
    
    # Seasonal base price variations
    month_factor = 1.0
    if month in [6, 7]:  # Slight increase in rainy season
        month_factor = 1.05
    
    # Set pricing based on service type for Namma Yatri
    if service == 'Auto':
        base_fare = np.random.uniform(30, 40) * month_factor
        per_km_rate = np.random.uniform(12, 15) * month_factor
        per_min_rate = np.random.uniform(1, 2) * month_factor
    else:  # Bike
        base_fare = 30 * month_factor
        per_km_rate = np.random.uniform(10, 12) * month_factor
        per_min_rate = 1 * month_factor
    
    # Calculate fare components
    distance_fare = per_km_rate * distance
    time_fare = per_min_rate * time
    
    # Add a small random factor (±5%) to simulate price variations
    random_factor = np.random.uniform(0.95, 1.05)
    
    # Total fare (Namma Yatri has no surge)
    total_fare = (base_fare + distance_fare + time_fare) * random_factor
    
    # Round to nearest 5 rupees
    return round(total_fare / 5) * 5

df['fare_amount'] = df.apply(calculate_namma_yatri_fare, axis=1)

# For Namma Yatri, driver earnings equal fare amount (0% commission)
df['driver_earnings'] = df['fare_amount']

# Generate driver data with more variation
# Create a dictionary to track driver stats
driver_stats = {}
for driver_id in driver_ids:
    # Generate varied driver characteristics
    driver_stats[driver_id] = {
        'experience_months': np.random.randint(1, 36),  # 1 month to 3 years experience
        'vehicle_quality': np.random.choice(['Low', 'Medium', 'High'], p=[0.2, 0.6, 0.2]),
        'preferred_areas': np.random.choice(['Urban', 'Suburban', 'Mixed'], p=[0.4, 0.3, 0.3]),
    }

# Add driver characteristics to main dataframe
df['driver_experience_months'] = df['driver_id'].map(lambda x: driver_stats[x]['experience_months'])
df['vehicle_quality'] = df['driver_id'].map(lambda x: driver_stats[x]['vehicle_quality'])
df['preferred_areas'] = df['driver_id'].map(lambda x: driver_stats[x]['preferred_areas'])

# Define acceptance probability based on features with more complexity
def calculate_acceptance_probability(row):
    prob = 0.75  # Base probability
    
    # Adjust based on historical acceptance rate (strong factor)
    prob += 0.15 * row['historical_acceptance_rate']
    
    # Adjust based on distance to pickup (negative factor)
    prob -= 0.05 * min(row['distance_to_pickup_km'], 10) / 2
    
    # Adjust based on earnings (positive factor)
    earnings_factor = min(row['driver_earnings'] / 100, 1)  # Cap at 1
    prob += 0.15 * earnings_factor
    
    # Adjust based on trip distance (slight negative for very long trips)
    if row['estimated_trip_distance_km'] > 20:
        prob -= 0.05
    elif 5 <= row['estimated_trip_distance_km'] <= 15:
        prob += 0.03  # Slight preference for medium-distance trips
    
    # Adjust based on weather (negative for bad weather)
    if row['weather_condition'] == 'Heavy Rain':
        prob -= 0.15
    elif row['weather_condition'] == 'Light Rain':
        prob -= 0.05
    
    # Adjust based on traffic (negative for high traffic)
    if row['traffic_congestion_level'] == 'High':
        prob -= 0.05
    elif row['traffic_congestion_level'] == 'Severe':
        prob -= 0.12
    
    # Adjust based on hours already worked (negative if worked many hours)
    if row['hours_already_worked'] > 8:
        prob -= 0.2 * min((row['hours_already_worked'] - 8) / 4, 1)
    
    # Adjust based on time of day
    if row['is_peak_hour']:
        prob += 0.05  # Higher during peak hours due to more ride opportunities
    elif row['hour_of_day'] in [2, 3, 4, 5]:  # Late night/early morning
        prob -= 0.1
    
    # Adjust based on day of week
    if row['day_of_week'] >= 5:  # Weekend
        prob += 0.05
    
    # Driver experience factors
    if row['driver_experience_months'] > 24:
        prob += 0.05  # Experienced drivers more likely to accept varied rides
    elif row['driver_experience_months'] < 6:
        prob -= 0.05  # New drivers may be more selective
    
    # Vehicle quality factor
    if row['vehicle_quality'] == 'Low' and row['estimated_trip_distance_km'] > 15:
        prob -= 0.05  # Lower quality vehicles less likely to take long trips
    
    # Ensure probability is between 0 and 1
    return max(0.01, min(0.99, prob))

# Calculate acceptance probability and determine acceptance
df['acceptance_probability'] = df.apply(calculate_acceptance_probability, axis=1)
df['accepted_ride'] = df['acceptance_probability'].apply(lambda x: np.random.random() < x)

# Convert boolean to int for easier analysis
df['accepted_ride'] = df['accepted_ride'].astype(int)

# Reorder columns for clarity
column_order = [
    'driver_id', 'timestamp', 'month', 'day', 'hour_of_day', 'day_of_week', 'is_peak_hour',
    'service_type', 'driver_experience_months', 'vehicle_quality', 'preferred_areas',
    'historical_acceptance_rate', 'distance_to_pickup_km', 'estimated_trip_distance_km', 
    'estimated_trip_time_min', 'fare_amount', 'driver_earnings',
    'weather_condition', 'traffic_congestion_level', 'hours_already_worked', 
    'acceptance_probability', 'accepted_ride'
]
df = df[column_order]

# Print sample of the data
print(df.head(10))
print("\nDataset summary statistics:")
print(df.describe())

# Save to CSV
df.to_csv('namma_yatri_extended_dataset.csv', index=False)
print("\nDataset saved to 'namma_yatri_extended_dataset.csv'")

  driver_id            timestamp  month  day  hour_of_day  day_of_week  \
0      D016  2024-08-26 04:34:00      8   26            4            0   
1      D100  2024-08-10 06:32:00      8   10            6            5   
2      D100  2024-06-30 19:38:00      6   30           19            6   
3      D012  2024-06-12 08:26:00      6   12            8            2   
4      D062  2024-07-15 13:31:00      7   15           13            0   
5      D089  2024-05-28 19:12:00      5   28           19            1   
6      D008  2024-06-04 10:04:00      6    4           10            1   
7      D044  2024-08-25 20:31:00      8   25           20            6   
8      D039  2024-05-06 05:05:00      5    6            5            0   
9      D048  2024-05-10 05:55:00      5   10            5            4   

   is_peak_hour service_type  driver_experience_months vehicle_quality  ...  \
0         False         Auto                         7          Medium  ...   
1         False         Bik

combine the 2 datasets

In [28]:
# Load both CSV files
df1 = pd.read_csv("namma_yatri_extended_dataset.csv")
df2 = pd.read_csv("namma_yatri_driver_acceptance_dataset.csv")

# Combine the datasets
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged dataset
merged_df.to_csv("namma_yatri_merged_dataset.csv", index=False)

print("Merged dataset saved as namma_yatri_merged_dataset.csv ✅")


Merged dataset saved as namma_yatri_merged_dataset.csv ✅


#### Generate more!!

In [29]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(124)  # Different seed for more variation

# Number of samples to generate
n_samples = 50000  # Increased sample size

# Generate driver IDs (100 unique drivers)
driver_ids = [f"D{str(i).zfill(3)}" for i in range(1, 1001)]  # More unique drivers

# Generate service types (auto, bike) for Namma Yatri
service_types = np.random.choice(['Auto', 'Bike'], n_samples, p=[0.7, 0.3])

# Generate timestamps efficiently
start_date = datetime(2024, 5, 1)
end_date = datetime(2024, 8, 31)
date_range = (end_date - start_date).total_seconds()

timestamps = [start_date + timedelta(seconds=np.random.uniform(0, date_range)) for _ in range(n_samples)]


# Generate data with different time period (May-August 2024)
data = {
    'driver_id': np.random.choice(driver_ids, n_samples),
    'timestamp': [ts.strftime("%Y-%m-%d %H:%M:%S") for ts in timestamps],
    'service_type': service_types,
    'historical_acceptance_rate': np.random.beta(7, 3, n_samples),
    'distance_to_pickup_km': np.random.exponential(2, n_samples),
    'estimated_trip_distance_km': np.random.lognormal(2, 0.5, n_samples),
    'estimated_trip_time_min': np.zeros(n_samples),  # Will calculate based on distance and traffic
    'weather_condition': np.zeros(n_samples, dtype=object),  # Will fill based on month and random factors
    'traffic_congestion_level': np.zeros(n_samples, dtype=object),  # Will fill based on time and day
    'is_peak_hour': np.zeros(n_samples, dtype=bool),  # Will fill based on time
    'hours_already_worked': np.random.gamma(3, 1, n_samples),
    'accepted_ride': np.zeros(n_samples, dtype=bool)  # Will fill this based on other features
}

# Create DataFrame
df = pd.DataFrame(data)

# Extract time features
df['datetime'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day

# Generate seasonal weather patterns
def generate_weather(row):
    month = row['month']
    random_factor = np.random.random()
    
    # May-June: Mostly clear, some light rain
    if month in [5, 6]:
        if random_factor < 0.6:
            return 'Clear'
        elif random_factor < 0.8:
            return 'Cloudy'
        else:
            return 'Light Rain'
    
    # July-August: Monsoon season, more rain
    elif month in [7, 8]:
        if random_factor < 0.3:
            return 'Clear'
        elif random_factor < 0.5:
            return 'Cloudy'
        elif random_factor < 0.8:
            return 'Light Rain'
        else:
            return 'Heavy Rain'
    
    return 'Clear'  # Default

df['weather_condition'] = df.apply(generate_weather, axis=1)

# Generate traffic patterns based on time and day
def generate_traffic(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    random_factor = np.random.random()
    
    # Rush hour weekday traffic
    if day < 5:  # Weekday
        if (8 <= hour <= 10) or (17 <= hour <= 19):  # Morning/evening rush
            if random_factor < 0.4:
                return 'High'
            elif random_factor < 0.7:
                return 'Severe'
            else:
                return 'Medium'
        elif (7 <= hour <= 11) or (16 <= hour <= 20):  # Extended rush periods
            if random_factor < 0.4:
                return 'Medium'
            elif random_factor < 0.7:
                return 'High'
            else:
                return 'Low'
    
    # Weekend traffic patterns
    if day >= 5:  # Weekend
        if 11 <= hour <= 20:  # Daytime shopping/leisure
            if random_factor < 0.5:
                return 'Medium'
            elif random_factor < 0.8:
                return 'High'
            else:
                return 'Low'
    
    # Late night traffic usually light
    if 22 <= hour or hour <= 5:
        if random_factor < 0.8:
            return 'Low'
        else:
            return 'Medium'
    
    # Default times - mixed
    if random_factor < 0.4:
        return 'Low'
    elif random_factor < 0.8:
        return 'Medium'
    else:
        return 'High'

df['traffic_congestion_level'] = df.apply(generate_traffic, axis=1)

# Determine peak hours (weekdays 7-10 AM and 5-8 PM, weekends 11 AM-8 PM)
def is_peak_hour(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    
    if day < 5:  # Weekday
        return (7 <= hour <= 10) or (17 <= hour <= 20)
    else:  # Weekend
        return 11 <= hour <= 20

df['is_peak_hour'] = df.apply(is_peak_hour, axis=1)

# Calculate estimated trip time based on distance and traffic
def estimate_trip_time(row):
    distance = row['estimated_trip_distance_km']
    
    # Base speed in km/min (converted from km/h)
    if row['traffic_congestion_level'] == 'Low':
        speed = 0.5  # 30 km/h
    elif row['traffic_congestion_level'] == 'Medium':
        speed = 0.4  # 24 km/h
    elif row['traffic_congestion_level'] == 'High':
        speed = 0.3  # 18 km/h
    else:  # Severe
        speed = 0.2  # 12 km/h
    
    # Adjust speed based on weather
    if row['weather_condition'] == 'Heavy Rain':
        speed *= 0.8
    elif row['weather_condition'] == 'Light Rain':
        speed *= 0.9
    
    # Adjust speed based on service type (bikes are faster in traffic)
    if row['service_type'] == 'Bike' and row['traffic_congestion_level'] in ['High', 'Severe']:
        speed *= 1.3
    
    # Calculate time with a minimum of 5 minutes
    return max(5, distance / speed)

df['estimated_trip_time_min'] = df.apply(estimate_trip_time, axis=1)

# Calculate fare amount based on Namma Yatri pricing strategy with some seasonal adjustments
def calculate_namma_yatri_fare(row):
    distance = row['estimated_trip_distance_km']
    time = row['estimated_trip_time_min']
    service = row['service_type']
    month = row['month']
    
    # Seasonal base price variations
    month_factor = 1.0
    if month in [6, 7]:  # Slight increase in rainy season
        month_factor = 1.05
    
    # Set pricing based on service type for Namma Yatri
    if service == 'Auto':
        base_fare = np.random.uniform(30, 40) * month_factor
        per_km_rate = np.random.uniform(12, 15) * month_factor
        per_min_rate = np.random.uniform(1, 2) * month_factor
    else:  # Bike
        base_fare = 30 * month_factor
        per_km_rate = np.random.uniform(10, 12) * month_factor
        per_min_rate = 1 * month_factor
    
    # Calculate fare components
    distance_fare = per_km_rate * distance
    time_fare = per_min_rate * time
    
    # Add a small random factor (±5%) to simulate price variations
    random_factor = np.random.uniform(0.95, 1.05)
    
    # Total fare (Namma Yatri has no surge)
    total_fare = (base_fare + distance_fare + time_fare) * random_factor
    
    # Round to nearest 5 rupees
    return round(total_fare / 5) * 5

df['fare_amount'] = df.apply(calculate_namma_yatri_fare, axis=1)

# For Namma Yatri, driver earnings equal fare amount (0% commission)
df['driver_earnings'] = df['fare_amount']

# Generate driver data with more variation
# Create a dictionary to track driver stats
driver_stats = {}
for driver_id in driver_ids:
    # Generate varied driver characteristics
    driver_stats[driver_id] = {
        'experience_months': np.random.randint(1, 36),  # 1 month to 3 years experience
        'vehicle_quality': np.random.choice(['Low', 'Medium', 'High'], p=[0.2, 0.6, 0.2]),
        'preferred_areas': np.random.choice(['Urban', 'Suburban', 'Mixed'], p=[0.4, 0.3, 0.3]),
    }

# Add driver characteristics to main dataframe
df['driver_experience_months'] = df['driver_id'].map(lambda x: driver_stats[x]['experience_months'])
df['vehicle_quality'] = df['driver_id'].map(lambda x: driver_stats[x]['vehicle_quality'])
df['preferred_areas'] = df['driver_id'].map(lambda x: driver_stats[x]['preferred_areas'])

# Define acceptance probability based on features with more complexity
def calculate_acceptance_probability(row):
    prob = 0.75  # Base probability
    
    # Adjust based on historical acceptance rate (strong factor)
    prob += 0.15 * row['historical_acceptance_rate']
    
    # Adjust based on distance to pickup (negative factor)
    prob -= 0.05 * min(row['distance_to_pickup_km'], 10) / 2
    
    # Adjust based on earnings (positive factor)
    earnings_factor = min(row['driver_earnings'] / 100, 1)  # Cap at 1
    prob += 0.15 * earnings_factor
    
    # Adjust based on trip distance (slight negative for very long trips)
    if row['estimated_trip_distance_km'] > 20:
        prob -= 0.05
    elif 5 <= row['estimated_trip_distance_km'] <= 15:
        prob += 0.03  # Slight preference for medium-distance trips
    
    # Adjust based on weather (negative for bad weather)
    if row['weather_condition'] == 'Heavy Rain':
        prob -= 0.15
    elif row['weather_condition'] == 'Light Rain':
        prob -= 0.05
    
    # Adjust based on traffic (negative for high traffic)
    if row['traffic_congestion_level'] == 'High':
        prob -= 0.05
    elif row['traffic_congestion_level'] == 'Severe':
        prob -= 0.12
    
    # Adjust based on hours already worked (negative if worked many hours)
    if row['hours_already_worked'] > 8:
        prob -= 0.2 * min((row['hours_already_worked'] - 8) / 4, 1)
    
    # Adjust based on time of day
    if row['is_peak_hour']:
        prob += 0.05  # Higher during peak hours due to more ride opportunities
    elif row['hour_of_day'] in [2, 3, 4, 5]:  # Late night/early morning
        prob -= 0.1
    
    # Adjust based on day of week
    if row['day_of_week'] >= 5:  # Weekend
        prob += 0.05
    
    # Driver experience factors
    if row['driver_experience_months'] > 24:
        prob += 0.05  # Experienced drivers more likely to accept varied rides
    elif row['driver_experience_months'] < 6:
        prob -= 0.05  # New drivers may be more selective
    
    # Vehicle quality factor
    if row['vehicle_quality'] == 'Low' and row['estimated_trip_distance_km'] > 15:
        prob -= 0.05  # Lower quality vehicles less likely to take long trips
    
    # Ensure probability is between 0 and 1
    return max(0.01, min(0.99, prob))

# Calculate acceptance probability and determine acceptance
df['acceptance_probability'] = df.apply(calculate_acceptance_probability, axis=1)
df['accepted_ride'] = df['acceptance_probability'].apply(lambda x: np.random.random() < x)

# Convert boolean to int for easier analysis
df['accepted_ride'] = df['accepted_ride'].astype(int)

# Reorder columns for clarity
column_order = [
    'driver_id', 'timestamp', 'month', 'day', 'hour_of_day', 'day_of_week', 'is_peak_hour',
    'service_type', 'driver_experience_months', 'vehicle_quality', 'preferred_areas',
    'historical_acceptance_rate', 'distance_to_pickup_km', 'estimated_trip_distance_km', 
    'estimated_trip_time_min', 'fare_amount', 'driver_earnings',
    'weather_condition', 'traffic_congestion_level', 'hours_already_worked', 
    'acceptance_probability', 'accepted_ride'
]
df = df[column_order]

# Print sample of the data
print(df.head(10))
print("\nDataset summary statistics:")
print(df.describe())

# Save to CSV
df.to_csv('namma_yatri_huhu_dataset.csv', index=False)
print("\nDataset saved to 'namma_yatri_huhu_dataset.csv'")

  driver_id            timestamp  month  day  hour_of_day  day_of_week  \
0      D355  2024-06-25 11:05:52      6   25           11            1   
1      D913  2024-07-31 20:09:15      7   31           20            2   
2      D490  2024-08-03 06:52:02      8    3            6            5   
3      D568  2024-05-06 11:07:43      5    6           11            0   
4      D447  2024-05-17 20:37:06      5   17           20            4   
5      D898  2024-07-24 18:54:31      7   24           18            2   
6      D972  2024-07-21 03:20:34      7   21            3            6   
7      D923  2024-06-02 15:45:42      6    2           15            6   
8      D357  2024-08-23 02:04:06      8   23            2            4   
9      D400  2024-07-01 17:49:48      7    1           17            0   

   is_peak_hour service_type  driver_experience_months vehicle_quality  ...  \
0         False         Auto                        19            High  ...   
1          True         Bik

#### Merge with the new dataset

In [30]:
# Load both CSV files
df1 = pd.read_csv("namma_yatri_huhu_dataset.csv")
df2 = pd.read_csv("namma_yatri_merged_dataset.csv")

# Combine the datasets
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged dataset
merged_df.to_csv("namma_yatri_combined.csv", index=False)

print("Merged dataset saved as namma_yatri_combined.csv ✅")


Merged dataset saved as namma_yatri_combined.csv ✅


In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(124)  # Different seed for more variation

# Number of samples to generate
n_samples = 50000  # Increased sample size

# Generate driver IDs (1000 unique drivers)
driver_ids = [f"D{str(i).zfill(3)}" for i in range(1, 1001)]  # More unique drivers

# Generate service types (auto, bike) for Namma Yatri
service_types = np.random.choice(['Auto', 'Bike'], n_samples, p=[0.7, 0.3])

# Generate timestamps efficiently
start_date = datetime(2024, 5, 1)
end_date = datetime(2024, 8, 31)
date_range = (end_date - start_date).total_seconds()

timestamps = [start_date + timedelta(seconds=np.random.uniform(0, date_range)) for _ in range(n_samples)]

# Generate data with different time period (May-August 2024)
data = {
    'driver_id': np.random.choice(driver_ids, n_samples),
    'timestamp': [ts.strftime("%Y-%m-%d %H:%M:%S") for ts in timestamps],
    'service_type': service_types,
    'historical_acceptance_rate': np.random.beta(7, 3, n_samples),
    'distance_to_pickup_km': np.random.exponential(2, n_samples),
    'estimated_trip_distance_km': np.random.lognormal(2, 0.5, n_samples),
    'estimated_trip_time_min': np.zeros(n_samples),  # Will calculate based on distance and traffic
    'weather_condition': np.zeros(n_samples, dtype=object),  # Will fill based on month and random factors
    'traffic_congestion_level': np.zeros(n_samples, dtype=object),  # Will fill based on time and day
    'is_peak_hour': np.zeros(n_samples, dtype=bool),  # Will fill based on time
    'hours_already_worked': np.random.gamma(3, 1, n_samples),
    'accepted_ride': np.zeros(n_samples, dtype=bool)  # Will fill this based on other features
}

# Create DataFrame
df = pd.DataFrame(data)

# Extract time features
df['datetime'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day

# Generate seasonal weather patterns
def generate_weather(row):
    month = row['month']
    random_factor = np.random.random()
    
    # May-June: Mostly clear, some light rain
    if month in [5, 6]:
        if random_factor < 0.6:
            return 'Clear'
        elif random_factor < 0.8:
            return 'Cloudy'
        else:
            return 'Light Rain'
    
    # July-August: Monsoon season, more rain
    elif month in [7, 8]:
        if random_factor < 0.3:
            return 'Clear'
        elif random_factor < 0.5:
            return 'Cloudy'
        elif random_factor < 0.8:
            return 'Light Rain'
        else:
            return 'Heavy Rain'
    
    return 'Clear'  # Default

df['weather_condition'] = df.apply(generate_weather, axis=1)

# Generate traffic patterns based on time and day
def generate_traffic(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    random_factor = np.random.random()
    
    # Rush hour weekday traffic
    if day < 5:  # Weekday
        if (8 <= hour <= 10) or (17 <= hour <= 19):  # Morning/evening rush
            if random_factor < 0.4:
                return 'High'
            elif random_factor < 0.7:
                return 'Severe'
            else:
                return 'Medium'
        elif (7 <= hour <= 11) or (16 <= hour <= 20):  # Extended rush periods
            if random_factor < 0.4:
                return 'Medium'
            elif random_factor < 0.7:
                return 'High'
            else:
                return 'Low'
    
    # Weekend traffic patterns
    if day >= 5:  # Weekend
        if 11 <= hour <= 20:  # Daytime shopping/leisure
            if random_factor < 0.5:
                return 'Medium'
            elif random_factor < 0.8:
                return 'High'
            else:
                return 'Low'
    
    # Late night traffic usually light
    if 22 <= hour or hour <= 5:
        if random_factor < 0.8:
            return 'Low'
        else:
            return 'Medium'
    
    # Default times - mixed
    if random_factor < 0.4:
        return 'Low'
    elif random_factor < 0.8:
        return 'Medium'
    else:
        return 'High'

df['traffic_congestion_level'] = df.apply(generate_traffic, axis=1)

# Determine peak hours (weekdays 7-10 AM and 5-8 PM, weekends 11 AM-8 PM)
def is_peak_hour(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    
    if day < 5:  # Weekday
        return (7 <= hour <= 10) or (17 <= hour <= 20)
    else:  # Weekend
        return 11 <= hour <= 20

df['is_peak_hour'] = df.apply(is_peak_hour, axis=1)

# Calculate estimated trip time based on distance and traffic
def estimate_trip_time(row):
    distance = row['estimated_trip_distance_km']
    
    # Base speed in km/min (converted from km/h)
    if row['traffic_congestion_level'] == 'Low':
        speed = 0.5  # 30 km/h
    elif row['traffic_congestion_level'] == 'Medium':
        speed = 0.4  # 24 km/h
    elif row['traffic_congestion_level'] == 'High':
        speed = 0.3  # 18 km/h
    else:  # Severe
        speed = 0.2  # 12 km/h
    
    # Adjust speed based on weather
    if row['weather_condition'] == 'Heavy Rain':
        speed *= 0.8
    elif row['weather_condition'] == 'Light Rain':
        speed *= 0.9
    
    # Adjust speed based on service type (bikes are faster in traffic)
    if row['service_type'] == 'Bike' and row['traffic_congestion_level'] in ['High', 'Severe']:
        speed *= 1.3
    
    # Calculate time with a minimum of 5 minutes
    return max(5, distance / speed)

df['estimated_trip_time_min'] = df.apply(estimate_trip_time, axis=1)

# Calculate fare amount based on Namma Yatri pricing strategy with some seasonal adjustments
def calculate_namma_yatri_fare(row):
    distance = row['estimated_trip_distance_km']
    time = row['estimated_trip_time_min']
    service = row['service_type']
    month = row['month']
    
    # Seasonal base price variations
    month_factor = 1.0
    if month in [6, 7]:  # Slight increase in rainy season
        month_factor = 1.05
    
    # Set pricing based on service type for Namma Yatri
    if service == 'Auto':
        base_fare = np.random.uniform(30, 40) * month_factor
        per_km_rate = np.random.uniform(12, 15) * month_factor
        per_min_rate = np.random.uniform(1, 2) * month_factor
    else:  # Bike
        base_fare = 30 * month_factor
        per_km_rate = np.random.uniform(10, 12) * month_factor
        per_min_rate = 1 * month_factor
    
    # Calculate fare components
    distance_fare = per_km_rate * distance
    time_fare = per_min_rate * time
    
    # Add a small random factor (±5%) to simulate price variations
    random_factor = np.random.uniform(0.95, 1.05)
    
    # Total fare (Namma Yatri has no surge)
    total_fare = (base_fare + distance_fare + time_fare) * random_factor
    
    # Round to nearest 5 rupees
    return round(total_fare / 5) * 5

df['fare_amount'] = df.apply(calculate_namma_yatri_fare, axis=1)

# For Namma Yatri, driver earnings equal fare amount (0% commission)
df['driver_earnings'] = df['fare_amount']

# Generate driver data with more variation
# Create a dictionary to track driver stats
driver_stats = {}
for driver_id in driver_ids:
    # Generate varied driver characteristics
    driver_stats[driver_id] = {
        'experience_months': np.random.randint(1, 36),  # 1 month to 3 years experience
        'vehicle_quality': np.random.choice(['Low', 'Medium', 'High'], p=[0.2, 0.6, 0.2]),
        'preferred_areas': np.random.choice(['Urban', 'Suburban', 'Mixed'], p=[0.4, 0.3, 0.3]),
    }

# Add driver characteristics to main dataframe
df['driver_experience_months'] = df['driver_id'].map(lambda x: driver_stats[x]['experience_months'])
df['vehicle_quality'] = df['driver_id'].map(lambda x: driver_stats[x]['vehicle_quality'])
df['preferred_areas'] = df['driver_id'].map(lambda x: driver_stats[x]['preferred_areas'])

# Define acceptance probability based on features with more complexity
def calculate_acceptance_probability(row):
    # Base probability - INCREASED to achieve higher acceptance rate
    prob = 0.85  # Higher base probability to target 87.8% acceptance
    
    # Adjust based on historical acceptance rate (strong factor)
    prob += 0.15 * row['historical_acceptance_rate']
    
    # Adjust based on pickup distance (negative factor)
    prob -= 0.05 * min(row['distance_to_pickup_km'], 10) / 2
    
    # Adjust based on earnings (positive factor)
    earnings_factor = min(row['driver_earnings'] / 100, 1)  # Cap at 1
    prob += 0.15 * earnings_factor
    
    # Adjust based on trip distance
    if row['estimated_trip_distance_km'] > 20:
        prob -= 0.06
    elif row['estimated_trip_distance_km'] > 15:
        prob -= 0.03
    elif 5 <= row['estimated_trip_distance_km'] <= 15:
        prob += 0.04  # Preference for medium-distance trips
    
    # Adjust based on weather
    if row['weather_condition'] == 'Heavy Rain':
        prob -= 0.15
    elif row['weather_condition'] == 'Light Rain':
        prob -= 0.06
    elif row['weather_condition'] == 'Cloudy':
        prob -= 0.02
    
    # Adjust based on traffic
    if row['traffic_congestion_level'] == 'High':
        prob -= 0.07
    elif row['traffic_congestion_level'] == 'Severe':
        prob -= 0.12
    elif row['traffic_congestion_level'] == 'Medium':
        prob -= 0.02
    
    # Adjust based on hours worked
    if row['hours_already_worked'] > 8:
        prob -= 0.2 * min((row['hours_already_worked'] - 8) / 4, 1)
    elif row['hours_already_worked'] > 6:
        prob -= 0.08
    
    # Adjust based on time of day
    if row['is_peak_hour']:
        prob += 0.06  # Higher during peak hours due to more ride opportunities
    elif row['hour_of_day'] in [2, 3, 4, 5]:  # Late night/early morning
        prob -= 0.10
    
    # Adjust based on day of week
    if row['day_of_week'] >= 5:  # Weekend
        prob += 0.04
    
    # Adjust based on driver experience
    if row['driver_experience_months'] > 24:
        prob += 0.05  # Experienced drivers more likely to accept varied rides
    elif row['driver_experience_months'] < 6:
        prob -= 0.06  # New drivers may be more selective
    
    # Adjust based on vehicle quality
    if row['vehicle_quality'] == 'Low' and row['estimated_trip_distance_km'] > 15:
        prob -= 0.06
    elif row['vehicle_quality'] == 'Medium' and row['estimated_trip_distance_km'] > 18:
        prob -= 0.02
    
    # Adjust based on preferred areas
    if row['preferred_areas'] == 'Urban' and row['traffic_congestion_level'] in ['High', 'Severe']:
        prob -= 0.03
    elif row['preferred_areas'] == 'Suburban' and row['estimated_trip_distance_km'] < 5:
        prob -= 0.03
    
    # Adjust based on service type
    if row['service_type'] == 'Bike' and row['weather_condition'] in ['Light Rain', 'Heavy Rain']:
        prob -= 0.04
    
    # Add random variation
    prob += np.random.normal(0, 0.04)
    
    # Ensure probability is between 0.01 and 0.99
    return max(0.01, min(0.99, prob))

# Calculate acceptance probability and determine acceptance
df['acceptance_probability'] = df.apply(calculate_acceptance_probability, axis=1)
df['accepted_ride'] = df['acceptance_probability'].apply(lambda x: np.random.random() < x)

# Convert boolean to int for easier analysis
df['accepted_ride'] = df['accepted_ride'].astype(int)

# Calculate and print class distribution
acceptance_rate = df['accepted_ride'].mean() * 100
rejection_rate = 100 - acceptance_rate
print(f"Class distribution: Accepted {acceptance_rate:.2f}% : Rejected {rejection_rate:.2f}%")

# Target acceptance rate
target_acceptance = 87.8  # The target acceptance rate

# If the acceptance rate is not close to target, adjust it
if abs(acceptance_rate - target_acceptance) > 0.5:  # If more than 0.5% off target
    if acceptance_rate > target_acceptance:
        # Too many accepted rides - flip some to rejected
        current_accepted = df['accepted_ride'].sum()
        target_accepted = int(target_acceptance * n_samples / 100)
        to_flip = current_accepted - target_accepted
        
        if to_flip > 0:
            # Get indices of accepted rides
            accepted_indices = df[df['accepted_ride'] == 1].index.tolist()
            # Randomly select indices to flip
            flip_indices = np.random.choice(accepted_indices, size=to_flip, replace=False)
            # Flip these from accepted to rejected
            df.loc[flip_indices, 'accepted_ride'] = 0
    else:
        # Too few accepted rides - flip some to accepted
        current_accepted = df['accepted_ride'].sum()
        target_accepted = int(target_acceptance * n_samples / 100)
        to_flip = target_accepted - current_accepted
        
        if to_flip > 0:
            # Get indices of rejected rides
            rejected_indices = df[df['accepted_ride'] == 0].index.tolist()
            # Randomly select indices to flip
            flip_indices = np.random.choice(rejected_indices, size=to_flip, replace=False)
            # Flip these from rejected to accepted
            df.loc[flip_indices, 'accepted_ride'] = 1
    
    # Recalculate and print new class distribution
    new_acceptance_rate = df['accepted_ride'].mean() * 100
    new_rejection_rate = 100 - new_acceptance_rate
    print(f"Adjusted class distribution: Accepted {new_acceptance_rate:.2f}% : Rejected {new_rejection_rate:.2f}%")

# Reorder columns for clarity
column_order = [
    'driver_id', 'timestamp', 'month', 'day', 'hour_of_day', 'day_of_week', 'is_peak_hour',
    'service_type', 'driver_experience_months', 'vehicle_quality', 'preferred_areas',
    'historical_acceptance_rate', 'distance_to_pickup_km', 'estimated_trip_distance_km', 
    'estimated_trip_time_min', 'fare_amount', 'driver_earnings',
    'weather_condition', 'traffic_congestion_level', 'hours_already_worked', 
    'acceptance_probability', 'accepted_ride'
]
df = df[column_order]

# Print sample of the data
print(df.head(10))
print("\nDataset summary statistics:")
print(df.describe())

# Save to CSV
df.to_csv('namma_yatri_imbalanced_dataset.csv', index=False)
print("\nDataset saved to 'namma_yatri_imbalanced_dataset.csv'")

Class distribution: Accepted 95.94% : Rejected 4.06%
Adjusted class distribution: Accepted 87.80% : Rejected 12.20%
  driver_id            timestamp  month  day  hour_of_day  day_of_week  \
0      D355  2024-06-25 11:05:52      6   25           11            1   
1      D913  2024-07-31 20:09:15      7   31           20            2   
2      D490  2024-08-03 06:52:02      8    3            6            5   
3      D568  2024-05-06 11:07:43      5    6           11            0   
4      D447  2024-05-17 20:37:06      5   17           20            4   
5      D898  2024-07-24 18:54:31      7   24           18            2   
6      D972  2024-07-21 03:20:34      7   21            3            6   
7      D923  2024-06-02 15:45:42      6    2           15            6   
8      D357  2024-08-23 02:04:06      8   23            2            4   
9      D400  2024-07-01 17:49:48      7    1           17            0   

   is_peak_hour service_type  driver_experience_months vehicle_qualit

In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(124)  # Different seed for more variation

# Number of samples to generate
n_samples = 50000  # Increased sample size

# Generate driver IDs (1000 unique drivers)
driver_ids = [f"D{str(i).zfill(3)}" for i in range(1, 1001)]  # More unique drivers

# Generate service types (auto, bike) for Namma Yatri
service_types = np.random.choice(['Auto', 'Bike'], n_samples, p=[0.7, 0.3])

# Generate timestamps efficiently
start_date = datetime(2024, 5, 1)
end_date = datetime(2024, 8, 31)
date_range = (end_date - start_date).total_seconds()

timestamps = [start_date + timedelta(seconds=np.random.uniform(0, date_range)) for _ in range(n_samples)]

# Generate data with different time period (May-August 2024)
data = {
    'driver_id': np.random.choice(driver_ids, n_samples),
    'timestamp': [ts.strftime("%Y-%m-%d %H:%M:%S") for ts in timestamps],
    'service_type': service_types,
    'historical_acceptance_rate': np.random.beta(7, 3, n_samples),
    'distance_to_pickup_km': np.random.exponential(2, n_samples),
    'estimated_trip_distance_km': np.random.lognormal(2, 0.5, n_samples),
    'estimated_trip_time_min': np.zeros(n_samples),  # Will calculate based on distance and traffic
    'weather_condition': np.zeros(n_samples, dtype=object),  # Will fill based on month and random factors
    'traffic_congestion_level': np.zeros(n_samples, dtype=object),  # Will fill based on time and day
    'is_peak_hour': np.zeros(n_samples, dtype=bool),  # Will fill based on time
    'hours_already_worked': np.random.gamma(3, 1, n_samples),
    'accepted_ride': np.zeros(n_samples, dtype=bool)  # Will fill this based on other features
}

# Create DataFrame
df = pd.DataFrame(data)

# Extract time features
df['datetime'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day

# Generate more extreme seasonal weather patterns
def generate_weather(row):
    month = row['month']
    random_factor = np.random.random()
    hour = row['hour_of_day']
    
    # May-June: Mostly clear, some light rain (more extreme in evenings)
    if month in [5, 6]:
        if hour >= 16 and random_factor < 0.4:  # Evening thunderstorms more likely
            return 'Heavy Rain'
        elif random_factor < 0.5:
            return 'Clear'
        elif random_factor < 0.7:
            return 'Cloudy'
        else:
            return 'Light Rain'
    
    # July-August: Intense monsoon season
    elif month in [7, 8]:
        if random_factor < 0.25:
            return 'Clear'
        elif random_factor < 0.4:
            return 'Cloudy'
        elif random_factor < 0.6:
            return 'Light Rain'
        else:
            return 'Heavy Rain'
    
    return 'Clear'  # Default

df['weather_condition'] = df.apply(generate_weather, axis=1)

# Generate more extreme traffic patterns based on time and day
def generate_traffic(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    month = row['month']
    random_factor = np.random.random()
    
    # More extreme rush hour weekday traffic (especially in July-August rainy season)
    seasonal_factor = 1.2 if month in [7, 8] else 1.0
    
    if day < 5:  # Weekday
        if (8 <= hour <= 10) or (17 <= hour <= 19):  # Core rush hours
            if random_factor < 0.3 * seasonal_factor:
                return 'Severe'
            elif random_factor < 0.7 * seasonal_factor:
                return 'High'
            else:
                return 'Medium'
        elif (7 <= hour <= 11) or (16 <= hour <= 20):  # Extended rush periods
            if random_factor < 0.2 * seasonal_factor:
                return 'Severe'
            elif random_factor < 0.5 * seasonal_factor:
                return 'High'
            else:
                return 'Medium'
    
    # Weekend traffic patterns - more congested during rainy season
    if day >= 5:  # Weekend
        if 11 <= hour <= 20:  # Daytime shopping/leisure
            if random_factor < 0.3 * seasonal_factor:
                return 'High'
            elif random_factor < 0.6 * seasonal_factor:
                return 'Medium'
            else:
                return 'Low'
    
    # Late night traffic usually light
    if 22 <= hour or hour <= 5:
        if random_factor < 0.7:
            return 'Low'
        else:
            return 'Medium'
    
    # Default times - mixed
    if random_factor < 0.3:
        return 'Low'
    elif random_factor < 0.7:
        return 'Medium'
    else:
        return 'High'

df['traffic_congestion_level'] = df.apply(generate_traffic, axis=1)

# Determine peak hours (weekdays 7-10 AM and 5-8 PM, weekends 11 AM-8 PM)
def is_peak_hour(row):
    hour = row['hour_of_day']
    day = row['day_of_week']
    
    if day < 5:  # Weekday
        return (7 <= hour <= 10) or (17 <= hour <= 20)
    else:  # Weekend
        return 11 <= hour <= 20

df['is_peak_hour'] = df.apply(is_peak_hour, axis=1)

# Calculate estimated trip time based on distance and traffic with MORE EXTREME variations
def estimate_trip_time(row):
    distance = row['estimated_trip_distance_km']
    
    # Base speed in km/min (converted from km/h) - more extreme differences
    if row['traffic_congestion_level'] == 'Low':
        speed = 0.6  # 36 km/h
    elif row['traffic_congestion_level'] == 'Medium':
        speed = 0.4  # 24 km/h
    elif row['traffic_congestion_level'] == 'High':
        speed = 0.25  # 15 km/h
    else:  # Severe
        speed = 0.15  # 9 km/h - extremely slow
    
    # More extreme weather adjustments
    if row['weather_condition'] == 'Heavy Rain':
        speed *= 0.6  # Much slower in heavy rain
    elif row['weather_condition'] == 'Light Rain':
        speed *= 0.8  # Moderately slower in light rain
    
    # Adjust speed based on service type (bikes are faster in traffic)
    if row['service_type'] == 'Bike' and row['traffic_congestion_level'] in ['High', 'Severe']:
        speed *= 1.5  # Bikes much more advantageous in heavy traffic
    
    # Calculate time with a minimum of 5 minutes
    return max(5, distance / speed)

df['estimated_trip_time_min'] = df.apply(estimate_trip_time, axis=1)

# Calculate fare amount based on Namma Yatri pricing with STRONGER seasonal and traffic adjustments
def calculate_namma_yatri_fare(row):
    distance = row['estimated_trip_distance_km']
    time = row['estimated_trip_time_min']
    service = row['service_type']
    month = row['month']
    weather = row['weather_condition']
    traffic = row['traffic_congestion_level']
    
    # More extreme seasonal price variations
    month_factor = 1.0
    if month in [7, 8]:  # Significant increase during peak monsoon
        month_factor = 1.15
    elif month in [5, 6]:  # Slight increase in early monsoon
        month_factor = 1.05
    
    # Weather factor - higher fares in worse weather
    weather_factor = 1.0
    if weather == 'Heavy Rain':
        weather_factor = 1.2
    elif weather == 'Light Rain':
        weather_factor = 1.1
    
    # Traffic factor - higher rates during congestion
    traffic_factor = 1.0
    if traffic == 'Severe':
        traffic_factor = 1.15
    elif traffic == 'High':
        traffic_factor = 1.1
    
    # Set pricing based on service type for Namma Yatri
    if service == 'Auto':
        base_fare = np.random.uniform(30, 40) * month_factor
        per_km_rate = np.random.uniform(12, 15) * month_factor * weather_factor
        per_min_rate = np.random.uniform(1, 2) * month_factor * traffic_factor
    else:  # Bike
        base_fare = 30 * month_factor
        per_km_rate = np.random.uniform(10, 12) * month_factor * weather_factor
        per_min_rate = 1 * month_factor * traffic_factor
    
    # Calculate fare components
    distance_fare = per_km_rate * distance
    time_fare = per_min_rate * time
    
    # Add a small random factor (±5%) to simulate price variations
    random_factor = np.random.uniform(0.95, 1.05)
    
    # Total fare (Namma Yatri has no surge, but we're adding weather/traffic factors)
    total_fare = (base_fare + distance_fare + time_fare) * random_factor
    
    # Round to nearest 5 rupees
    return round(total_fare / 5) * 5

df['fare_amount'] = df.apply(calculate_namma_yatri_fare, axis=1)

# For Namma Yatri, driver earnings equal fare amount (0% commission)
df['driver_earnings'] = df['fare_amount']

# Generate driver data with more extreme variation
# Create a dictionary to track driver stats
driver_stats = {}
for driver_id in driver_ids:
    # Generate varied driver characteristics
    driver_stats[driver_id] = {
        'experience_months': np.random.randint(1, 36),  # 1 month to 3 years experience
        'vehicle_quality': np.random.choice(['Low', 'Medium', 'High'], p=[0.2, 0.6, 0.2]),
        'preferred_areas': np.random.choice(['Urban', 'Suburban', 'Mixed'], p=[0.4, 0.3, 0.3]),
    }

# Add driver characteristics to main dataframe
df['driver_experience_months'] = df['driver_id'].map(lambda x: driver_stats[x]['experience_months'])
df['vehicle_quality'] = df['driver_id'].map(lambda x: driver_stats[x]['vehicle_quality'])
df['preferred_areas'] = df['driver_id'].map(lambda x: driver_stats[x]['preferred_areas'])

# Define acceptance probability based on features with MUCH STRONGER relationships
def calculate_acceptance_probability(row):
    # Base probability
    prob = 0.83  # Slightly lower base to allow for stronger feature effects
    
    # --- INCREASED IMPACT FACTORS ---
    
    # Strong impact from historical acceptance rate
    prob += 0.2 * row['historical_acceptance_rate']
    
    # Distance to pickup - STRONGER negative impact
    prob -= 0.12 * min(row['distance_to_pickup_km'], 10) / 2  # Increased from 0.05 to 0.12
    
    # EARNINGS - MUCH STRONGER positive impact
    earnings_factor = min(row['driver_earnings'] / 100, 1.5)  # Increased cap to 1.5
    prob += 0.25 * earnings_factor  # Increased from 0.15 to 0.25
    
    # Trip distance - STRONGER preferences
    if row['estimated_trip_distance_km'] > 20:
        prob -= 0.15  # Increased from 0.06 to 0.15
    elif row['estimated_trip_distance_km'] > 15:
        prob -= 0.08  # Increased from 0.03 to 0.08
    elif 5 <= row['estimated_trip_distance_km'] <= 15:
        prob += 0.09  # Increased from 0.04 to 0.09
    
    # WEATHER - MUCH STRONGER impact
    if row['weather_condition'] == 'Heavy Rain':
        prob -= 0.30  # Doubled from 0.15 to 0.30
    elif row['weather_condition'] == 'Light Rain':
        prob -= 0.15  # Increased from 0.06 to 0.15
    elif row['weather_condition'] == 'Cloudy':
        prob -= 0.05  # Increased from 0.02 to 0.05
    
    # TRAFFIC - MUCH STRONGER impact
    if row['traffic_congestion_level'] == 'Severe':
        prob -= 0.25  # Doubled from 0.12 to 0.25
    elif row['traffic_congestion_level'] == 'High':
        prob -= 0.15  # Doubled from 0.07 to 0.15
    elif row['traffic_congestion_level'] == 'Medium':
        prob -= 0.05  # Increased from 0.02 to 0.05
    
    # Hours worked - STRONGER impact
    if row['hours_already_worked'] > 8:
        prob -= 0.35 * min((row['hours_already_worked'] - 8) / 4, 1)  # Increased from 0.2 to 0.35
    elif row['hours_already_worked'] > 6:
        prob -= 0.15  # Increased from 0.08 to 0.15
    
    # Time of day - STRONGER impact
    if row['is_peak_hour']:
        prob += 0.12  # Doubled from 0.06 to 0.12
    elif row['hour_of_day'] in [2, 3, 4, 5]:  # Late night/early morning
        prob -= 0.20  # Doubled from 0.10 to 0.20
    
    # Day of week - STRONGER impact
    if row['day_of_week'] >= 5:  # Weekend
        prob += 0.08  # Doubled from 0.04 to 0.08
    
    # Driver experience - STRONGER impact
    if row['driver_experience_months'] > 24:
        prob += 0.10  # Doubled from 0.05 to 0.10
    elif row['driver_experience_months'] < 6:
        prob -= 0.12  # Doubled from 0.06 to 0.12
    
    # Vehicle quality - STRONGER impact
    if row['vehicle_quality'] == 'Low' and row['estimated_trip_distance_km'] > 15:
        prob -= 0.14  # Increased from 0.06 to 0.14
    elif row['vehicle_quality'] == 'Medium' and row['estimated_trip_distance_km'] > 18:
        prob -= 0.06  # Increased from 0.02 to 0.06
    
    # Preferred areas - STRONGER impact
    if row['preferred_areas'] == 'Urban' and row['traffic_congestion_level'] in ['High', 'Severe']:
        prob -= 0.08  # Increased from 0.03 to 0.08
    elif row['preferred_areas'] == 'Suburban' and row['estimated_trip_distance_km'] < 5:
        prob -= 0.08  # Increased from 0.03 to 0.08
    
    # Service type - STRONGER impact with weather
    if row['service_type'] == 'Bike' and row['weather_condition'] in ['Light Rain', 'Heavy Rain']:
        prob -= 0.18  # Significantly increased from 0.04 to 0.18
        if row['weather_condition'] == 'Heavy Rain':
            prob -= 0.12  # Additional penalty for bikes in heavy rain
    
    # Add combined effect of several factors
    # If it's a severe traffic + heavy rain + bike + long distance, dramatically reduce acceptance
    if (row['traffic_congestion_level'] == 'Severe' and 
        row['weather_condition'] == 'Heavy Rain' and 
        row['service_type'] == 'Bike' and
        row['estimated_trip_distance_km'] > 15):
        prob -= 0.30  # Severe combined penalty
    
    # Add high fare incentive: If fare is very high, increase acceptance likelihood
    if row['fare_amount'] > 200:
        prob += 0.15
    
    # Special time interaction: Drivers more likely to accept short trips at end of day
    if row['hour_of_day'] >= 21 and row['estimated_trip_distance_km'] < 10:
        prob += 0.10
    
    # Add random variation (reduced slightly to allow stronger deterministic factors)
    prob += np.random.normal(0, 0.035)
    
    # Ensure probability is between 0.01 and 0.99
    return max(0.01, min(0.99, prob))

# Calculate acceptance probability and determine acceptance
df['acceptance_probability'] = df.apply(calculate_acceptance_probability, axis=1)
df['accepted_ride'] = df['acceptance_probability'].apply(lambda x: np.random.random() < x)

# Convert boolean to int for easier analysis
df['accepted_ride'] = df['accepted_ride'].astype(int)

# Calculate and print class distribution
acceptance_rate = df['accepted_ride'].mean() * 100
rejection_rate = 100 - acceptance_rate
print(f"Class distribution: Accepted {acceptance_rate:.2f}% : Rejected {rejection_rate:.2f}%")

# Target acceptance rate
target_acceptance = 87.8  # The target acceptance rate

# If the acceptance rate is not close to target, adjust it
if abs(acceptance_rate - target_acceptance) > 0.5:  # If more than 0.5% off target
    if acceptance_rate > target_acceptance:
        # Too many accepted rides - flip some to rejected
        current_accepted = df['accepted_ride'].sum()
        target_accepted = int(target_acceptance * n_samples / 100)
        to_flip = current_accepted - target_accepted
        
        if to_flip > 0:
            # Get indices of accepted rides with lowest probabilities
            accepted_indices = df[df['accepted_ride'] == 1].sort_values('acceptance_probability').index[:to_flip].tolist()
            # Flip these from accepted to rejected
            df.loc[accepted_indices, 'accepted_ride'] = 0
    else:
        # Too few accepted rides - flip some to accepted
        current_accepted = df['accepted_ride'].sum()
        target_accepted = int(target_acceptance * n_samples / 100)
        to_flip = target_accepted - current_accepted
        
        if to_flip > 0:
            # Get indices of rejected rides with highest probabilities
            rejected_indices = df[df['accepted_ride'] == 0].sort_values('acceptance_probability', ascending=False).index[:to_flip].tolist()
            # Flip these from rejected to accepted
            df.loc[rejected_indices, 'accepted_ride'] = 1
    
    # Recalculate and print new class distribution
    new_acceptance_rate = df['accepted_ride'].mean() * 100
    new_rejection_rate = 100 - new_acceptance_rate
    print(f"Adjusted class distribution: Accepted {new_acceptance_rate:.2f}% : Rejected {new_rejection_rate:.2f}%")

# Create correlation matrix and check correlations with accepted_ride
print("\nCorrelation with accepted_ride:")
numeric_df = df.select_dtypes(include=['number'])
correlations = numeric_df.corr()['accepted_ride'].sort_values(ascending=False)
print(correlations.drop('accepted_ride'))

# Calculate correlation ratios for categorical features
def correlation_ratio(categories, measurements):
    categories = pd.Categorical(categories)
    measurements = np.array(measurements)
    
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures) if len(cat_measures) > 0 else 0
    
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.square(np.subtract(y_avg_array, y_total_avg))))
    denominator = np.sum(np.square(np.subtract(measurements, y_total_avg)))
    
    if denominator == 0:
        return 0
    return np.sqrt(numerator / denominator)

# Calculate correlation ratio for categorical variables
categorical_cols = ['weather_condition', 'traffic_congestion_level', 'service_type', 
                    'vehicle_quality', 'preferred_areas']
cat_correlations = {}

for col in categorical_cols:
    cat_correlations[col] = correlation_ratio(df[col], df['accepted_ride'])

print("\nCategorical variable correlation ratios:")
for col, corr in sorted(cat_correlations.items(), key=lambda x: x[1], reverse=True):
    print(f"{col}: {corr:.4f}")

# Reorder columns for clarity
column_order = [
    'driver_id', 'timestamp', 'month', 'day', 'hour_of_day', 'day_of_week', 'is_peak_hour',
    'service_type', 'driver_experience_months', 'vehicle_quality', 'preferred_areas',
    'historical_acceptance_rate', 'distance_to_pickup_km', 'estimated_trip_distance_km', 
    'estimated_trip_time_min', 'fare_amount', 'driver_earnings',
    'weather_condition', 'traffic_congestion_level', 'hours_already_worked', 
    'acceptance_probability', 'accepted_ride'
]
df = df[column_order]

# Print sample of the data
print("\nSample data:")
print(df.head(5))
print("\nDataset summary statistics:")
print(df.describe())

# Save to CSV
df.to_csv('namma_yatri_imbalanced_dataset_stronger_correlations.csv', index=False)
print("\nDataset saved to 'namma_yatri_imbalanced_dataset_stronger_correlations.csv'")

Class distribution: Accepted 92.40% : Rejected 7.60%
Adjusted class distribution: Accepted 87.80% : Rejected 12.20%

Correlation with accepted_ride:
acceptance_probability        0.772444
driver_experience_months      0.099762
estimated_trip_distance_km    0.090162
fare_amount                   0.067320
driver_earnings               0.067320
day_of_week                   0.065728
historical_acceptance_rate    0.049123
hour_of_day                   0.026063
day                          -0.008447
estimated_trip_time_min      -0.023711
hours_already_worked         -0.041750
month                        -0.079372
distance_to_pickup_km        -0.267750
Name: accepted_ride, dtype: float64

Categorical variable correlation ratios:
weather_condition: 0.2824
service_type: 0.2513
traffic_congestion_level: 0.0789
preferred_areas: 0.0294
vehicle_quality: 0.0022

Sample data:
  driver_id            timestamp  month  day  hour_of_day  day_of_week  \
0      D355  2024-06-25 11:05:52      6   25      