In [33]:
import pandas as pd 
df = pd.read_csv("rides_data.csv")

In [34]:
df.head()

Unnamed: 0,services,date,time,ride_status,source,destination,duration,ride_id,distance,ride_charge,misc_charge,total_fare,payment_method
0,cab economy,2024-07-15,08:30:40.542646,completed,Balagere Harbor,Harohalli Nagar,39,RD3161218751875354,27.21,764.83,31.51,796.34,Amazon Pay
1,auto,2024-07-05,23:36:51.542646,completed,Basavanagudi 3rd Block,Bikasipura 1st Stage,89,RD8171514284594096,34.03,314.83,49.52,364.35,Paytm
2,auto,2024-07-23,11:05:37.542646,cancelled,Babusapalya Cove,Kothaguda Terrace,25,RD9376481122237926,20.24,,,,
3,cab economy,2024-06-24,08:45:10.542646,completed,Mahadevapura Mews,Kanakapura Arc,89,RD3676889143182765,31.17,484.73,15.84,500.57,QR scan
4,cab economy,2024-07-15,00:26:44.542646,completed,Ganganagar Cove,Basaveshwaranagar Colony,95,RD6639410275948084,27.21,663.5,14.13,677.63,Amazon Pay


In [35]:
df.columns

Index(['services', 'date', 'time', 'ride_status', 'source', 'destination',
       'duration', 'ride_id', 'distance', 'ride_charge', 'misc_charge',
       'total_fare', 'payment_method'],
      dtype='object')

####  busiest hour for each service

In [36]:

# Combine date and time for proper datetime format
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# Extract hour from datetime
df['hour'] = df['datetime'].dt.hour

# Create time range format
df['time_range'] = df['hour'].apply(lambda x: f"{x:02d}:00 - {x+1:02d}:00")

# Find the busiest time range for each service
result = df.groupby('services')['time_range'].agg(lambda x: x.mode().iloc[0]).reset_index()

print(result)

      services     time_range
0         auto  09:00 - 10:00
1         bike  13:00 - 14:00
2    bike lite  04:00 - 05:00
3  cab economy  11:00 - 12:00
4       parcel  12:00 - 13:00


In [37]:
# Most used Source-Destination pairs
source_dest_pairs = df.groupby(['source', 'destination']).size().reset_index(name='count').sort_values(by='count', ascending=False)

# Most frequent sources
most_sources = df['source'].value_counts().reset_index().rename(columns={'index': 'source', 'source': 'count'})

# Most frequent destinations
most_destinations = df['destination'].value_counts().reset_index().rename(columns={'index': 'destination', 'destination': 'count'})

print("Most Used Source-Destination Pairs:\n", source_dest_pairs)
print("\nMost Frequent Sources:\n", most_sources)
print("\nMost Frequent Destinations:\n", most_destinations)

Most Used Source-Destination Pairs:
                         source                  destination  count
18379      HSR Layout Crescent              Tavarekere View      2
24976            Jayanagar Cut            Yelahanka Landing      2
37019      Naganathapura Close  Koramangala 7th Block Vista      2
1529              Anekal Woods            Yelahanka Complex      2
6462   Basaveshwaranagar Place         Kalena Agrahara Fork      2
...                        ...                          ...    ...
16668          Ganganagar Quay          Marathahalli Estate      1
16669          Ganganagar Quay          Pai Layout Quarters      1
16670          Ganganagar Quay           Rajajinagar Harbor      1
16671         Ganganagar Ridge  Basaveshwaranagar 4th Block      1
16659      Ganganagar Quadrant         Byatarayanapura Pass      1

[49993 rows x 3 columns]

Most Frequent Sources:
                           count  count
0              Kothanur Landing     23
1             Banaswadi Landin

In [38]:
import numpy as np

df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S.%f').dt.time

# Define time ranges
time_ranges = {
    'High': [(9, 10), (11, 12), (12, 13), (13, 14)],
    'Medium': [(4, 5)],
    'Low': [(0, 4), (14, 23)]
}

# Function to assign traffic based on time ranges
def assign_traffic(time):
    hour = time.hour
    for traffic_level, ranges in time_ranges.items():
        for start, end in ranges:
            if start <= hour < end:
                return traffic_level
    return 'Low'

# Assign traffic conditions
df['traffic'] = df['time'].apply(assign_traffic)

# Adjust traffic using ride frequency
source_dest_pairs = df.groupby(['source', 'destination']).size().reset_index(name='count')
quantiles = source_dest_pairs['count'].quantile([0.33, 0.66]).values
source_dest_pairs['traffic'] = np.select(
    [
        source_dest_pairs['count'] >= quantiles[1],
        source_dest_pairs['count'] >= quantiles[0],
    ],
    ['High', 'Medium'],
    default='Low'
)

print("Most Used Source-Destination Pairs with Traffic:\n", source_dest_pairs)
print("\nRides with Assigned Traffic Conditions:\n", df[['source', 'destination', 'time', 'traffic']])

Most Used Source-Destination Pairs with Traffic:
                   source                 destination  count traffic
0      Adugodi 1st Stage          Babusapalya Estate      1    High
1      Adugodi 1st Stage  Horamavu Banaswadi Cutting      1    High
2      Adugodi 1st Stage           Indiranagar Cross      1    High
3      Adugodi 1st Stage        Kanakapura Esplanade      1    High
4      Adugodi 1st Stage              Ulsoor Complex      1    High
...                  ...                         ...    ...     ...
49988  Yeshwanthpur Zone                Hosur Alcove      1    High
49989  Yeshwanthpur Zone              Kothanur Track      1    High
49990  Yeshwanthpur Zone                 MG Road Arc      1    High
49991  Yeshwanthpur Zone      Marathahalli Boulevard      1    High
49992  Yeshwanthpur Zone              Varthur Estate      1    High

[49993 rows x 4 columns]

Rides with Assigned Traffic Conditions:
                        source               destination           

In [39]:
df.head()

Unnamed: 0,services,date,time,ride_status,source,destination,duration,ride_id,distance,ride_charge,misc_charge,total_fare,payment_method,datetime,hour,time_range,traffic
0,cab economy,2024-07-15,08:30:40.542646,completed,Balagere Harbor,Harohalli Nagar,39,RD3161218751875354,27.21,764.83,31.51,796.34,Amazon Pay,2024-07-15 08:30:40.542646,8,08:00 - 09:00,Low
1,auto,2024-07-05,23:36:51.542646,completed,Basavanagudi 3rd Block,Bikasipura 1st Stage,89,RD8171514284594096,34.03,314.83,49.52,364.35,Paytm,2024-07-05 23:36:51.542646,23,23:00 - 24:00,Low
2,auto,2024-07-23,11:05:37.542646,cancelled,Babusapalya Cove,Kothaguda Terrace,25,RD9376481122237926,20.24,,,,,2024-07-23 11:05:37.542646,11,11:00 - 12:00,High
3,cab economy,2024-06-24,08:45:10.542646,completed,Mahadevapura Mews,Kanakapura Arc,89,RD3676889143182765,31.17,484.73,15.84,500.57,QR scan,2024-06-24 08:45:10.542646,8,08:00 - 09:00,Low
4,cab economy,2024-07-15,00:26:44.542646,completed,Ganganagar Cove,Basaveshwaranagar Colony,95,RD6639410275948084,27.21,663.5,14.13,677.63,Amazon Pay,2024-07-15 00:26:44.542646,0,00:00 - 01:00,Low


#### adding weather column

In [40]:
# Function to assign weather based on month
def assign_weather(date):
    month = pd.to_datetime(date).month
    if month in [3, 4, 5]:  # Summer
        return np.random.choice(['Clear', 'High Heat', 'Windy'], p=[0.6, 0.3, 0.1])
    elif month in [6, 7, 8, 9]:  # Monsoon
        return np.random.choice(['Rain', 'Heavy Rain', 'Cloudy'], p=[0.5, 0.3, 0.2])
    elif month in [10, 11]:  # Post-Monsoon
        return np.random.choice(['Cloudy', 'Rain', 'Clear'], p=[0.5, 0.3, 0.2])
    elif month in [12, 1, 2]:  # Winter
        return np.random.choice(['Clear', 'Cold', 'Fog'], p=[0.5, 0.3, 0.2])

# Assign weather based on date
df['weather'] = df['date'].apply(assign_weather)
df.head()

Unnamed: 0,services,date,time,ride_status,source,destination,duration,ride_id,distance,ride_charge,misc_charge,total_fare,payment_method,datetime,hour,time_range,traffic,weather
0,cab economy,2024-07-15,08:30:40.542646,completed,Balagere Harbor,Harohalli Nagar,39,RD3161218751875354,27.21,764.83,31.51,796.34,Amazon Pay,2024-07-15 08:30:40.542646,8,08:00 - 09:00,Low,Rain
1,auto,2024-07-05,23:36:51.542646,completed,Basavanagudi 3rd Block,Bikasipura 1st Stage,89,RD8171514284594096,34.03,314.83,49.52,364.35,Paytm,2024-07-05 23:36:51.542646,23,23:00 - 24:00,Low,Heavy Rain
2,auto,2024-07-23,11:05:37.542646,cancelled,Babusapalya Cove,Kothaguda Terrace,25,RD9376481122237926,20.24,,,,,2024-07-23 11:05:37.542646,11,11:00 - 12:00,High,Rain
3,cab economy,2024-06-24,08:45:10.542646,completed,Mahadevapura Mews,Kanakapura Arc,89,RD3676889143182765,31.17,484.73,15.84,500.57,QR scan,2024-06-24 08:45:10.542646,8,08:00 - 09:00,Low,Rain
4,cab economy,2024-07-15,00:26:44.542646,completed,Ganganagar Cove,Basaveshwaranagar Colony,95,RD6639410275948084,27.21,663.5,14.13,677.63,Amazon Pay,2024-07-15 00:26:44.542646,0,00:00 - 01:00,Low,Rain


#### Sample Code for Creating peak_score

In [41]:
def assign_peak_score(row):
    score = 0

    # Time-based scoring
    if row['time_range'] in ['09:00 - 10:00', '11:00 - 12:00', '12:00 - 13:00', '13:00 - 14:00']:
        score += 40
    elif row['time_range'] in ['04:00 - 05:00']:
        score += 30
    else:
        score += 10

    # Traffic-based scoring
    if row['traffic'] == 'High':
        score += 30
    elif row['traffic'] == 'Medium':
        score += 20
    else:
        score += 10

    # Weather-based scoring (example)
    if row['weather'] in ['Rainy', 'Storm']:
        score += 20
    elif row['weather'] == 'Cloudy':
        score += 10

    # Place-based scoring (if some areas are naturally busier)
    if row['source'] in ['Central', 'Market', 'Station']:
        score += 10

    return score

# Add Peak Score Column
df['peak_score'] = df.apply(assign_peak_score, axis=1)


In [42]:
unique_values = df['source'].unique()
unique_values

array(['Balagere Harbor', 'Basavanagudi 3rd Block', 'Babusapalya Cove',
       ..., 'Ashok Nagar 1st Stage', 'Jakkur Cross', 'Whitefield Grove'],
      shape=(12982,), dtype=object)

In [43]:
pd.DataFrame(df['source'].unique(), columns=['source']).to_csv('unique_source_values.csv', index=False)

#### Scaling the peak_score for Consistency

In [44]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 100))
df['peak_score'] = scaler.fit_transform(df[['peak_score']])

In [45]:
df.head()

Unnamed: 0,services,date,time,ride_status,source,destination,duration,ride_id,distance,ride_charge,misc_charge,total_fare,payment_method,datetime,hour,time_range,traffic,weather,peak_score
0,cab economy,2024-07-15,08:30:40.542646,completed,Balagere Harbor,Harohalli Nagar,39,RD3161218751875354,27.21,764.83,31.51,796.34,Amazon Pay,2024-07-15 08:30:40.542646,8,08:00 - 09:00,Low,Rain,0.0
1,auto,2024-07-05,23:36:51.542646,completed,Basavanagudi 3rd Block,Bikasipura 1st Stage,89,RD8171514284594096,34.03,314.83,49.52,364.35,Paytm,2024-07-05 23:36:51.542646,23,23:00 - 24:00,Low,Heavy Rain,0.0
2,auto,2024-07-23,11:05:37.542646,cancelled,Babusapalya Cove,Kothaguda Terrace,25,RD9376481122237926,20.24,,,,,2024-07-23 11:05:37.542646,11,11:00 - 12:00,High,Rain,83.333333
3,cab economy,2024-06-24,08:45:10.542646,completed,Mahadevapura Mews,Kanakapura Arc,89,RD3676889143182765,31.17,484.73,15.84,500.57,QR scan,2024-06-24 08:45:10.542646,8,08:00 - 09:00,Low,Rain,0.0
4,cab economy,2024-07-15,00:26:44.542646,completed,Ganganagar Cove,Basaveshwaranagar Colony,95,RD6639410275948084,27.21,663.5,14.13,677.63,Amazon Pay,2024-07-15 00:26:44.542646,0,00:00 - 01:00,Low,Rain,0.0


#### basic preprocessing

In [46]:
df.columns

Index(['services', 'date', 'time', 'ride_status', 'source', 'destination',
       'duration', 'ride_id', 'distance', 'ride_charge', 'misc_charge',
       'total_fare', 'payment_method', 'datetime', 'hour', 'time_range',
       'traffic', 'weather', 'peak_score'],
      dtype='object')

In [47]:
df['datetime']

0       2024-07-15 08:30:40.542646
1       2024-07-05 23:36:51.542646
2       2024-07-23 11:05:37.542646
3       2024-06-24 08:45:10.542646
4       2024-07-15 00:26:44.542646
                   ...            
49995   2024-07-24 10:12:37.811393
49996   2024-08-12 11:24:08.811393
49997   2024-08-10 21:52:48.811393
49998   2024-07-07 20:44:00.811393
49999   2024-08-07 08:53:25.811393
Name: datetime, Length: 50000, dtype: datetime64[ns]

In [48]:
# Fill numerical NaN values with median
numeric_cols = ['ride_charge', 'misc_charge', 'total_fare']
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical NaN values with 'Unknown'
df['payment_method'].fillna('Unknown', inplace=True)

# Step 4: Drop rows with missing critical data
df.dropna(subset=['duration', 'distance'], inplace=True)

# Step 5: Reset index
df.reset_index(drop=True, inplace=True)

print(df.info())  # Confirm changes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   services        50000 non-null  object        
 1   date            50000 non-null  object        
 2   time            50000 non-null  object        
 3   ride_status     50000 non-null  object        
 4   source          50000 non-null  object        
 5   destination     50000 non-null  object        
 6   duration        50000 non-null  int64         
 7   ride_id         50000 non-null  object        
 8   distance        50000 non-null  float64       
 9   ride_charge     50000 non-null  float64       
 10  misc_charge     50000 non-null  float64       
 11  total_fare      50000 non-null  float64       
 12  payment_method  50000 non-null  object        
 13  datetime        50000 non-null  datetime64[ns]
 14  hour            50000 non-null  int32         
 15  ti

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [49]:

coordinates_data = pd.read_csv('unique_source_values_with_coordinates.csv')
df = df.merge(coordinates_data, how='left', left_on='source', right_on='source')


In [22]:
df.head()

Unnamed: 0,services,date,time,ride_status,source,destination,duration,ride_id,distance,ride_charge,...,total_fare,payment_method,datetime,hour,time_range,traffic,weather,peak_score,latitude,longitude
0,cab economy,2024-07-15,08:30:40.542646,completed,Balagere Harbor,Harohalli Nagar,39,RD3161218751875354,27.21,764.83,...,796.34,Amazon Pay,2024-07-15 08:30:40.542646,8,08:00 - 09:00,Low,Cloudy,16.666667,12.912362,77.527978
1,auto,2024-07-05,23:36:51.542646,completed,Basavanagudi 3rd Block,Bikasipura 1st Stage,89,RD8171514284594096,34.03,314.83,...,364.35,Paytm,2024-07-05 23:36:51.542646,23,23:00 - 24:00,Low,Rain,0.0,13.085214,77.776266
2,auto,2024-07-23,11:05:37.542646,cancelled,Babusapalya Cove,Kothaguda Terrace,25,RD9376481122237926,20.24,520.03,...,544.92,Unknown,2024-07-23 11:05:37.542646,11,11:00 - 12:00,High,Cloudy,100.0,13.019598,77.603188
3,cab economy,2024-06-24,08:45:10.542646,completed,Mahadevapura Mews,Kanakapura Arc,89,RD3676889143182765,31.17,484.73,...,500.57,QR scan,2024-06-24 08:45:10.542646,8,08:00 - 09:00,Low,Heavy Rain,0.0,12.979598,77.77244
4,cab economy,2024-07-15,00:26:44.542646,completed,Ganganagar Cove,Basaveshwaranagar Colony,95,RD6639410275948084,27.21,663.5,...,677.63,Amazon Pay,2024-07-15 00:26:44.542646,0,00:00 - 01:00,Low,Heavy Rain,0.0,12.846806,77.783463


#### save the updated data

In [50]:
df.to_csv('processed_rides_data.csv', index=False)

In [51]:
df.columns

Index(['services', 'date', 'time', 'ride_status', 'source', 'destination',
       'duration', 'ride_id', 'distance', 'ride_charge', 'misc_charge',
       'total_fare', 'payment_method', 'datetime', 'hour', 'time_range',
       'traffic', 'weather', 'peak_score', 'latitude', 'longitude'],
      dtype='object')

#### Training the ML model(RandomForest+Prophet)

In [61]:
import pandas as pd
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load data
df = pd.read_csv('processed_rides_data.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour
df = df[['datetime', 'date', 'hour', 'latitude', 'longitude', 'traffic', 'weather', 'peak_score']].dropna()

# Encode categorical features
df = pd.get_dummies(df, columns=['weather'], drop_first=True)
df = pd.get_dummies(df, columns=['traffic'], drop_first=True)

# Prophet Model
prophet_df = df[['datetime', 'peak_score']].rename(columns={'datetime': 'ds', 'peak_score': 'y'})
prophet_model = Prophet()
prophet_model.fit(prophet_df)
future = prophet_model.make_future_dataframe(periods=30, freq='H')
prophet_forecast = prophet_model.predict(future)
prophet_peak_values = prophet_forecast[['ds', 'yhat']].rename(columns={'yhat': 'prophet_peak_score'})

# Random Forest Model
X = df.drop(['datetime', 'date', 'peak_score'], axis=1)
y = df['peak_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_peak_values = rf_model.predict(X_test)

# Combined Results
results = X_test.copy()
results['rf_peak_score'] = rf_peak_values
results['datetime'] = df['datetime'].iloc[X_test.index]

# Merging Prophet Forecast
df_final = results.merge(prophet_peak_values, left_on='datetime', right_on='ds', how='left').drop(['ds'], axis=1)

# Hybrid Prediction (Averaging Strategy)
df_final['final_peak_score'] = (df_final['rf_peak_score'] + df_final['prophet_peak_score']) / 2

# Final Output - CSV File
df_final[['latitude', 'longitude', 'final_peak_score']].to_csv('predicted_peak_scores.csv', index=False)

print("Predicted peak scores saved to 'predicted_peak_scores.csv'")


03:37:00 - cmdstanpy - INFO - Chain [1] start processing
03:37:01 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(


Predicted peak scores saved to 'predicted_peak_scores.csv'


In [62]:
# Accuracy Metrics
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, mean_squared_error

def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{model_name} Performance:")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.2f}")
    print(f"MSE: {mean_squared_error(y_true, y_pred):.2f}")
    print(f"RMSE: {root_mean_squared_error(y_true, y_pred,):.2f}")
    print(f"R² Score: {r2_score(y_true, y_pred):.2f}")

evaluate_model(y_test, rf_peak_values, "Random Forest")
evaluate_model(y_test, prophet_peak_values['prophet_peak_score'][:len(y_test)].values, "Prophet")
evaluate_model(y_test, df_final['final_peak_score'], "Hybrid Model")


Random Forest Performance:
MAE: 0.00
MSE: 0.00
RMSE: 0.00
R² Score: 1.00

Prophet Performance:
MAE: 28.58
MSE: 1683.00
RMSE: 41.02
R² Score: -0.58

Hybrid Model Performance:
MAE: 7.16
MSE: 116.53
RMSE: 10.80
R² Score: 0.89


#### Heatmap from RandomForest and Prophet

In [77]:
import folium
from folium.plugins import HeatMap

data = pd.read_csv("RForest + Prophet.csv")
# Create a map centered in India
m = folium.Map(location=[20.5937, 78.9629], zoom_start=5)

# Prepare heatmap data
heat_data = data[['latitude', 'longitude', 'final_peak_score']].values.tolist()

# Add heatmap with corrected gradient dictionary (keys should be strings)
gradient = {
    "0.2": "blue", "0.4": "lime", "0.6": "yellow", "0.8": "orange", "1": "red"
}
HeatMap(heat_data, gradient=gradient).add_to(m)

# Save map to an HTML file
m.save("india_heatmap.html")

print("Heatmap created! Open 'india_heatmap.html' to view it.")

Heatmap created! Open 'india_heatmap.html' to view it.


#### another try

In [2]:
import pandas as pd 
df = pd.read_csv("ridesharing_data.csv")


In [3]:
df['zone_id'].unique()

array([5, 1, 3, 4, 2])

In [1]:
import pandas as pd 
df = pd.read_csv("/home/zeal/my cute Hackathons/Great Bangalore Hackathon-March 2025/prediction/updated_data.csv")


# Select the relevant columns
selected_columns = df[['hour_of_day', 'supply_demand_ratio']]

# Save the selected columns to a new CSV file
selected_columns.to_csv('selected_data.csv', index=False)

print("New CSV file 'selected_data.csv' created successfully.")

New CSV file 'selected_data.csv' created successfully.
