In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde

# Read the CSV file
df = pd.read_csv('../datasets/all_trips.csv')
df.head()

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,ride_duration,distance
0,322BD23D287743ED,2020-08-20 18:08:14,2020-08-20 18:17:51,329.0,141.0,9.616667,1.884479
1,2A3AEF1AB9054D8B,2020-08-27 18:46:04,2020-08-27 19:54:51,168.0,168.0,68.783333,0.0
2,67DC1D133E8B5816,2020-08-26 19:44:14,2020-08-26 21:53:07,195.0,44.0,128.883333,0.679841
3,C79FBBD412E578A7,2020-08-27 12:05:41,2020-08-27 12:53:45,81.0,47.0,48.066667,0.573236
4,13814D3D661ECADB,2020-08-27 16:49:02,2020-08-27 16:59:49,658.0,658.0,10.783333,0.0


In [16]:
# Calculate Probability Distributions

# Calculate the probability of each starting station
start_station_probs = df['start_station_id'].value_counts(normalize=True)

# Calculate the conditional probabilities of destinations given the starting station
end_station_probs = df.groupby('start_station_id')['end_station_id'].value_counts(normalize=True)


In [17]:
# Sample Starting and Ending Stations

# Sample a starting station
sampled_start_station = np.random.choice(start_station_probs.index, p=start_station_probs.values)

# Sample an ending station based on the sampled starting station
sampled_end_station_probs = end_station_probs[sampled_start_station]
sampled_end_station = np.random.choice(sampled_end_station_probs.index, p=sampled_end_station_probs.values)

In [None]:
# Convert 'started_at' to datetime
df['started_at'] = pd.to_datetime(df['started_at'])
# Extract the date and hour from 'started_at'
df['date'] = df['started_at'].dt.date
# Extract the date and 15-minute intervals
df['15_min_interval'] = df['started_at'].dt.floor('15min')
# Define the start of the day
start_of_day = pd.to_datetime("2020-08-20 00:00:00")

In [19]:
# Group by date and 15-minute intervals, and count the number of departures
departures_per_interval = df.groupby(['date', '15_min_interval']).size().reset_index(name='departures')

# Extract only the time part for 15-minute intervals
departures_per_interval['time'] = departures_per_interval['15_min_interval'].dt.time

# Group by time (15-minute intervals) and compute the average departures across all days
avg_departures_per_15min = departures_per_interval.groupby('time')['departures'].mean()

# Fit a Kernel Density Estimate (KDE) to the average departures
kde = gaussian_kde(avg_departures_per_15min)

In [20]:

# Calculate conditional distributions of durations by start and end station pairs
duration_distributions = df.groupby(['start_station_id', 'end_station_id'])['ride_duration']

# Fit KDEs for each start-end pair
duration_kdes = {}
for (start, end), durations in duration_distributions:
    duration_kdes[(start, end)] = gaussian_kde(durations)

# After predicting start and end stations
predicted_pair = (sampled_start_station, sampled_end_station)

if predicted_pair in duration_kdes:
    ride_duration = duration_kdes[predicted_pair].resample(1)[0][0]  # Sample duration in minutes
else:
    # Fallback for unseen pairs (e.g., use global mean or a random value)
    ride_duration = df['ride_duration'].mean()

# Ensure non-negative duration
ride_duration = max(ride_duration, 0)


ValueError: `dataset` input should have multiple elements.

In [None]:
# Calculate the end time of the ride
ride_duration_timedelta = pd.Timedelta(minutes=ride_duration)
end_ride_time = next_ride_time + ride_duration_timedelta

# Print the full prediction
print(f"Predicted next ride will occur at: {next_ride_time}")
print(f"Starting station: {sampled_start_station}")
print(f"Ending station: {sampled_end_station}")
print(f"Predicted ride duration: {ride_duration:.2f} minutes")
print(f"Predicted ride will end at: {end_ride_time}")