## Exploratory data analysis

In [1]:
import pandas as pd
import numpy as np

# Load the cleaned station table and trip table; both paths are relative to
# the directory the notebook is run from.
all_stations = pd.read_csv(filepath_or_buffer='cleaned_datasets/all_stations.csv')
all_trips = pd.read_csv(filepath_or_buffer='cleaned_datasets/all_trips.csv')

In [None]:
import matplotlib.pyplot as plt

# Parse 'started_at' as datetimes (a no-op if the column is already datetime64)
all_trips['started_at'] = pd.to_datetime(all_trips['started_at'])

# One figure with two side-by-side panels; explicit axes avoid relying on
# pyplot's "current axes" state.
fig, (ax_daily, ax_hourly) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: number of rides per calendar day, in chronological order
rides_per_day = all_trips['started_at'].dt.date.value_counts().sort_index()
rides_per_day.plot(kind='line', color='blue', ax=ax_daily)
ax_daily.set_title('Number of Rides Over Time')
ax_daily.set_xlabel('Date')
ax_daily.set_ylabel('Number of Rides')
ax_daily.tick_params(axis='x', labelrotation=45)  # Rotate date labels for readability

# Right panel: number of rides per hour of the day
rides_per_hour = all_trips['started_at'].dt.hour.value_counts().sort_index()
rides_per_hour.plot(kind='bar', color='orange', ax=ax_hourly)
ax_hourly.set_title('Number of Rides by Hour of the Day')
ax_hourly.set_xlabel('Hour of the Day')
ax_hourly.set_ylabel('Number of Rides')

# Adjust the layout BEFORE saving so the file on disk matches what is shown
fig.tight_layout()

# Save the plot as an image file
fig.savefig('./outputs/rides_over_time.png', format='png', dpi=300, bbox_inches='tight')

# Show the plots
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Parse 'started_at' as datetimes (a no-op if the column is already datetime64)
all_trips['started_at'] = pd.to_datetime(all_trips['started_at'])

# Name of the day each ride started on (e.g. 'Monday')
all_trips['day_of_week'] = all_trips['started_at'].dt.day_name()


def categorize_day(day):
    """Classify a day name as 'Weekend' or 'Weekday'.

    Parameters
    ----------
    day : str
        Day name as produced by ``Series.dt.day_name()``.

    Returns
    -------
    str
        'Weekend' for 'Saturday' or 'Sunday'; 'Weekday' for any other value
        (including missing values).
    """
    return 'Weekend' if day in ('Saturday', 'Sunday') else 'Weekday'


# Label every ride as a weekday or weekend ride
all_trips['day_type'] = all_trips['day_of_week'].apply(categorize_day)

# Count rides per category. value_counts() orders by frequency, so pin the
# order explicitly: the bar positions and colours below then always mean
# Weekday = blue, Weekend = orange, whichever category is larger.
rides_by_day_type = (
    all_trips['day_type']
    .value_counts()
    .reindex(['Weekday', 'Weekend'], fill_value=0)
)

# Plotting the results
fig, ax = plt.subplots(figsize=(8, 5))
rides_by_day_type.plot(kind='bar', color=['blue', 'orange'], ax=ax)
ax.set_title('Number of Rides on Weekdays vs Weekends')
ax.set_xlabel('Day Type')
ax.set_ylabel('Number of Rides')
ax.tick_params(axis='x', labelrotation=0)  # Keep the category labels horizontal

# Adjust the layout BEFORE saving so the file on disk matches what is shown
fig.tight_layout()

# Save the plot as an image file
fig.savefig('./outputs/rides_weekdays_vs_weekends.png', format='png', dpi=300)

# Show the plot
plt.show()


In [4]:
# Remove the temporary weekday columns. errors='ignore' keeps the cell safe to
# re-run: without it a second execution raises KeyError because the columns
# are already gone.
all_trips = all_trips.drop(columns=['day_of_week', 'day_type'], errors='ignore')

In [None]:
import matplotlib.pyplot as plt

# Rides per station, sorted from most to least used (value_counts default order)
start_station_counts = all_trips['start_station_id'].value_counts()
end_station_counts = all_trips['end_station_id'].value_counts()

# Create subplots: 1 row, 2 columns
fig, (ax_start, ax_end) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: the 10 most used start stations
start_station_counts.head(10).plot(kind='bar', ax=ax_start)
ax_start.set_title('Top 10 Start Stations')
ax_start.set_xlabel('Station ID')
ax_start.set_ylabel('Number of Rides')

# Right panel: the 10 most used end stations
end_station_counts.head(10).plot(kind='bar', ax=ax_end)
ax_end.set_title('Top 10 End Stations')
ax_end.set_xlabel('Station ID')
ax_end.set_ylabel('Number of Rides')

# Adjust the layout BEFORE saving so the file on disk matches what is shown
fig.tight_layout()

# Save the plot as an image file
fig.savefig('./outputs/top10_stations.png', format='png', dpi=300, bbox_inches='tight')

# Show the plots
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Rides per station, sorted from most to least used (value_counts default order)
start_station_counts = all_trips['start_station_id'].value_counts()
end_station_counts = all_trips['end_station_id'].value_counts()

# Create subplots: 1 row, 2 columns
fig, (ax_start, ax_end) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: the 10 least used start stations (tail of the descending counts)
start_station_counts.tail(10).plot(kind='bar', ax=ax_start)
ax_start.set_title('Bottom 10 Start Stations')
ax_start.set_xlabel('Station ID')
ax_start.set_ylabel('Number of Rides')

# Right panel: the 10 least used end stations
end_station_counts.tail(10).plot(kind='bar', ax=ax_end)
ax_end.set_title('Bottom 10 End Stations')
ax_end.set_xlabel('Station ID')
ax_end.set_ylabel('Number of Rides')

# Adjust the layout BEFORE saving so the file on disk matches what is shown
fig.tight_layout()

# Save the plot as an image file
fig.savefig('./outputs/bottom10_stations.png', format='png', dpi=300, bbox_inches='tight')

# Show the plots
plt.show()


In [None]:
# Requires start_station_counts from an earlier cell.

# Station IDs in the same order as their ride counts
station_ids = start_station_counts.index

# Lookup table: station_id -> station_name
station_name_mapping = dict(
    zip(all_stations['station_id'], all_stations['station_name'])
)

# Station name for each ID (NaN where the ID has no entry in all_stations)
mapped_station_names = station_ids.map(station_name_mapping)

# One row per start station: ID, name and number of rides
result = pd.DataFrame(
    {
        'station_id': station_ids,
        'station_name': mapped_station_names,
        'ride_count': start_station_counts.values,
    }
)

# Busiest stations first
sorted_result = result.sort_values('ride_count', ascending=False)

# Keep only the two extremes: the 10 busiest and the 10 quietest stations
top_stations, bottom_stations = sorted_result.head(10), sorted_result.tail(10)

# Stack both slices into a single table
combined_result = pd.concat([top_stations, bottom_stations])

print(combined_result)


In [None]:
# Requires end_station_counts from an earlier cell.
# This cell reports END stations; every lookup below therefore uses
# end_station_counts (it previously reused start_station_counts and just
# reproduced the start-station table).

# Station IDs in the same order as their ride counts
station_ids = end_station_counts.index

# Create a mapping of station_id to station_name
station_name_mapping = dict(zip(all_stations['station_id'], all_stations['station_name']))

# Station name for each ID (NaN where the ID has no entry in all_stations)
mapped_station_names = station_ids.map(station_name_mapping)

# Create a DataFrame with station IDs, names, and ride counts
result = pd.DataFrame({
    'station_id': station_ids,
    'station_name': mapped_station_names,
    'ride_count': end_station_counts.values
})

# Sort the result by ride_count in descending order
sorted_result = result.sort_values(by='ride_count', ascending=False)

# Get the top 10 and bottom 10 stations
top_stations = sorted_result.head(10)
bottom_stations = sorted_result.tail(10)

# Combine the top and bottom stations into a single DataFrame
combined_result = pd.concat([top_stations, bottom_stations])

print(combined_result)


In [None]:
# Preview the first five rows of the trips table
all_trips.head(5)

In [None]:
import seaborn as sns

# Replace the string ride IDs with integer codes (one code per distinct ID,
# numbered in order of first appearance).
all_trips['ride_id'] = pd.factorize(all_trips['ride_id'])[0]

# Fold the codes into the range [0, 10**8).
# NOTE(review): in CPython hash() of a small non-negative int is the int
# itself, so this is not a hash of the original string ID — confirm intent.
all_trips['ride_id'] = all_trips['ride_id'].apply(lambda x: hash(x) % 10**8)

# Convert the 'started_at' and 'ended_at' columns to datetime
all_trips['started_at'] = pd.to_datetime(all_trips['started_at'])
all_trips['ended_at'] = pd.to_datetime(all_trips['ended_at'])

# Weekend flag: 1 for Saturday/Sunday (dayofweek 5 or 6), 0 otherwise
all_trips['day_type'] = (all_trips['started_at'].dt.dayofweek >= 5).astype(int)

# Correlate only numeric/boolean columns. The frame also holds datetime
# columns, and from pandas 2.0 DataFrame.corr() no longer drops non-numeric
# columns silently but raises on them; selecting the dtypes explicitly gives
# the same matrix on old and new pandas.
numeric_trips = all_trips.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_trips.corr()

fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()



In [None]:
# Keep only rides with a positive distance (distance 0 means the ride started
# and ended at the same station). .copy() makes filtered_trips an independent
# frame, so adding a column below does not trigger SettingWithCopyWarning.
filtered_trips = all_trips.loc[all_trips['distance'] > 0].copy()

# Convert ride duration to minutes
# (assumes 'ride_duration' is stored in seconds — TODO confirm)
filtered_trips['ride_duration_minutes'] = filtered_trips['ride_duration'] / 60

# Create scatter plot with filtered data
fig, ax = plt.subplots()
ax.scatter(filtered_trips['distance'], filtered_trips['ride_duration_minutes'], alpha=0.5)
ax.set_title('Distance vs. Ride Duration (in Minutes)')
ax.set_xlabel('Distance (Kilometers)')
ax.set_ylabel('Ride Duration (minutes)')

# Save the plot as an image file
fig.savefig('./outputs/distance_vs_duration.png', format='png', dpi=300)

plt.show()
