In [None]:
# Import the required packages
import pandas as pd
import numpy as np
import seaborn as sns
import folium
from folium.plugins import HeatMap
import matplotlib.pyplot as plt

In [None]:
# Bounding box used to pick out San Jose records (decimal degrees)
lat_min, lat_max = 37.197467, 37.501078
long_min, long_max = -122.104839, -121.702683

In [None]:
# Load the raw records from data.csv and preview the first rows.
main_df = pd.read_csv("data.csv")
main_df.head()

In [None]:
# Column overview of the raw frame.
main_df.info()

In [None]:
# Keep only the records whose start station lies inside the San Jose bounding box.
# All four bounds are applied to the start station: the upper longitude bound
# previously tested end_station_longitude, which mixed start and end criteria.
start_lat = main_df['start_station_latitude']
start_lon = main_df['start_station_longitude']
in_san_jose = (
    (start_lat > lat_min) & (start_lat < lat_max)
    & (start_lon > long_min) & (start_lon < long_max)
)

# .copy() detaches the result from main_df so the following cells can drop and
# add columns on it without pandas' SettingWithCopyWarning.
sanjose_df_copy = main_df[in_san_jose].copy()
sanjose_df_copy.info()

In [None]:
# Share of missing values per column, in percent
row_total = len(sanjose_df_copy)
sanjose_df_copy.isnull().sum().div(row_total).mul(100)

In [None]:
# Drop start_station_id and end_station_id; they are not needed for the analysis.
# NOTE(review): the earlier note also listed ride_id, but it was never part of
# the drop list — confirm whether it should be removed as well.
# The frame is rebound instead of using inplace=True so that a filtered slice of
# main_df is not mutated, and errors='ignore' keeps the cell safe to run again
# after the columns are already gone.
sanjose_df_copy = sanjose_df_copy.drop(
    columns=['start_station_id', 'end_station_id'],
    errors='ignore',
)

In [None]:
# Percentage of missing values per column after dropping the id columns
sanjose_df_copy.isnull().sum().div(len(sanjose_df_copy)) * 100

In [None]:
# Column overview after the id columns were dropped.
sanjose_df_copy.info()

In [None]:
# Several useful columns can be derived from start_time and end_time,
# so both are first converted to datetime values.
timestamp_format = '%Y-%m-%d %H:%M:%S'
for time_col in ('start_time', 'end_time'):
    sanjose_df_copy[time_col] = pd.to_datetime(sanjose_df_copy[time_col], format=timestamp_format)

In [None]:
# Column overview after the datetime conversion.
sanjose_df_copy.info()

In [None]:
# Ride duration in minutes: time elapsed between start and end, as seconds / 60
elapsed = sanjose_df_copy['end_time'] - sanjose_df_copy['start_time']
sanjose_df_copy['ride_duration'] = elapsed.dt.total_seconds() / 60

In [None]:
# Calendar features, all derived from each ride's start_time
start_ts = sanjose_df_copy['start_time']

# Lookup tables between weekday numbers (Monday = 0 ... Sunday = 6) and names
week_days = dict(enumerate(
    ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
))
week_days_num = {day_name: day_num for day_num, day_name in week_days.items()}

# Weekday name of the ride start
sanjose_df_copy['day_of_week'] = start_ts.dt.dayofweek.map(week_days)

# Day of the month, hour, month and year of the ride start
sanjose_df_copy['day_of_month'] = start_ts.dt.day
sanjose_df_copy['start_hour'] = start_ts.dt.hour
sanjose_df_copy['month'] = start_ts.dt.month
sanjose_df_copy['year'] = start_ts.dt.year

# Weekday number recovered from the weekday name
sanjose_df_copy['day_of_week_num'] = sanjose_df_copy['day_of_week'].map(week_days_num)

In [None]:
# Column overview after adding the duration and calendar columns.
sanjose_df_copy.info()

In [None]:
# Number of records per year.
sanjose_df_copy['year'].value_counts()

In [None]:
# Rides shorter than a minute could be accidental and are not relevant to the
# analysis, so only rides of at least one minute are kept.
at_least_one_minute = sanjose_df_copy['ride_duration'].ge(1)
sanjose_df_copy = sanjose_df_copy.loc[at_least_one_minute]
sanjose_df_copy.info()

In [None]:
# Remove records that start and end at the same station; they are of little
# use for the rest of the analysis.
# Tolerance for treating two coordinates as the same, in degrees
tolerance = 0.0001

# Absolute start/end gaps along each axis
lat_gap = (sanjose_df_copy['start_station_latitude'] - sanjose_df_copy['end_station_latitude']).abs()
lon_gap = (sanjose_df_copy['start_station_longitude'] - sanjose_df_copy['end_station_longitude']).abs()

# True where start and end coordinates are effectively identical
mask = (lat_gap < tolerance) & (lon_gap < tolerance)

# Keep the rows where the coordinates differ. .copy() makes the result an
# independent frame: later cells assign new columns to it, which would
# otherwise trigger SettingWithCopyWarning on a slice.
sanjose_df_copy_filtered = sanjose_df_copy[~mask].copy()

In [None]:
# Column overview after removing same-station records.
sanjose_df_copy_filtered.info()

In [None]:
# Great-circle distance helper
def haversine(lon1, lat1, lon2, lat2):
    """
    Return the great-circle distance, in kilometers, between two points
    on the earth given as decimal-degree longitudes and latitudes.
    """
    earth_radius_km = 6371

    # Work in radians
    lam1, phi1, lam2, phi2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))

    # Haversine formula
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_km

# Distance in km between the start and end station of every record.
# haversine is built from numpy functions, so the four coordinate columns are
# passed in whole rather than calling it once per row through DataFrame.apply.
sanjose_df_copy_filtered['haversine_distance_km'] = haversine(
    sanjose_df_copy_filtered['start_station_longitude'],
    sanjose_df_copy_filtered['start_station_latitude'],
    sanjose_df_copy_filtered['end_station_longitude'],
    sanjose_df_copy_filtered['end_station_latitude'],
)

In [None]:
# Summary statistics of the filtered frame.
sanjose_df_copy_filtered.describe()

In [None]:
# Speed of each ride: km per minute, scaled by 60 to km per hour
distance_km = sanjose_df_copy_filtered['haversine_distance_km']
duration_min = sanjose_df_copy_filtered['ride_duration']
sanjose_df_copy_filtered['ride_speed'] = distance_km.div(duration_min).mul(60)

In [None]:
# Observation from the statistics above: the max ride duration is 1499.933 mins, which is more
# than 24 hours and therefore an outlier, and the max speed is 56kmph, which is highly unlikely
# in an urban area.
# Remove 24+ hour rides and speeds over 30 KMPH
plausible_ride = (
    (sanjose_df_copy_filtered['ride_duration'] <= 1440)
    & (sanjose_df_copy_filtered['ride_speed'] <= 30)
)

# reset_index returns a new frame, so its result has to be assigned for the
# index to actually be renumbered.
sanjose_df_copy_filtered = sanjose_df_copy_filtered[plausible_ride].reset_index(drop=True)

# Preview instead of displaying the whole frame
sanjose_df_copy_filtered.head()

# EDA

In [None]:
# Rideable type distribution

# Lyft-inspired color palette (reused by the later plots)
lyft_palette = ["#FF00BF", "#AA00FF", "#FF66FF", "#6600FF", "#CC00FF"]

# Number of records per rideable type
rideable_counts = sanjose_df_copy_filtered['rideable_type'].value_counts()

# Plot styling
sns.set_style("whitegrid")
palette = sns.color_palette("bright")

# Bar chart of the counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=rideable_counts.index, y=rideable_counts.values, palette=lyft_palette, ax=ax)

# Title and axis labels
ax.set_title('Distribution of Rideable Types', fontsize=16)
ax.set_xlabel('Rideable Type', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

plt.show()

In [None]:
# Number of records per rideable type.
sanjose_df_copy_filtered['rideable_type'].value_counts()

In [None]:
# docked_bike is only a very small fraction of the whole distribution, so
# those records are dropped.
# The name is rebound to a filtered copy; .copy() matters because later cells
# assign new columns to this frame, which would otherwise raise
# SettingWithCopyWarning on a slice.
not_docked = sanjose_df_copy_filtered['rideable_type'] != 'docked_bike'
sanjose_df_copy_filtered = sanjose_df_copy_filtered[not_docked].copy()


In [None]:
# User type distribution
# Number of records per user type
user_type_counts = sanjose_df_copy_filtered['member_casual'].value_counts()

# Bar chart of the counts
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=user_type_counts.index, y=user_type_counts.values, palette=lyft_palette, ax=ax)
ax.set_title('Distribution of User Types', fontsize=16)
ax.set_xlabel('User Type', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.show()

In [None]:
# Rideable types, split by user type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    x='rideable_type',
    hue='member_casual',
    data=sanjose_df_copy_filtered,
    palette=lyft_palette,
    ax=ax,
)
ax.set_title('Distribution of Rideable Types by User Type', fontsize=16)
ax.set_xlabel('Rideable Type', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.legend(title='User Type')
plt.show()

In [None]:
# Count plot of rides per start hour, split by user type.
# countplot places one bar group per category at positions 0, 1, 2, ...;
# order=hours pins all 24 hours to those positions so the "H:00" tick labels
# set below always sit under the matching hour, even if an hour has no rides.
hours = list(range(24))

plt.figure(figsize=(14, 7))
sns.countplot(
    x='start_hour',
    hue='member_casual',
    data=sanjose_df_copy_filtered,
    palette=lyft_palette,
    order=hours,
)
plt.title('Hourly Rides by Customer Type', fontsize=16)
plt.xlabel('Hour of the Day', fontsize=14)
plt.ylabel('Number of Rides', fontsize=14)
plt.legend(title='User Type', loc='upper left')
plt.xticks(ticks=hours, labels=[f"{hour}:00" for hour in hours])
plt.show()

In [None]:
# Count of rides per day of the week, split by rider type.
# order= fixes the bars to Monday..Sunday; without it the days appear in the
# order they happen to occur in the data.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

plt.figure(figsize=(15, 7))
sns.countplot(
    x='day_of_week',
    hue='member_casual',
    data=sanjose_df_copy_filtered,
    palette=lyft_palette,
    order=weekday_order,
)
plt.title('Weekly Rides by Rider Type', fontsize=16)
plt.xlabel('Day', fontsize=14)
plt.ylabel('Number of Rides', fontsize=14)
plt.legend(title='Rider Type')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count of rides per month, split by rider type
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(
    x='month',
    hue='member_casual',
    data=sanjose_df_copy_filtered,
    palette=lyft_palette,
    ax=ax,
)
ax.set_title('monthly Rides by Rider Type', fontsize=16)
ax.set_xlabel('Months', fontsize=14)
ax.set_ylabel('Number of Rides', fontsize=14)
ax.legend(title='Rider Type')
plt.xticks(rotation=45)  # rotate the x tick labels
plt.show()

In [None]:
# Count of rides per day of the month, split by rider type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    x='day_of_month',
    hue='member_casual',
    data=sanjose_df_copy_filtered,
    palette=lyft_palette,
    ax=ax,
)

# Title, axis labels and legend
ax.set_title('Daily Rides by Rider Type', fontsize=16)
ax.set_xlabel('Day of the month', fontsize=14)
ax.set_ylabel('Number of Rides', fontsize=14)
ax.legend(title='Rider Type')

plt.show()

In [None]:
# Count of rides per year, split by rider type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    x='year',
    hue='member_casual',
    data=sanjose_df_copy_filtered,
    palette=lyft_palette,
    ax=ax,
)

# Title, axis labels and legend
ax.set_title('Yearly Rides by Rider Type', fontsize=16)
ax.set_xlabel('Years', fontsize=14)
ax.set_ylabel('Number of Rides', fontsize=14)
ax.legend(title='Rider Type')

plt.show()

In [None]:
# Latitude / longitude lists for every start and end point
start_station_latitude = sanjose_df_copy_filtered.start_station_latitude.tolist()
start_station_longitude = sanjose_df_copy_filtered.start_station_longitude.tolist()
end_station_latitude = sanjose_df_copy_filtered.end_station_latitude.tolist()
end_station_longitude = sanjose_df_copy_filtered.end_station_longitude.tolist()
lat, lng = (start_station_latitude + end_station_latitude), (start_station_longitude + end_station_longitude)

# Create the map.
# The map object is deliberately NOT called `map`: that name would replace
# Python's builtin map() for the rest of the kernel session, and the haversine
# helper defined earlier calls map(), so it would fail if run again afterwards.
ride_heatmap = folium.Map(
    location=[37.335480,-121.893028],
    tiles='openstreetmap',
    zoom_start=13
)

# Heat layer over all start and end points
HeatMap(list(zip(lat, lng))).add_to(ride_heatmap)
ride_heatmap

In [None]:
# Ten most frequent start stations
start_station_counts = sanjose_df_copy_filtered['start_station_name'].value_counts()
start_station_counts.head(10)

In [None]:
# Ten most frequent end stations
end_station_counts = sanjose_df_copy_filtered['end_station_name'].value_counts()
end_station_counts.head(10)

In [None]:
# Most popular routes (start station -> end station), drawn on a map.

# Keep records where BOTH station names are present and the two names differ.
# (Earlier the end-station check was applied twice, the start station was never
# checked, and the last step rebuilt the frame from the unfiltered data.)
has_both_names = (
    sanjose_df_copy_filtered['start_station_name'].notna()
    & sanjose_df_copy_filtered['end_station_name'].notna()
)
different_stations = (
    sanjose_df_copy_filtered['start_station_name'] != sanjose_df_copy_filtered['end_station_name']
)
sanjose_df_copy_filtered_no_nan = sanjose_df_copy_filtered[has_both_names & different_stations]

# Count records per (start station, end station) pair and keep the ten most frequent
route_columns = [
    'start_station_name', 'end_station_name',
    'start_station_latitude', 'start_station_longitude',
    'end_station_latitude', 'end_station_longitude',
]
popular_stations = (
    sanjose_df_copy_filtered_no_nan
    .groupby(route_columns)
    .size()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='trip_count')
)

# One [start point, end point] segment per popular route
l = [
    [(route.start_station_latitude, route.start_station_longitude),
     (route.end_station_latitude, route.end_station_longitude)]
    for route in popular_stations.itertuples(index=False)
]

# Plot the popular routes on a map
popular_map = folium.Map(location=[37.3425,-121.893028], tiles='openstreetmap', zoom_start=14)

if l:  # nothing to draw when no route qualified
    # NOTE(review): `line_opacity` is kept as written; Leaflet's path option is
    # named `opacity`, so this keyword may have no visible effect — confirm.
    folium.PolyLine(locations = l,
                    line_opacity = 0.1).add_to(popular_map)

# Mark the start and end station of every listed route. Iterating over the
# rows covers all of them (the fixed range(0, 9) skipped the tenth route and
# failed when fewer than nine routes existed).
for route in popular_stations.itertuples(index=False):
    folium.Marker(
        location=[route.start_station_latitude, route.start_station_longitude],
        popup=route.start_station_name,
    ).add_to(popular_map)
    folium.Marker(
        location=[route.end_station_latitude, route.end_station_longitude],
        popup=route.end_station_name,
    ).add_to(popular_map)

popular_map

In [None]:
# Number of distinct start and end station names.
# Both are computed in one expression because a notebook cell only displays
# its last expression; the start-station count on its own line was never shown.
sanjose_df_copy_filtered[['start_station_name', 'end_station_name']].nunique()

In [None]:
# Summary statistics of the filtered frame.
sanjose_df_copy_filtered.describe()

In [None]:
# Percentage of missing values per column in the filtered frame.
# The denominator is the filtered frame's own length (it used to be the length
# of sanjose_df_copy, a different and larger frame, which understated the shares).
sanjose_df_copy_filtered.isnull().sum() / len(sanjose_df_copy_filtered) * 100

In [None]:
# Column overview of the filtered frame before clustering.
sanjose_df_copy_filtered.info()

# Experimentation and Value Proposition

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# Start and end coordinates as (latitude, longitude) pairs in radians, the
# form used with the haversine metric
start_coords = np.deg2rad(
    sanjose_df_copy_filtered[['start_station_latitude', 'start_station_longitude']].to_numpy()
)
end_coords = np.deg2rad(
    sanjose_df_copy_filtered[['end_station_latitude', 'end_station_longitude']].to_numpy()
)

# Neighborhood radius in radians. 1 km is approximately 0.008998919 degrees;
# half of that, 0.0044994595 degrees (about 0.5 km), is used here.
eps_rad = np.deg2rad(0.0044994595)

# Cluster start points and end points with the same DBSCAN settings.
# NOTE(review): these are two independent fits, so a label in start_cluster
# need not denote the same area as the same label in end_cluster — confirm
# this is intended before comparing the two columns.
db = DBSCAN(eps=eps_rad, min_samples=500, metric='haversine')
start_cluster_labels = db.fit_predict(start_coords)
end_cluster_labels = db.fit_predict(end_coords)

# Store the cluster labels on the frame
sanjose_df_copy_filtered['start_cluster'] = start_cluster_labels
sanjose_df_copy_filtered['end_cluster'] = end_cluster_labels

In [None]:
# Unique cluster labels for start and end clusters
unique_start_clusters = np.unique(start_cluster_labels)
unique_end_clusters = np.unique(end_cluster_labels)


def _letter_code(position):
    """Return A, B, ..., Z, AA, AB, ... for position 0, 1, ..., 25, 26, 27, ..."""
    letters = ""
    position += 1
    while position:
        position, remainder = divmod(position - 1, 26)
        letters = chr(65 + remainder) + letters
    return letters


def _name_clusters(cluster_labels):
    """Map each label to a display name: -1 becomes "Noise", the rest Cluster_A, Cluster_B, ...

    The noise label is left out of the lettering so that the first real cluster
    is always Cluster_A (before, noise consumed the letter A whenever it was
    present, so named clusters started at B).
    """
    real_labels = [label for label in cluster_labels if label != -1]
    names = {label: f"Cluster_{_letter_code(position)}" for position, label in enumerate(real_labels)}
    names[-1] = "Noise"
    return names


# Custom names for each unique cluster label
start_cluster_names = _name_clusters(unique_start_clusters)
end_cluster_names = _name_clusters(unique_end_clusters)


In [None]:
# Apply the custom names to the 'start_cluster' and 'end_cluster' columns.
# The names are mapped from the label arrays returned by DBSCAN rather than
# from the columns themselves, so running this cell a second time gives the
# same result instead of mapping already-named values to NaN.
row_index = sanjose_df_copy_filtered.index
sanjose_df_copy_filtered['start_cluster'] = pd.Series(start_cluster_labels, index=row_index).map(start_cluster_names)
sanjose_df_copy_filtered['end_cluster'] = pd.Series(end_cluster_labels, index=row_index).map(end_cluster_names)


In [None]:
# Number of records per start cluster.
start_clusters = sanjose_df_copy_filtered['start_cluster'].value_counts()
start_clusters

In [None]:
# Number of records per end cluster.
end_clusters = sanjose_df_copy_filtered['end_cluster'].value_counts()
end_clusters

In [None]:
# Preview the frame with its cluster columns (first rows only, not the whole frame)
sanjose_df_copy_filtered.head()

### cross-cluster analysis


In [None]:
# Number of trips for every (start cluster, end cluster) pair
trip_counts_between_clusters = (
    sanjose_df_copy_filtered
    .groupby(['start_cluster', 'end_cluster'])
    .size()
    .reset_index(name='trip_count')
)
trip_counts_between_clusters


In [None]:
# Sort the cluster pairs from most to least frequent
sorted_trip_counts = trip_counts_between_clusters.sort_values(by='trip_count', ascending=False)

# Show the 20 most frequent trips between clusters (as the cell's rich output
# rather than a plain-text print)
sorted_trip_counts.head(20)


In [None]:
# Start-cluster x end-cluster matrix of trip counts for the heatmap
trip_matrix = trip_counts_between_clusters.pivot(
    index='start_cluster',
    columns='end_cluster',
    values='trip_count',
)

# Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(trip_matrix, annot=True, fmt=".0f", cmap="YlGnBu", ax=ax)
ax.set_title('Trips Between Clusters')
ax.set_xlabel('End Cluster')
ax.set_ylabel('Start Cluster')
plt.show()

In [None]:
import branca
from matplotlib import cm
from matplotlib.colors import to_hex

# Values that mark noise in 'start_cluster': -1 is DBSCAN's raw label and
# "Noise" is the name it is given once the cluster columns have been renamed.
NOISE_LABELS = [-1, "Noise"]

# One marker per distinct (latitude, longitude, cluster) combination instead
# of one per record, with noise left out.
start_locations = (
    sanjose_df_copy_filtered[['start_station_latitude', 'start_station_longitude', 'start_cluster']]
    .dropna()
    .drop_duplicates()
)
start_locations = start_locations[~start_locations['start_cluster'].isin(NOISE_LABELS)]

map_center = [sanjose_df_copy_filtered['start_station_latitude'].mean(), sanjose_df_copy_filtered['start_station_longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=13)

# plt.get_cmap is used because matplotlib.cm.get_cmap is not available in
# recent matplotlib releases.
cmap = plt.get_cmap('Set1')

# Fixed color per cluster, assigned by the cluster's position in sorted order
# (cluster labels may be strings, so they cannot be fed to the colormap directly).
cluster_order = sorted(start_locations['start_cluster'].unique())
cluster_colors = {
    label: to_hex(cmap(position % cmap.N))
    for position, label in enumerate(cluster_order)
}

# Define the color mapping function
def color_producer(cluster):
    """Return the hex color of a cluster label; black for noise or unknown labels."""
    return cluster_colors.get(cluster, '#000000')

# Add markers to the map
for location in start_locations.itertuples(index=False):
    folium.CircleMarker(
        location=[location.start_station_latitude, location.start_station_longitude],
        radius=5,
        color=color_producer(location.start_cluster),
        fill=True
    ).add_to(m)

# Create a legend with one line per plotted cluster
legend_rows = ''.join(
    '     &nbsp; {} &nbsp; <i class="fa fa-circle" style="color:{}"></i><br>\n'.format(
        str(label).replace('_', ' '), color_producer(label)
    )
    for label in cluster_order
)
legend_html = '''
<div style="position: fixed; 
     bottom: 50px; left: 50px; width: 150px; height: {}px; 
     border:2px solid grey; z-index:9999; font-size:14px;
     ">&nbsp; Cluster Legend <br>
{}</div>'''.format(len(cluster_order) * 25 + 30, legend_rows)

# Add the legend to the map
m.get_root().html.add_child(folium.Element(legend_html))

# Show the map
m



In [None]:
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt

coords = sanjose_df_copy_filtered[['start_station_latitude', 'start_station_longitude']].apply(np.radians).values

# k-distance plot for choosing DBSCAN's eps: for every point, the distance to
# its k-th nearest neighbor, with k matching the min_samples=500 used for DBSCAN.
k = 500
nearest_neighbors = NearestNeighbors(n_neighbors=k, metric='haversine')
neighbors = nearest_neighbors.fit(coords)
distances, indices = neighbors.kneighbors(coords)

# Each row of `distances` is ordered from nearest to farthest, so the last
# column is the distance to the k-th neighbor. Only that column is kept and
# sorted; sorting the full matrix column-by-column and reading column 1 showed
# the nearest-neighbor distance instead.
distances = np.sort(distances[:, -1])

plt.plot(distances)
plt.title(f'Distance to the {k}th nearest neighbor')
plt.xlabel('Points sorted by distance')
plt.ylabel('Haversine distance (radians)')  # same unit as eps_rad
plt.show()

In [None]:
import folium
from branca.element import Template, MacroElement
from matplotlib import cm
from matplotlib.colors import to_hex

# Values that mark noise in 'start_cluster': -1 is DBSCAN's raw label and
# "Noise" is the name it is given once the cluster columns have been renamed.
NOISE_LABELS = [-1, "Noise"]

# One marker per distinct (latitude, longitude, cluster) combination instead
# of one per record, with noise left out.
start_locations = (
    sanjose_df_copy_filtered[['start_station_latitude', 'start_station_longitude', 'start_cluster']]
    .dropna()
    .drop_duplicates()
)
start_locations = start_locations[~start_locations['start_cluster'].isin(NOISE_LABELS)]

# Clusters to draw (excluding noise), in a stable sorted order
unique_clusters = sorted(start_locations['start_cluster'].unique())

# Define the colormap. plt.get_cmap is used because matplotlib.cm.get_cmap is
# not available in recent matplotlib releases.
cmap = plt.get_cmap('Set1')

# Color per cluster, by position in unique_clusters. Labels may be strings, so
# no arithmetic is done on them, and a single cluster no longer divides by zero.
cluster_colors = {
    label: to_hex(cmap(position % cmap.N))
    for position, label in enumerate(unique_clusters)
}

# Define the color mapping function
def color_producer(cluster_label):
    """Return the hex color of a cluster label; black for noise or unknown labels."""
    return cluster_colors.get(cluster_label, '#000000')

# Create a Folium map
map_center = [sanjose_df_copy_filtered['start_station_latitude'].mean(), sanjose_df_copy_filtered['start_station_longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=13)

# Add markers to the map, excluding noise
for location in start_locations.itertuples(index=False):
    folium.CircleMarker(
        location=[location.start_station_latitude, location.start_station_longitude],
        radius=5,
        color=color_producer(location.start_cluster),
        fill=True
    ).add_to(m)

# Start of legend HTML code
legend_html = '''
<div style="position: fixed; 
     bottom: 50px; left: 50px; width: 150px; height: {}px; 
     border:2px solid grey; z-index:9999; font-size:14px;
     background-color: white;">
     &nbsp; Cluster Legend <br>
'''.format(len(unique_clusters) * 25 + 30)  # Calculate height based on number of clusters

# Dynamically add cluster colors to the legend
for cluster_label in unique_clusters:
    color = color_producer(cluster_label)
    legend_html += f'&nbsp; Cluster {cluster_label} &nbsp; <i class="fa fa-circle fa-lg" style="color:{color}"></i><br>'

# Close the legend HTML code
legend_html += '</div>'

# Add the legend to the map. A MacroElement renders the `html` macro of its
# template, so the legend markup is wrapped in one.
legend = MacroElement()
legend._template = Template(
    '{% macro html(this, kwargs) %}' + legend_html + '{% endmacro %}'
)

m.get_root().add_child(legend)

# Show the map
m


In [None]:
# Per-start-cluster summaries, all taken from a single grouping
by_start_cluster = sanjose_df_copy_filtered.groupby('start_cluster')

# Summary statistics of ride duration within each cluster
cluster_summary_ride_duration = by_start_cluster['ride_duration'].describe()

# Summary statistics of the start hour within each cluster
cluster_summary_start_hour = by_start_cluster['start_hour'].describe()

# Rides per day of the week for each cluster (clusters as rows, days as columns)
cluster_summary_day_of_week = by_start_cluster['day_of_week'].value_counts().unstack(fill_value=0)

# Print each summary under its heading
for heading, summary in (
    ("Ride Duration Statistics by Cluster:", cluster_summary_ride_duration),
    ("\nStart Hour Statistics by Cluster:", cluster_summary_start_hour),
    ("\nRide Counts by Day of the Week and Cluster:", cluster_summary_day_of_week),
):
    print(heading)
    print(summary)


In [None]:

# Ride duration distribution per cluster
plt.figure(figsize=(12, 6))
sns.boxplot(data=sanjose_df_copy_filtered, x='start_cluster', y='ride_duration')
plt.title('Ride Duration Distribution by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Ride Duration (minutes)')
plt.show()

# Number of rides per cluster.
# The plotted value is the 'count' column of the start-hour describe() table,
# i.e. how many rides each cluster has, so the chart is titled accordingly
# (it was labelled as a start-hour distribution, which it does not show).
plt.figure(figsize=(12, 6))
sns.barplot(data=cluster_summary_start_hour['count'].reset_index(), x='start_cluster', y='count')
plt.title('Number of Rides by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Number of Rides')
plt.show()

# Ride counts by day of the week and cluster
plt.figure(figsize=(12, 6))
sns.heatmap(cluster_summary_day_of_week, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Ride Counts by Day of the Week and Cluster')
plt.xlabel('Day of the Week')
plt.ylabel('Cluster')
plt.show()

In [None]:
# Mean ride duration of each start cluster, as a two-column frame for plotting
mean_ride_duration_per_cluster = (
    sanjose_df_copy_filtered
    .groupby('start_cluster')['ride_duration']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=mean_ride_duration_per_cluster,
    x='start_cluster',
    y='ride_duration',
    palette=lyft_palette,
    ax=ax,
)
ax.set_title('Average Ride Duration by Cluster')
ax.set_xlabel('Clusters')
ax.set_ylabel('Average Ride Duration (minutes)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of rides per (start cluster, start hour) pair
start_hour_distribution = (
    sanjose_df_copy_filtered
    .groupby(['start_cluster', 'start_hour'])
    .size()
    .reset_index(name='count')
)

# Grouped bar chart: one bar per cluster within each start hour
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=start_hour_distribution,
    x='start_hour',
    y='count',
    hue='start_cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('Start Hour Distribution by Cluster')
ax.set_xlabel('Start Hour')
ax.set_ylabel('Count')
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Rideable types within each start cluster
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=sanjose_df_copy_filtered,
    x='start_cluster',
    hue='rideable_type',
    palette=lyft_palette,
    ax=ax,
)
ax.set_title('Distribution of Rideable Types by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
ax.legend(title='Rideable Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# User types within each start cluster
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=sanjose_df_copy_filtered,
    x='start_cluster',
    hue='member_casual',
    palette=lyft_palette,
    ax=ax,
)
ax.set_title('Distribution of User Types by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
ax.legend(title='User Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()