In [1]:
import pandas as pd

import plotly.express as px
import plotly.io as pio
# Make "vscode" the default plotly renderer for figures shown in this notebook.
# NOTE(review): presumably the notebook is meant to be run inside VS Code —
# pick another renderer if it is opened in a different front end.
pio.renderers.default = "vscode"

In [2]:
# Load the driver positions found by each of the two clustering algorithms
dbscan_df, kmeans_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dbscan_driver_positions.csv', 'kmeans_driver_positions.csv')
)

In [3]:
# Label every row with the algorithm it came from, then preview the frame
# and report how many rows it holds.
dbscan_df = dbscan_df.assign(algorithm="dbscan")
display(dbscan_df.head(5))
print("Number or rows in the dataframe:", dbscan_df.shape[0])

Unnamed: 0,Day_of_Week_and_Hour,cluster,Lat,Lon,Day_of_Month,Week_Number,Hour,nb_pickups,algorithm
0,Friday_0_hrs,361,40.740167,-74.004333,18.0,16.0,0.0,3,dbscan
1,Friday_0_hrs,362,40.726833,-73.9941,18.0,16.0,0.0,3,dbscan
2,Friday_10_hrs,70,40.712233,-74.0136,4.0,14.0,10.0,3,dbscan
3,Friday_10_hrs,227,40.748733,-73.9723,11.0,15.0,10.0,3,dbscan
4,Friday_10_hrs,228,40.7243,-74.002933,11.0,15.0,10.0,3,dbscan


Number or rows in the dataframe: 687


In [4]:
# Label every row with the algorithm it came from, then preview the frame
# and report how many rows it holds.
kmeans_df = kmeans_df.assign(algorithm="kmeans")
display(kmeans_df.head(5))
print("Number or rows in the dataframe:", kmeans_df.shape[0])

Unnamed: 0,Day_of_Week_and_Hour,cluster,Lat,Lon,Day_of_Month,Week_Number,Hour,nb_pickups,algorithm
0,Friday_0_hrs,10,40.739733,-73.991467,20.333333,16.333333,0.0,3,kmeans
1,Friday_0_hrs,19,40.74205,-74.00485,14.5,15.5,0.0,2,kmeans
2,Friday_0_hrs,30,40.75805,-73.98575,18.0,16.0,0.0,2,kmeans
3,Friday_0_hrs,35,40.72615,-74.00495,11.0,15.0,0.0,2,kmeans
4,Friday_0_hrs,54,40.7315,-73.99075,18.0,16.0,0.0,2,kmeans


Number or rows in the dataframe: 1956


In [5]:
# Stack the KMeans and DBSCAN results row-wise into one frame for comparison.
# pd.concat is used rather than an outer merge: the intent is to append rows,
# not to join on keys. A merge without `on` would join on every shared column
# (including the float Lat/Lon values) and only keeps all rows because the
# `algorithm` label differs between the two frames.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
comparison_df = pd.concat([kmeans_df, dbscan_df], ignore_index=True)

# Preview both ends of the combined frame: the head shows kmeans rows and the
# tail shows dbscan rows, confirming that both sources are present.
display(comparison_df.head())
display(comparison_df.tail())
print("Number or rows in the dataframe:", len(comparison_df))

Unnamed: 0,Day_of_Week_and_Hour,cluster,Lat,Lon,Day_of_Month,Week_Number,Hour,nb_pickups,algorithm
0,Friday_0_hrs,10,40.739733,-73.991467,20.333333,16.333333,0.0,3,kmeans
1,Friday_0_hrs,19,40.74205,-74.00485,14.5,15.5,0.0,2,kmeans
2,Friday_0_hrs,30,40.75805,-73.98575,18.0,16.0,0.0,2,kmeans
3,Friday_0_hrs,35,40.72615,-74.00495,11.0,15.0,0.0,2,kmeans
4,Friday_0_hrs,54,40.7315,-73.99075,18.0,16.0,0.0,2,kmeans


Unnamed: 0,Day_of_Week_and_Hour,cluster,Lat,Lon,Day_of_Month,Week_Number,Hour,nb_pickups,algorithm
2638,Wednesday_8_hrs,647,40.7697,-73.986667,30.0,18.0,8.0,3,dbscan
2639,Wednesday_9_hrs,310,40.755833,-73.986433,16.0,16.0,9.0,3,dbscan
2640,Wednesday_9_hrs,648,40.7238,-73.9912,30.0,18.0,9.0,3,dbscan
2641,Wednesday_9_hrs,649,40.764417,-73.9647,30.0,18.0,9.0,6,dbscan
2642,Wednesday_9_hrs,650,40.743183,-73.989517,30.0,18.0,9.0,6,dbscan


Number or rows in the dataframe: 2643


In [7]:
# Plot the driver positions of both algorithms on a single map so they can be
# compared: one colour per algorithm, one animation frame per
# Day_of_Week_and_Hour value.
fig = px.scatter_mapbox(
    comparison_df,
    lat="Lat",
    lon="Lon",
    color="algorithm",
    animation_frame='Day_of_Week_and_Hour',
    zoom=10,
    mapbox_style="open-street-map",
)

fig.show()

It looks like the KMeans algorithm is better suited to finding clusters located outside Manhattan (airports, suburbs), since DBSCAN seems to treat such pickup points as outliers.