In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
# Use the "vscode" renderer as Plotly's default for every fig.show() call.
pio.renderers.default = "vscode"

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN

In [2]:
# Raw Uber pickup records (the file name and the dates shown below indicate
# April 2014), read straight from the hosted CSV.
url = 'https://uber-28-02-2023.s3.eu-west-3.amazonaws.com/uber-raw-data-apr14.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Row count of the full dataset, before any sampling.
len(df)

564516

In [4]:
# Work on a 5,000-row random subset of the pickups.
# A fixed random_state makes the draw (and therefore every downstream cluster
# and figure) identical on each Restart & Run All; without it each run sampled
# different rows.
df = df.sample(n=5000, random_state=42)

In [5]:
# Discard the original row labels left over from sampling and renumber from 0.
df = df.reset_index(drop=True)

In [6]:
# Preview the first rows of the sampled frame.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/24/2014 17:20:00,40.6907,-73.9818,B02598
1,4/24/2014 8:59:00,40.7188,-73.9885,B02598
2,4/25/2014 9:12:00,40.7388,-74.0091,B02598
3,4/17/2014 18:23:00,40.7232,-73.9975,B02598
4,4/17/2014 14:28:00,40.7461,-73.9839,B02598


In [7]:
# Preview the last rows of the sampled frame.
df.tail()

Unnamed: 0,Date/Time,Lat,Lon,Base
4995,4/10/2014 7:47:00,40.7373,-73.9843,B02682
4996,4/9/2014 14:05:00,40.7467,-73.9437,B02512
4997,4/24/2014 21:39:00,40.7243,-73.9987,B02682
4998,4/17/2014 22:23:00,40.7387,-73.9888,B02617
4999,4/17/2014 9:46:00,40.7515,-73.9941,B02512


In [8]:
# Inspect column types; Date/Time is still a plain object (string) column here.
df.dtypes

Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object

In [9]:
# Convert the pickup timestamps from text (month/day/year, 24-hour clock)
# into real datetime values.
df["Date/Time"] = pd.to_datetime(
    df["Date/Time"],
    format='%m/%d/%Y %H:%M:%S',
)

In [10]:
# Confirm that Date/Time now displays as parsed timestamps.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-04-24 17:20:00,40.6907,-73.9818,B02598
1,2014-04-24 08:59:00,40.7188,-73.9885,B02598
2,2014-04-25 09:12:00,40.7388,-74.0091,B02598
3,2014-04-17 18:23:00,40.7232,-73.9975,B02598
4,2014-04-17 14:28:00,40.7461,-73.9839,B02598


In [11]:
# Order the pickups chronologically and renumber the rows from 0 in one step.
df = df.sort_values(by="Date/Time", ignore_index=True)

In [12]:
# Check the result of the chronological sort: earliest pickups come first.
df.head()
# value counts per hourly time slot

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-04-01 01:11:00,40.7864,-73.9519,B02682
1,2014-04-01 05:10:00,40.7231,-73.9766,B02617
2,2014-04-01 06:10:00,40.73,-74.0069,B02617
3,2014-04-01 06:13:00,40.7198,-74.0071,B02682
4,2014-04-01 06:17:00,40.7651,-74.082,B02512


In [13]:
# Plot every sampled pickup location on an interactive map using the
# "carto-positron" base style.
fig = px.scatter_mapbox(df, lat="Lat", lon="Lon", mapbox_style="carto-positron")
fig.show()

In [14]:
# List the available columns before choosing the clustering features.
df.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base'], dtype='object')

In [15]:
# Re-check column types: Date/Time is now datetime64, Lat/Lon are floats.
df.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [16]:
# Preprocessing for clustering: only the Lat/Lon columns are listed, and they
# are standardised with StandardScaler.
numeric_features = ['Lat', 'Lon'] # quantitative columns in X
numeric_transformer = StandardScaler()

# Disabled categorical branch, kept for reference:
#categorical_features = ['Day_of_Week'] # categorical columns in X
#categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        #('cat', categorical_transformer, categorical_features)
    ])

In [17]:
# Fit the preprocessor on df and apply it; X is the feature matrix given to DBSCAN.
X = preprocessor.fit_transform(df)

In [18]:
# Density-based clustering of the preprocessed features.
# Alternative setting kept for reference: DBSCAN(eps=0.5, min_samples=3)
db = DBSCAN(eps=0.3, min_samples=6).fit(X)
prediction_1 = db.labels_

In [19]:
# Per-row cluster labels, plus a boolean mask that is True for core samples.
labels = db.labels_
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [20]:
# Number of clusters in labels, ignoring noise if present.
# Count the distinct cluster labels, leaving out the noise label (-1) if present.
n_clusters_ = len(set(labels) - {-1})

print("Estimated number of clusters: %d" % n_clusters_)

Estimated number of clusters: 6


In [21]:
# Store each row's DBSCAN label (-1 = noise) so the map can colour points by cluster.
df["cluster"] = labels

In [22]:
# Map the pickups coloured by DBSCAN cluster.
# The integer labels are converted to strings for plotting so that Plotly
# Express treats them as categories (one discrete colour and legend entry per
# cluster) instead of mapping them onto a continuous colour scale.
# df itself is not modified; the string version lives only in the plotted copy.
cluster_order = [str(label) for label in sorted(df["cluster"].unique())]
fig = px.scatter_mapbox(
    df.assign(cluster=df["cluster"].astype(str)),
    lat="Lat",
    lon="Lon",
    color="cluster",
    category_orders={"cluster": cluster_order},  # legend sorted numerically, noise (-1) first
    zoom=10,
    mapbox_style="open-street-map",
)
fig.show()