In [3]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
# Render Plotly figures with the "vscode" renderer; adjust this when the
# notebook is run in a different front end.
pio.renderers.default = "vscode"

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN

In [4]:
# Source CSV hosted on S3 (file name suggests Uber raw data for April 2014).
url = "https://uber-28-02-2023.s3.eu-west-3.amazonaws.com/uber-raw-data-apr14.csv"
# Read the whole file into memory.
df = pd.read_csv(filepath_or_buffer=url)

In [5]:
# Number of rows loaded from the CSV.
len(df)

564516

In [6]:
# Work on a 5,000-row random subset. The fixed seed makes the sample, and
# therefore every downstream table, cluster and figure, reproducible after a
# kernel restart.
df = df.sample(n=5000, random_state=42)

In [7]:
# Discard the row labels carried over from the full dataset and renumber from 0.
df = df.reset_index(drop=True)

In [8]:
# Preview the first rows of the sampled frame.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/13/2014 0:35:00,40.7528,-74.0261,B02598
1,4/5/2014 0:46:00,40.7161,-74.0086,B02598
2,4/4/2014 15:08:00,40.717,-74.039,B02598
3,4/9/2014 7:11:00,40.7427,-73.9943,B02617
4,4/17/2014 13:34:00,40.7564,-73.978,B02598


In [9]:
# Preview the last rows of the sampled frame.
df.tail()

Unnamed: 0,Date/Time,Lat,Lon,Base
4995,4/21/2014 15:34:00,40.6705,-73.9444,B02598
4996,4/24/2014 15:48:00,40.7607,-73.9716,B02598
4997,4/1/2014 8:50:00,40.7336,-73.992,B02682
4998,4/24/2014 22:27:00,40.7278,-74.0015,B02682
4999,4/28/2014 3:56:00,40.6767,-73.9993,B02682


In [10]:
# Inspect column types before any conversion.
df.dtypes

Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object

In [11]:
# Parse the timestamp strings into datetimes; the explicit format matches the
# month/day/year layout of the raw values.
df["Date/Time"] = pd.to_datetime(df["Date/Time"], format='%m/%d/%Y %H:%M:%S')

In [12]:
# Check the first rows after the timestamp conversion.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-04-13 00:35:00,40.7528,-74.0261,B02598
1,2014-04-05 00:46:00,40.7161,-74.0086,B02598
2,2014-04-04 15:08:00,40.717,-74.039,B02598
3,2014-04-09 07:11:00,40.7427,-73.9943,B02617
4,2014-04-17 13:34:00,40.7564,-73.978,B02598


In [13]:
# Order rows chronologically and renumber the index from 0 in one step.
df = df.sort_values(by="Date/Time", ignore_index=True)

In [14]:
# Preview the first rows after sorting by timestamp.
df.head()
# value counts per hourly time slot (reads like a note for a follow-up analysis)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-04-01 00:43:00,40.7019,-74.0112,B02598
1,2014-04-01 04:14:00,40.7139,-74.0066,B02682
2,2014-04-01 05:11:00,40.6409,-73.9607,B02598
3,2014-04-01 05:14:00,40.7583,-73.9607,B02682
4,2014-04-01 05:19:00,40.7111,-74.0052,B02598


In [15]:
# Plot every sampled point on a light basemap using its Lat/Lon columns.
fig = px.scatter_mapbox(df, lat="Lat", lon="Lon", mapbox_style="carto-positron")
fig.show()

In [16]:
# List the available columns before choosing clustering features.
df.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base'], dtype='object')

In [17]:
# Re-check column types after the timestamp conversion.
df.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [18]:
# Quantitative columns of X: only the coordinates are used, each standardised.
numeric_features = ["Lat", "Lon"]
numeric_transformer = StandardScaler()

# Single-branch preprocessing; columns not listed here are dropped
# (ColumnTransformer's default remainder behaviour).
# A one-hot encoded categorical branch could be added as a second transformer.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

In [19]:
# Fit the preprocessor on df and build the feature matrix passed to DBSCAN below.
X = preprocessor.fit_transform(df)

In [20]:
# Density-based clustering on the scaled features; eps is expressed in the
# scaled feature space, not in degrees. fit() returns the estimator itself.
db = DBSCAN(eps=0.5, min_samples=3).fit(X)

# Keep a copy of the reference to this run's labels.
prediction_1 = db.labels_

In [21]:
# Per-row cluster labels from the fitted model.
labels = db.labels_

# Boolean mask with one entry per row, True where the row is a core sample.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [22]:
# Count the distinct labels, leaving out -1 (the noise label) when it occurs.
n_clusters_ = len(set(labels) - {-1})

print("Estimated number of clusters: %d" % n_clusters_)

Estimated number of clusters: 4


In [23]:
# Attach each row's DBSCAN label as a new column (mutates df in place).
df["cluster"] = labels

In [24]:
# Cluster ids are categorical labels, not quantities. Passing the integer
# column makes Plotly Express draw a continuous colour bar, so the labels are
# cast to strings on a temporary copy to get one discrete colour and legend
# entry per cluster. df itself is left unchanged.
fig = px.scatter_mapbox(
    df.assign(cluster=df["cluster"].astype(str)),
    lat="Lat",
    lon="Lon",
    color="cluster",
    zoom=10,
    mapbox_style="open-street-map",
)
fig.show()