In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN

In [2]:
# Load the raw Uber pickup records.
# A forward slash is used as the path separator so the relative path resolves on
# Windows, Linux and macOS alike (a backslash only works on Windows).
df = pd.read_csv("uber_data/uber-raw-data-sep14.csv")

In [3]:
# Number of rows loaded from the CSV, before any subsampling.
len(df)

1028136

In [4]:
# Continue with a 5,000-row random subsample of the full table
# (presumably to keep the map plots and DBSCAN tractable).
# A fixed random_state makes the draw, and therefore every downstream
# cluster result, reproducible across kernel restarts.
df = df.sample(n=5000, random_state=42)

In [5]:
# Replace the row labels kept from the sampled rows with a fresh 0..n-1 index;
# drop=True discards the old labels instead of adding them as a column.
df = df.reset_index(drop=True)

In [6]:
# Peek at the first rows of the subsample.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/2/2014 15:06:00,40.7476,-74.0079,B02617
1,9/3/2014 22:23:00,40.7461,-73.9561,B02764
2,9/27/2014 12:56:00,40.7455,-74.0085,B02617
3,9/23/2014 8:48:00,40.7649,-73.9824,B02764
4,9/27/2014 13:27:00,40.7462,-73.9905,B02764


In [7]:
# Peek at the last rows of the subsample.
df.tail()

Unnamed: 0,Date/Time,Lat,Lon,Base
4995,9/18/2014 18:00:00,40.7438,-73.986,B02617
4996,9/6/2014 17:39:00,40.7106,-74.0065,B02617
4997,9/27/2014 10:06:00,40.6671,-73.9942,B02617
4998,9/30/2014 16:21:00,40.7378,-73.9885,B02617
4999,9/15/2014 4:39:00,40.7565,-73.9199,B02617


In [8]:
# Inspect column dtypes before any type conversion.
df.dtypes

Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object

In [9]:
# Parse the 'Date/Time' strings into timestamps. The explicit format
# (month/day/year hour:minute:second) avoids ambiguous day/month guessing.
df["Date/Time"] = pd.to_datetime(df["Date/Time"], format='%m/%d/%Y %H:%M:%S')

In [10]:
# Check how 'Date/Time' displays after the conversion.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-09-02 15:06:00,40.7476,-74.0079,B02617
1,2014-09-03 22:23:00,40.7461,-73.9561,B02764
2,2014-09-27 12:56:00,40.7455,-74.0085,B02617
3,2014-09-23 08:48:00,40.7649,-73.9824,B02764
4,2014-09-27 13:27:00,40.7462,-73.9905,B02764


In [11]:
# Order the pickups chronologically; ignore_index=True renumbers the rows
# 0..n-1 in the sorted order.
df = df.sort_values(by="Date/Time", ignore_index=True)

In [12]:
# First rows after the chronological sort.
df.head()
# value counts per hourly time slot

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-09-01 00:19:00,40.7674,-73.9824,B02598
1,2014-09-01 00:43:00,40.7299,-73.9885,B02598
2,2014-09-01 00:55:00,40.7194,-73.9974,B02617
3,2014-09-01 00:59:00,40.7173,-74.0018,B02617
4,2014-09-01 01:40:00,40.7497,-73.9952,B02764


In [13]:
# Plot the pickup coordinates on a light base map.
fig = px.scatter_mapbox(df, lat="Lat", lon="Lon", mapbox_style="carto-positron")
fig.show()

In [14]:
# List the columns currently in the frame.
df.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base'], dtype='object')

In [15]:
# Re-check the dtypes now that 'Date/Time' has been converted.
df.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [18]:
# Quantitative columns used as clustering features, standardised by StandardScaler.
numeric_features = ["Lat", "Lon"]
numeric_transformer = StandardScaler()

# Disabled categorical branch (one-hot encoding of a 'Day_of_Week' column).
# df has no such column at this point, so it stays commented out:
# categorical_features = ['Day_of_Week']
# categorical_transformer = OneHotEncoder(drop='first')

# With only the numeric branch listed, every other column of df is dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        # ("cat", categorical_transformer, categorical_features),
    ]
)

In [36]:
# Fit the preprocessor on df and build the feature matrix passed to DBSCAN.
X = preprocessor.fit_transform(df)

In [45]:
# Density-based clustering of the scaled coordinates.
# eps is expressed in the units of X (i.e. after the preprocessor's scaling),
# not in degrees of latitude/longitude.
db = DBSCAN(eps=0.5, min_samples=3).fit(X)

# One label per row of X.
prediction_1 = db.labels_

In [46]:
# Cluster label assigned to each row.
labels = db.labels_

# Boolean mask, True at the positions DBSCAN reports as core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [47]:
# Count the distinct labels, leaving out the noise label (-1) when it occurs.
n_clusters_ = len(set(labels) - {-1})

print(f"Estimated number of clusters: {n_clusters_}")

Estimated number of clusters: 9


In [50]:
# Store each row's DBSCAN label on the frame so it can be used for plotting.
# Assignment is positional: labels is a plain array, one entry per row of df.
df["cluster"] = labels

In [51]:
# Plot the pickups on an OpenStreetMap base layer, coloured by the 'cluster' column.
# NOTE(review): 'cluster' holds integers, so plotly will presumably apply a
# continuous colour scale; confirm whether discrete per-cluster colours are wanted.
fig = px.scatter_mapbox(
    df,
    lat="Lat",
    lon="Lon",
    color="cluster",
    zoom=10,
    mapbox_style="open-street-map",
)
fig.show()