In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
# Render Plotly figures with the "vscode" renderer; switch to another renderer
# name (e.g. "notebook" or "browser") when running outside VS Code.
pio.renderers.default = "vscode"

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN

In [2]:
# Load the raw September 2014 Uber pickups file.
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (a backslash separator only resolves on Windows).
df = pd.read_csv("uber_data/uber-raw-data-sep14.csv")

In [3]:
# Number of rows in the raw file.
df.shape[0]

1028136

In [4]:
# Work on a 5,000-row random subset to keep clustering and plotting fast.
# A fixed random_state makes the sample (and every result derived from it)
# reproducible across "Restart Kernel -> Run All".
df = df.sample(n=5000, random_state=42)

In [5]:
# Replace the row labels inherited from the raw file with a fresh 0..n-1 index;
# drop=True discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)

In [6]:
# Peek at the first five sampled rows.
df.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/2/2014 20:53:00,40.7182,-73.9864,B02598
1,9/21/2014 22:55:00,40.7408,-74.0076,B02617
2,9/22/2014 22:07:00,40.7066,-73.8298,B02764
3,9/13/2014 15:57:00,40.775,-73.9137,B02764
4,9/8/2014 21:30:00,40.7518,-73.9861,B02598


In [7]:
# Peek at the last five sampled rows.
df.tail(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
4995,9/13/2014 17:47:00,40.7823,-73.9804,B02617
4996,9/29/2014 13:26:00,40.7206,-73.9971,B02617
4997,9/17/2014 21:09:00,40.7614,-73.9801,B02617
4998,9/27/2014 12:31:00,40.7499,-73.9836,B02598
4999,9/30/2014 6:57:00,40.7408,-73.9974,B02682


In [8]:
# Column dtypes right after loading: "Date/Time" is still a plain object
# (string) column here and is parsed to datetime in a later cell.
df.dtypes

Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object

In [9]:
# How many sampled pickups belong to each value of the "Base" column.
df["Base"].value_counts()

B02617    1863
B02598    1172
B02682     941
B02764     856
B02512     168
Name: Base, dtype: int64

In [10]:
# Raw "Date/Time" strings look like "9/2/2014 20:53:00", i.e. format '%m/%d/%Y %H:%M:%S'.

In [11]:
# Parse the pickup timestamp strings into real datetimes, using an explicit
# month/day/year format.
df["Date/Time"] = pd.to_datetime(
    df["Date/Time"],
    format="%m/%d/%Y %H:%M:%S",
)

In [12]:
# Check that "Date/Time" now displays as parsed timestamps.
df.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-09-02 20:53:00,40.7182,-73.9864,B02598
1,2014-09-21 22:55:00,40.7408,-74.0076,B02617
2,2014-09-22 22:07:00,40.7066,-73.8298,B02764
3,2014-09-13 15:57:00,40.775,-73.9137,B02764
4,2014-09-08 21:30:00,40.7518,-73.9861,B02598


In [13]:
# Put the pickups in chronological order and renumber the rows accordingly.
df = (
    df.sort_values(by="Date/Time")
    .reset_index(drop=True)
)

In [14]:
# Derive calendar features from the parsed pickup timestamp.
# "Date/Time" is already a datetime column, so the .dt accessor is used directly
# (no extra DatetimeIndex / to_datetime conversion), consistent with the
# Hour/Minute extraction in the next cell.
# Year and month are not extracted: the sampled rows shown above are all from
# September 2014, so they would presumably be constant columns.
df["Day"] = df["Date/Time"].dt.day
df["Day_of_Week"] = df["Date/Time"].dt.day_name()


In [15]:
# Add the hour and minute of each pickup as two new trailing columns.
df = df.assign(
    Hour=df["Date/Time"].dt.hour,
    Minute=df["Date/Time"].dt.minute,
)

In [16]:
# The base code and the raw timestamp are no longer needed once the
# Day / Day_of_Week / Hour / Minute features exist.
df = df.drop(columns=["Base", "Date/Time"])

In [17]:
# Row count after feature engineering (no rows should have been lost).
df.shape[0]

5000

In [18]:
# Preview the engineered feature table.
# TODO: look at value counts per hourly time slot.
df.head(n=5)

Unnamed: 0,Lat,Lon,Day,Day_of_Week,Hour,Minute
0,40.6747,-73.9542,1,Monday,0,30
1,40.7646,-73.9958,1,Monday,0,53
2,40.7006,-73.9936,1,Monday,1,17
3,40.7321,-73.9901,1,Monday,1,39
4,40.7859,-73.9689,1,Monday,2,21


In [19]:
# Plot every sampled pickup location on a light basemap.
fig = px.scatter_mapbox(data_frame=df, lat="Lat", lon="Lon", mapbox_style="carto-positron")
fig.show()

In [20]:
# Last five rows of the engineered feature table.
df.tail(n=5)

Unnamed: 0,Lat,Lon,Day,Day_of_Week,Hour,Minute
4995,40.7231,-73.9842,30,Tuesday,22,17
4996,40.7408,-74.0056,30,Tuesday,22,20
4997,40.7211,-73.9841,30,Tuesday,22,28
4998,40.7624,-73.9712,30,Tuesday,22,51
4999,40.6821,-73.9212,30,Tuesday,22,51


In [21]:
# Percentage of missing values per column (null count * 100 / number of rows).
# No missing values
df.isnull().sum().mul(100).div(df.shape[0])

Lat            0.0
Lon            0.0
Day            0.0
Day_of_Week    0.0
Hour           0.0
Minute         0.0
dtype: float64

In [22]:
# List the columns available for the preprocessing step.
df.columns

Index(['Lat', 'Lon', 'Day', 'Day_of_Week', 'Hour', 'Minute'], dtype='object')

In [23]:
# Check dtypes to decide which columns are numeric and which are categorical.
df.dtypes

Lat            float64
Lon            float64
Day              int64
Day_of_Week     object
Hour             int64
Minute           int64
dtype: object

In [24]:
# Column groups fed to the preprocessing step.
numeric_features = ["Lat", "Lon", "Day", "Hour", "Minute"]  # quantitative columns
categorical_features = ["Day_of_Week"]  # categorical columns

# Numeric columns are standardised; the categorical column is one-hot encoded
# with its first category dropped.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop="first")

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
)

In [25]:
# Fit the scaler and the encoder on the sampled frame and build the feature
# matrix X that is passed to the clustering model.
X = preprocessor.fit_transform(df)

In [81]:
# Density-based clustering on the preprocessed feature matrix.
db = DBSCAN(eps=0.2, min_samples=3)

# fit_predict fits the model and returns the per-row labels stored in db.labels_.
prediction_1 = db.fit_predict(X)

In [82]:
labels = db.labels_

# Boolean mask flagging the rows DBSCAN reported as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Count the distinct cluster ids, leaving out the noise label (-1) when present.
n_clusters_ = len(set(labels) - {-1})

print("Estimated number of clusters: %d" % n_clusters_)

Estimated number of clusters: 7


In [83]:
# Attach the DBSCAN label of each row to the frame so it can be plotted;
# -1 is the noise label excluded from the cluster count above.
df["cluster"] = labels

In [84]:
# Map the pickups coloured by DBSCAN cluster.
# Cluster ids are categorical labels, not magnitudes: casting them to strings
# (on a temporary copy, df itself is left untouched) makes Plotly Express use a
# discrete colour per cluster with a legend, instead of a continuous colour bar
# that would suggest an ordering between clusters.
fig = px.scatter_mapbox(
    df.assign(cluster=df["cluster"].astype(str)),
    lat="Lat",
    lon="Lon",
    color="cluster",
    zoom=10,
    mapbox_style="open-street-map",
)
fig.show()

In [29]:
# NOTE(review): abandoned scratch code, disabled by wrapping it in a string
# literal. The cell does nothing except evaluate to that string. The snippet
# refers to df["Date/Time"], which is dropped from df in an earlier cell, so it
# would not run as written -- consider deleting this cell.
"""# let's convert Date/Time column to datetime format
df["Date"] = pd.Timestamp(df["Date/Time"]).timestamp()
pd.DatetimeIndex

#df["Timestamp"] = df["[pd.Timestamp('2019-01-15 13:30:00').timestamp()]"""

'# let\'s convert Date/Time column to datetime format\ndf["Date"] = pd.Timestamp(df["Date/Time"]).timestamp()\npd.DatetimeIndex\n\n#df["Timestamp"] = df["[pd.Timestamp(\'2019-01-15 13:30:00\').timestamp()]'