# Import Librairies

In [427]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

import geopy.distance

from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=px.colors.qualitative.Safe
)
pio.templates.default = "jedha"

# Explore files

1. Exploring *taxi-zone-lookup*

In [3]:
dataset = pd.read_csv('uber-trip-data/taxi-zone-lookup.csv')
dataset

Unnamed: 0,LocationID,Borough,Zone
0,1,EWR,Newark Airport
1,2,Queens,Jamaica Bay
2,3,Bronx,Allerton/Pelham Gardens
3,4,Manhattan,Alphabet City
4,5,Staten Island,Arden Heights
...,...,...,...
260,261,Manhattan,World Trade Center
261,262,Manhattan,Yorkville East
262,263,Manhattan,Yorkville West
263,264,Unknown,Unknown


In [4]:
zones_df=dataset

We see here that we are in NewYork, and the file is listing all zones of this city

2. Exploring *uber-raw-data*

In [5]:
dataset = pd.read_csv('uber-trip-data/uber-raw-data-apr14.csv')
dataset

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512
...,...,...,...,...
564511,4/30/2014 23:22:00,40.7640,-73.9744,B02764
564512,4/30/2014 23:26:00,40.7629,-73.9672,B02764
564513,4/30/2014 23:31:00,40.7443,-73.9889,B02764
564514,4/30/2014 23:32:00,40.6756,-73.9405,B02764


In [6]:
dataset.describe(include='all')

Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,


Let's convert the Date/Time column and exctract the *Day*, *Day_of_week* and *Hour*

In [7]:
dataset['Date/Time']=pd.to_datetime(dataset['Date/Time'])
dataset['Day']=dataset['Date/Time'].dt.day
dataset['Day_of_week']=dataset['Date/Time'].dt.dayofweek
dataset['Hour']=dataset['Date/Time'].dt.hour

In [8]:
dataset.describe(include='all')

  dataset.describe(include='all')


Unnamed: 0,Date/Time,Lat,Lon,Base,Day,Day_of_week,Hour
count,564516,564516.0,564516.0,564516,564516.0,564516.0,564516.0
unique,41999,,,5,,,
top,2014-04-07 20:21:00,,,B02682,,,
freq,97,,,227808,,,
first,2014-04-01 00:00:00,,,,,,
last,2014-04-30 23:59:00,,,,,,
mean,,40.740005,-73.976817,,16.117127,2.86698,14.465043
std,,0.036083,0.050426,,9.048139,1.82081,5.873925
min,,40.0729,-74.7733,,1.0,0.0,0.0
25%,,40.7225,-73.9977,,8.0,1.0,10.0


Let's drop the *Base* column since we just need to know the places with high pick-ups rates per hour 

In [13]:
dataset=dataset.drop(columns='Base')

In [14]:
data_temp=dataset.loc[dataset['Day']==1]

In [15]:
data_temp

Unnamed: 0,Date/Time,Lat,Lon,Day,Day_of_week,Hour
0,2014-04-01 00:11:00,40.7690,-73.9549,1,1,0
1,2014-04-01 00:17:00,40.7267,-74.0345,1,1,0
2,2014-04-01 00:21:00,40.7316,-73.9873,1,1,0
3,2014-04-01 00:28:00,40.7588,-73.9776,1,1,0
4,2014-04-01 00:33:00,40.7594,-73.9722,1,1,0
...,...,...,...,...,...,...
554926,2014-04-01 23:21:00,40.7219,-73.9920,1,1,23
554927,2014-04-01 23:25:00,40.7261,-74.0027,1,1,23
554928,2014-04-01 23:38:00,40.7364,-73.9926,1,1,23
554929,2014-04-01 23:41:00,40.7149,-73.9405,1,1,23


In [17]:
fig = px.scatter_mapbox(data_temp,
                        #hover_name="Base", hover_data=[], 
                        lat="Lat", lon="Lon", 
                        #color="Base",
                        #size='', size_max=15,
                        animation_frame = "Hour", #animation_group = "Base",
                        zoom=10, mapbox_style="carto-positron",width=800, height=800)
fig.update_layout(title='toto')
fig.show()

# Using KMeans

**Let's find our hotspots in an hour**

Since we're going to check all hot spots for each hour and day_of_week, let's do our first classification for *Hour=12 / Day_of_week=1*

In [125]:
data_temp=dataset.loc[(dataset['Day_of_week']==1)&(dataset['Hour']==12),['Lat', 'Lon']]

In [126]:
data_temp.head()

Unnamed: 0,Lat,Lon
344,40.7852,-74.022
345,40.7852,-74.022
346,40.6878,-74.1817
347,40.6864,-73.9747
348,40.8289,-73.9451


!! ALL GOOD !!

Let's now normalize our Data_temp : 

In [127]:
sc = StandardScaler()
X=data_temp.values
print('Before Normalization : ')
print(X[:5,])

# Apply StandardScaler to X
X = sc.fit_transform(X)
print('After Normalization : ')
print(X[:5,])


Before Normalization : 
[[ 40.7852 -74.022 ]
 [ 40.7852 -74.022 ]
 [ 40.6878 -74.1817]
 [ 40.6864 -73.9747]
 [ 40.8289 -73.9451]]
After Normalization : 
[[ 1.17668532 -0.82457114]
 [ 1.17668532 -0.82457114]
 [-1.4841723  -3.72708441]
 [-1.52241871  0.03509623]
 [ 2.3705198   0.57306988]]


In [130]:
# Let's create a loop that will collect the Within-sum-of-square (wcss) for each value K 
# Let's use .inertia_ parameter to get the within sum of square value for each value K 
wcss =  []
k = []
for i in range (1,20): 
    kmeans = KMeans(n_clusters= i, random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    k.append(i)
    print("WCSS for K={} --> {}".format(i, wcss[-1]))


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=12.



WCSS for K=1 --> 5637.999999999998
WCSS for K=2 --> 4101.488143624183
WCSS for K=3 --> 3002.941685911461
WCSS for K=4 --> 2312.7216921357076
WCSS for K=5 --> 1745.8744288096686
WCSS for K=6 --> 1434.6993765898412
WCSS for K=7 --> 1136.776946277298
WCSS for K=8 --> 978.6797146946018
WCSS for K=9 --> 842.5476213664017
WCSS for K=10 --> 770.5877966277599
WCSS for K=11 --> 696.492271308614
WCSS for K=12 --> 615.9243019666433
WCSS for K=13 --> 551.3396454350208
WCSS for K=14 --> 496.60439685383653
WCSS for K=15 --> 456.0460411011361
WCSS for K=16 --> 423.31620625273445
WCSS for K=17 --> 384.0655282081003
WCSS for K=18 --> 353.8947086618156
WCSS for K=19 --> 330.8017582921483


In [131]:
# Let's visualize using plotly
import plotly.express as px

# Create DataFrame
wcss_frame = pd.DataFrame(wcss)
k_frame = pd.Series(k)

# Create figure
fig= px.line(
    wcss_frame,
    x=k_frame,
    y=wcss_frame.iloc[:,-1]
)

# Create title and axis labels
fig.update_layout(
    yaxis_title="Inertia",
    xaxis_title="# Clusters",
    title="Inertia per cluster"
)


As we can see, after K=5 or K=8 WCSS is not decreasing a whole lot.

Let's now double check with the Silhouette method.

In [133]:
# Computer mean silhouette score
sil = []
k = []

## Careful, you need to start at i=2 as silhouette score cannot accept less than 2 labels 
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i, random_state = 0)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.predict(X)))
    k.append(i)
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Silhouette score for K=2 is 0.3921641414664743
Silhouette score for K=3 is 0.42340042017594837
Silhouette score for K=4 is 0.44428103076443504
Silhouette score for K=5 is 0.4819541019976179
Silhouette score for K=6 is 0.4736753598984761
Silhouette score for K=7 is 0.48855798084020313
Silhouette score for K=8 is 0.4955327159434642
Silhouette score for K=9 is 0.42649936818283257
Silhouette score for K=10 is 0.4289779370536653


In [134]:
# Create a data frame 
cluster_scores=pd.DataFrame(sil)
k_frame = pd.Series(k)

# Create figure
fig = px.bar(data_frame=cluster_scores,  
             x=k, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

**K=8** is best but we're going to take **K=10** to have shorter distances, so we can go neary everywhere under 7min

Let's see each cluster in a map to have a look of how it went

- **K=10**

In [344]:
n_clusters=10
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)

KMeans(n_clusters=10)

In [345]:
#Find the Cluster centers
cluster_centers = sc.inverse_transform(kmeans.cluster_centers_)
cluster_centers

array([[ 40.71367262, -73.94625119],
       [ 40.76837612, -73.86432388],
       [ 40.64732848, -73.78371576],
       [ 40.69402308, -74.18034615],
       [ 40.7437593 , -73.99389942],
       [ 40.62835238, -73.96120952],
       [ 40.82388   , -73.9371025 ],
       [ 40.67868242, -73.96431648],
       [ 40.72240437, -73.99822022],
       [ 40.76576923, -73.97324406]])

In [346]:
#Complete our dataset with the centers to show them in the map
X_temp=sc.inverse_transform(X)
print(X_temp.shape)

X_temp=np.concatenate((X_temp,cluster_centers))
print(X_temp.shape)

(1018, 2)
(1028, 2)


In [347]:
#Make the prediction
Y=kmeans.predict(X)
print(Y.shape)

#Order by hot spots : [0 : Hotspot , ---- ,n_clusters-1 : Not hotspot ]
(unique, counts) = np.unique(Y, return_counts=True)
order=np.argsort(counts)[::-1]
order=[np.where(order==i)[0][0] for i in range(n_clusters)]

Y=np.array([order[i] for i in Y])


#Give the centers diffrent names so we can find them in map
Y_centers=np.array(['center'+str(i) for i in order])
print(Y_centers.shape)

Y=np.concatenate((Y,Y_centers))
print(Y[-5:])

(1018,)
(10,)
['center9' 'center8' 'center4' 'center0' 'center3']


In [348]:
# Create the dataframe to plot
data_temp=pd.DataFrame(X_temp,columns=['Lat','Lon'])
data_temp['cluster']=Y

In [349]:
fig = px.scatter_mapbox(data_temp,
                        #hover_name="Base", hover_data=[], 
                        lat="Lat", lon="Lon", 
                        color="cluster",
                        #size='', size_max=15,
                        #animation_frame = "Hour", #animation_group = "Base",
                        zoom=10, mapbox_style="carto-positron",width=800, height=800)
fig.update_layout(title='toto')
fig.show()

In [350]:
data_temp.groupby('cluster').count().reset_index().sort_values('Lat', ascending =False)

Unnamed: 0,cluster,Lat,Lon
0,0,182,182
1,1,173,173
2,2,165,165
3,3,143,143
4,4,91,91
5,5,84,84
6,6,67,67
7,7,52,52
8,8,40,40
9,9,21,21


Vitesse moyenne d'une voiture en ville : entre 21 km/h et 12km/h aux heures de pointe.

In [351]:
def fill_distance_temps(dataset,min_speed=15,max_speed=21):
    distances=[]

    for i in range(dataset.shape[0]):
        
        point=dataset.iloc[i]
        coords=(point['Lat'],point['Lon'])

        try : 
            center='center'+str(int(point['cluster']))
        except :
            center=point['cluster']
        
        point_center_index = dataset.loc[dataset['cluster']==center].index[0]
        point_center = dataset.iloc[point_center_index]
        coords_center=(point_center['Lat'],point_center['Lon'])

        distances.append(geopy.distance.distance(coords, coords_center).km)

    dataset['distances']=distances

    # let's say that the car is speending at an average of 20Km/h
    dataset['time_to_point_min']=dataset['distances']*60/max_speed
    dataset['time_to_point_max']=dataset['distances']*60/min_speed


In [390]:
Y_centers

array(['center0', 'center1', 'center4', 'center7', 'center6', 'center3',
       'center9', 'center2', 'center5', 'center8'], dtype='<U7')

In [391]:
np.argsort(Y_centers)

array([0, 1, 7, 5, 2, 8, 4, 3, 9, 6], dtype=int64)

In [397]:
cluster_centers[2][0]

40.77065401069519

In [398]:
[(cluster_centers[i][0],cluster_centers[i][1]) for i in np.argsort(Y_centers)]

[(40.72757778819119, -73.99552108716026),
 (40.75492257142857, -73.98497471428571),
 (40.78055959004392, -73.96214040995608),
 (40.67640862745098, -73.97708588235294),
 (40.77065401069519, -73.87090802139038),
 (40.71367278911565, -73.94683401360544),
 (40.65761333333334, -73.78945555555556),
 (40.712754545454544, -74.18129696969697),
 (40.75826666666667, -73.66325),
 (40.969944444444444, -73.82298888888889)]

In [410]:
#Faster function to fill distances and times 
coords_centers = [(cluster_centers[i][0],cluster_centers[i][1]) for i in np.argsort(Y_centers)]

def fill_distance_temps2(dataset,min_speed=15,max_speed=21):
    dataset['distances']=dataset.apply(lambda x: geopy.distance.distance( ( x['Lat'], x['Lon'] ), coords_centers[ int(x['cluster'][-1]) ]).km, axis =1)

    # let's say that the car is speending at an average of 20Km/h
    dataset['time_to_point_min']=dataset['distances']*60/max_speed
    dataset['time_to_point_max']=dataset['distances']*60/min_speed


In [411]:
fill_distance_temps2(data_temp,min_speed=8)

In [412]:
data_temp

Unnamed: 0,Lat,Lon,cluster,Day_of_week,Hour,distances,time_to_point_min,time_to_point_max
0,40.720500,-73.996800,0,0,8,0.652937,1.865535,4.897031
1,40.774700,-73.990100,2,0,8,2.316613,6.618895,17.374600
2,40.774100,-73.872300,4,0,8,0.394883,1.128238,2.961625
3,40.735400,-74.003900,0,0,8,1.209917,3.456905,9.074376
4,40.730700,-73.985200,0,0,8,1.028131,2.937516,7.710980
...,...,...,...,...,...,...,...,...
3143,40.726343,-73.995938,center0,0,8,0.000000,0.000000,0.000000
3144,40.658030,-73.787216,center6,0,8,0.000000,0.000000,0.000000
3145,40.969944,-73.822989,center9,0,8,0.000000,0.000000,0.000000
3146,40.714046,-73.946898,center5,0,8,0.000000,0.000000,0.000000


In [406]:
fill_distance_temps(data_temp,min_speed=8)

In [353]:
data_temp.loc[data_temp['cluster'].isin(['0','1','2','3'])].describe()

Unnamed: 0,Lat,Lon,distances,time_to_point_min,time_to_point_max
count,663.0,663.0,663.0,663.0,663.0
mean,40.718614,-73.938329,1.03022,2.943485,7.726648
std,0.04446,0.090439,0.929449,2.655567,6.970864
min,40.6415,-74.0444,0.090581,0.258803,0.679359
25%,40.70535,-73.9972,0.487506,1.392875,3.656296
50%,40.7316,-73.9859,0.8498,2.428,6.373501
75%,40.75045,-73.92295,1.284794,3.670839,9.635953
max,40.7931,-73.7021,11.263156,32.180446,84.473672


In [354]:
data_temp.loc[(data_temp['time_to_point_min']<=7)].shape[0]/data_temp.shape[0]*100

88.715953307393

In [355]:
data_temp.loc[(data_temp['time_to_point_max']<=7) & (data_temp['cluster'].isin(['0','1','2','3']))].shape[0]/data_temp.loc[data_temp['cluster'].isin(['0','1','2','3'])].shape[0]*100

54.90196078431373

let's see the evolution of our clusters by day and hour

In [375]:
sc.inverse_transform(X)

array([[ 40.688 , -74.181 ],
       [ 40.7145, -73.9906],
       [ 40.7415, -73.9875],
       ...,
       [ 40.6449, -73.7823],
       [ 40.7278, -73.9822],
       [ 40.6879, -74.1813]])

In [None]:
n_clusters=10

data_final=pd.DataFrame(columns=['Lat','Lon','cluster','Day_of_week','Hour','distances','time_to_point_min','time_to_point_max'])
data_models=pd.DataFrame(columns=['Day_of_week','Hour','model','order'])

for day in range(7):
    print('------------ Day : ' + str(day) + '-----------------')
    for hour in tqdm(range(24)):
        index='d'+str(day)+'h'+str(hour+1)
        data_models.loc[index, 'Day_of_week'] = 1
        data_models.loc[index, 'Hour'] = hour

        data_temp=dataset.loc[(dataset['Day_of_week']==day)&(dataset['Hour']==hour),['Lat', 'Lon']]
        X=data_temp.values
        

        #Normalisation
        sc = StandardScaler()
        X = sc.fit_transform(X)

        #Fit a model
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(X)

        data_models.loc[index, 'model'] = kmeans

        #Find the Cluster centers
        cluster_centers = sc.inverse_transform(kmeans.cluster_centers_)
        
        #Complete our dataset with the centers to show them in the map
        X_temp=sc.inverse_transform(X)
        X_temp=np.concatenate((X_temp,cluster_centers))
        

        #Make the prediction
        Y=kmeans.predict(X)

        #Order by hot spots : [0 : Hotspot , ---- ,n_clusters-1 : Not hotspot ]
        (unique, counts) = np.unique(Y, return_counts=True)
        order=np.argsort(counts)[::-1]
        order=[np.where(order==i)[0][0] for i in range(n_clusters)]

        data_models.loc[index, 'order'] = order

        Y=np.array([order[i] for i in Y])


        #Give the centers diffrent names so we can find them in map
        Y_centers=np.array(['center'+str(i) for i in order])
        Y=np.concatenate((Y,Y_centers))

        # Create the dataframe to plot
        data_temp=pd.DataFrame(X_temp,columns=['Lat','Lon'])
        data_temp['cluster']=Y
        data_temp['Day_of_week']=day
        data_temp['Hour']=hour

        coords_centers = [(cluster_centers[i][0],cluster_centers[i][1]) for i in np.argsort(Y_centers)]
        fill_distance_temps2(data_temp,min_speed=8)

        data_final=data_final.append(data_temp)

------------ Day : 0-----------------


100%|██████████| 24/24 [00:13<00:00,  1.72it/s]


------------ Day : 1-----------------


100%|██████████| 24/24 [00:20<00:00,  1.15it/s]


------------ Day : 2-----------------


100%|██████████| 24/24 [00:24<00:00,  1.02s/it]


------------ Day : 3-----------------


100%|██████████| 24/24 [00:19<00:00,  1.24it/s]


------------ Day : 4-----------------


100%|██████████| 24/24 [00:20<00:00,  1.15it/s]


------------ Day : 5-----------------


100%|██████████| 24/24 [00:18<00:00,  1.32it/s]


------------ Day : 6-----------------


100%|██████████| 24/24 [00:12<00:00,  1.97it/s]


In [524]:
days=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
data_final['Day_name'] = data_final['Day_of_week'].apply(lambda x:days[x])
data_final['DH']=data_final['Day_name']+', '+data_final['Hour'].astype("string")+' h'
data_final=data_final.sort_values('cluster')

In [None]:
fig = px.scatter_mapbox(data_final,
                        #hover_name="Base", hover_data=[], 
                        lat="Lat", lon="Lon", 
                        color="cluster",
                        #size='', size_max=15,
                        animation_frame = 'DH', #animation_group = "Base",
                        zoom=10, mapbox_style="carto-positron",width=800, height=800)
fig.update_layout(title='Uber Hotspots in NYC')
fig.show()

In [526]:
print('--train-- Score max : under 7 min with 21km/h ')
print(data_final.loc[(data_final['time_to_point_min']<=7)].shape[0]/data_final.shape[0]*100,'%')

print('--train-- Score min : under 7 min with 8km/h, full jam ')
print(data_final.loc[(data_final['time_to_point_max']<=7) & (data_final['cluster'].isin(['0','1','2','3']))].shape[0]/data_final.loc[data_final['cluster'].isin(['0','1','2','3'])].shape[0]*100,'%')

--train-- Score max : under 7 min with 21km/h 
89.66559283357707 %
--train-- Score min : under 7 min with 8km/h, full jam 
41.36784303060359 %


In [362]:
dataset.shape

(564516, 6)

In [419]:
dataset2=pd.read_csv('uber-trip-data/uber-raw-data-aug14.csv')

#Preprocessing
dataset2['Date/Time']=pd.to_datetime(dataset2['Date/Time'])
dataset2['Day']=dataset2['Date/Time'].dt.day
dataset2['Day_of_week']=dataset2['Date/Time'].dt.dayofweek
dataset2['Hour']=dataset2['Date/Time'].dt.hour

In [420]:
dataset2.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Day,Day_of_week,Hour
0,2014-08-01 00:03:00,40.7366,-73.9906,B02512,1,4,0
1,2014-08-01 00:09:00,40.726,-73.9918,B02512,1,4,0
2,2014-08-01 00:12:00,40.7209,-74.0507,B02512,1,4,0
3,2014-08-01 00:12:00,40.7387,-73.9856,B02512,1,4,0
4,2014-08-01 00:12:00,40.7323,-74.0077,B02512,1,4,0


In [421]:
dataset2.shape

(829275, 7)

In [422]:
#Predictions for August with models trained on april-dataset

data_final=pd.DataFrame(columns=['Lat','Lon','cluster','Day_of_week','Hour','distances','time_to_point_min','time_to_point_max'])

for day in range(7):
    print('------------ Day : ' + str(day) + '-----------------')
    for hour in tqdm(range(24)):
        index='d'+str(day)+'h'+str(hour+1)

        data_temp=dataset.loc[(dataset2['Day_of_week']==day)&(dataset['Hour']==hour),['Lat', 'Lon']]
        X=data_temp.values
        

        #Normalisation
        sc = StandardScaler()
        X = sc.fit_transform(X)

        #Fit a model
        kmeans = data_models.loc[index,'model']

        #Find the Cluster centers
        cluster_centers = sc.inverse_transform(kmeans.cluster_centers_)
        
        #Complete our dataset with the centers to show them in the map
        X_temp=sc.inverse_transform(X)
        X_temp=np.concatenate((X_temp,cluster_centers))
        

        #Make the prediction
        Y=kmeans.predict(X)

        #Order by hot spots : [0 : Hotspot , ---- ,n_clusters-1 : Not hotspot ]
        order=data_models.loc[index, 'order']  

        Y=np.array([order[i] for i in Y])


        #Give the centers diffrent names so we can find them in map
        Y_centers=np.array(['center'+str(i) for i in order])
        Y=np.concatenate((Y,Y_centers))

        # Create the dataframe to plot
        data_temp=pd.DataFrame(X_temp,columns=['Lat','Lon'])
        data_temp['cluster']=Y
        data_temp['Day_of_week']=day
        data_temp['Hour']=hour

        coords_centers = [(cluster_centers[i][0],cluster_centers[i][1]) for i in np.argsort(Y_centers)]
        fill_distance_temps2(data_temp,min_speed=8)

        data_final=data_final.append(data_temp)

------------ Day : 0-----------------


100%|██████████| 24/24 [00:18<00:00,  1.28it/s]


------------ Day : 1-----------------


100%|██████████| 24/24 [00:21<00:00,  1.13it/s]


------------ Day : 2-----------------


100%|██████████| 24/24 [00:22<00:00,  1.07it/s]


------------ Day : 3-----------------


100%|██████████| 24/24 [00:21<00:00,  1.10it/s]


------------ Day : 4-----------------


100%|██████████| 24/24 [00:26<00:00,  1.09s/it]


------------ Day : 5-----------------


100%|██████████| 24/24 [00:23<00:00,  1.02it/s]


------------ Day : 6-----------------


100%|██████████| 24/24 [00:19<00:00,  1.20it/s]


In [423]:
print('--test-- Score max : under 7 min with 21km/h ')
print(data_final.loc[(data_final['time_to_point_min']<=7)].shape[0]/data_final.shape[0]*100,'%')

print('--test-- Score min : under 7 min with 8km/h, full jam ')
print(data_final.loc[(data_final['time_to_point_max']<=7) & (data_final['cluster'].isin(['0','1','2','3']))].shape[0]/data_final.loc[data_final['cluster'].isin(['0','1','2','3'])].shape[0]*100,'%')

--test-- Score max : under 7 min with 21km/h 
86.98772156638337 %
--test-- Score min : under 7 min with 8km/h, full jam 
36.18878734142682 %


Still a very good score 

# Using DBSCAN

In [424]:
data_temp=dataset.loc[(dataset['Day_of_week']==1)&(dataset['Hour']==12),['Lat', 'Lon']]

In [425]:
sc = StandardScaler()
X=data_temp.values
print('Before Normalization : ')
print(X[:5,])

# Apply StandardScaler to X
X = sc.fit_transform(X)
print('After Normalization : ')
print(X[:5,])


Before Normalization : 
[[ 40.7852 -74.022 ]
 [ 40.7852 -74.022 ]
 [ 40.6878 -74.1817]
 [ 40.6864 -73.9747]
 [ 40.8289 -73.9451]]
After Normalization : 
[[ 1.17668532 -0.82457114]
 [ 1.17668532 -0.82457114]
 [-1.4841723  -3.72708441]
 [-1.52241871  0.03509623]
 [ 2.3705198   0.57306988]]


In [474]:
# Instanciate DBSCAN with manhattan distance 
db = DBSCAN(eps=0.2, min_samples=10, metric="manhattan", algorithm="brute") #we're using the manhattan metric because no car fly directly to destination

# Fit on data 
## No need to normalize data, it already is! 
db.fit(X)

DBSCAN(algorithm='brute', eps=0.2, metric='manhattan', min_samples=10)

In [475]:
db.labels_.shape

(2819,)

In [476]:
data_temp.shape

(2819, 3)

In [514]:
#Make the prediction
Y=db.labels_
print(Y.shape)

Y_temp=np.array([i for i in Y if i!=-1])

#Order by hot spots : [0 : Hotspot , ---- ,n_clusters-1 : Not hotspot ]
(unique, counts) = np.unique(Y_temp, return_counts=True)
order=np.argsort(counts)[::-1]
order=[np.where(order==i)[0][0] for i in range(len(unique))]
order.append(len(unique))

Y=np.array([order[i] for i in Y])
data_temp['cluster']=[str(i) for i in Y]

(2819,)


In [521]:
data_temp=data_temp.sort_values('cluster', ascending=False)

In [None]:
fig = px.scatter_mapbox(data_temp,
                        #hover_name="Base", hover_data=[], 
                        lat="Lat", lon="Lon", 
                        color="cluster",
                        #size='', size_max=15,
                        #animation_frame = "Hour", #animation_group = "Base",
                        zoom=10, mapbox_style="carto-positron",width=800, height=800)
fig.update_layout(title='toto')
fig.show()

DbScan is not an approriate model :
- It doesn't give us precise zones which we can move around under 7min
- We can't predict like kmeans the zone of each location, which means that we can't advise to a driver the nerest center to be in a hot zone 