In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.cluster import KMeans, DBSCAN
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [3]:
zones = pd.read_csv('uber-trip-data/taxi-zone-lookup.csv')
zones.head()

Unnamed: 0,LocationID,Borough,Zone
0,1,EWR,Newark Airport
1,2,Queens,Jamaica Bay
2,3,Bronx,Allerton/Pelham Gardens
3,4,Manhattan,Alphabet City
4,5,Staten Island,Arden Heights


In [5]:
df = pd.read_csv('uber-trip-data/uber-raw-data-apr14.csv')
df.tail()

Unnamed: 0,Date/Time,Lat,Lon,Base
564511,4/30/2014 23:22:00,40.764,-73.9744,B02764
564512,4/30/2014 23:26:00,40.7629,-73.9672,B02764
564513,4/30/2014 23:31:00,40.7443,-73.9889,B02764
564514,4/30/2014 23:32:00,40.6756,-73.9405,B02764
564515,4/30/2014 23:48:00,40.688,-73.9608,B02764


In [6]:
df['Dayofweek'] =  pd.DatetimeIndex(df['Date/Time']).dayofweek

In [7]:
df['hour'] =  pd.DatetimeIndex(df['Date/Time']).hour

In [None]:
data=df.sample(20000)

In [8]:
data = df[[(df['Dayofweek'][i] == 2) & (df['hour'][i] == 20)for i in range(len(df))]]
data.shape

#avril.apply(lambda i :  for i in avril['Date/Time'] if (avril['Date/Time'][i].split('/')[1] == '25'))

(7783, 6)

In [9]:
fig = px.scatter_mapbox(
        data, 
        lat="Lat", 
        lon="Lon",
        color="Base",
        mapbox_style="carto-positron"
)

fig.show()

In [10]:
X = data.iloc[:,[1,2,4,5]]
#X.values
X.shape

(7783, 4)

In [11]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

# Création du pipeline pour les variables catégorielles
#categorical_features = [2] # Positions des colonnes catégorielles dans X
#categorical_transformer = Pipeline(
 #   steps=[
  #  ('encoder', OneHotEncoder(drop='first')) # on encode les catégories sous forme de colonnes comportant des 0 et des 1
   # ])

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
        #('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(X.head())
X = preprocessor.fit_transform(X) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
          Lat      Lon  Dayofweek  hour
2134  40.7327 -74.0024          2    20
2135  40.7559 -73.9748          2    20
2136  40.7403 -73.9962          2    20
2137  40.7739 -73.8716          2    20
2138  40.7413 -73.9952          2    20
...Terminé.
[[-0.29277797 -0.50866335]
 [ 0.42296107  0.15230571]
 [-0.05831174 -0.3601848 ]
 [ 0.97827584  2.62375525]
 [-0.02746092 -0.33623664]]



In [12]:
'''
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

sc = StandardScaler()
HE = OneHotEncoder()

X.iloc[:,0:2] = sc.fit_transform(X.iloc[:,0:2])
X.loc['Base'].values.reshape(-1,1) = HE.fit(X['Base']).toarray()
'''

"\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import OneHotEncoder\n\nsc = StandardScaler()\nHE = OneHotEncoder()\n\nX.iloc[:,0:2] = sc.fit_transform(X.iloc[:,0:2])\nX.loc['Base'].values.reshape(-1,1) = HE.fit(X['Base']).toarray()\n"

In [13]:
wcss =  []
k = []

for i in range (8,16): 
    kmeans = KMeans(n_clusters= i, random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    k.append(i)
    print("WCSS for K={} --> {}".format(i, wcss[-1]))
    

WCSS for K=8 --> 2616.2749688033473
WCSS for K=9 --> 2258.9227382591307
WCSS for K=10 --> 2026.1512531443293
WCSS for K=11 --> 1811.5986709817134
WCSS for K=12 --> 1650.8010826434956
WCSS for K=13 --> 1516.365128054259
WCSS for K=14 --> 1448.3746356878025
WCSS for K=15 --> 1314.6991081826134


In [14]:
import plotly.express as px

# Create DataFrame
wcss_frame = pd.DataFrame(wcss)
k_frame = pd.Series(k)

# Create figure
fig= px.line(
    wcss_frame,
    x=k_frame,
    y=wcss_frame.iloc[:,-1]
)

# Create title and axis labels
fig.update_layout(
    yaxis_title="Inertia",
    xaxis_title="# Clusters",
    title="Inertia per cluster"
)

In [15]:
from sklearn.metrics import silhouette_score

# Computer mean silhouette score
sil = []
k = []

## Careful, you need to start at i=2 as silhouette score cannot accept less than 2 labels 
for i in range (8,16): 
    kmeans = KMeans(n_clusters= i, random_state = 0)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.predict(X)))
    k.append(i)
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Silhouette score for K=8 is 0.5160675037855812
Silhouette score for K=9 is 0.4312980678620336
Silhouette score for K=10 is 0.45106241200402153
Silhouette score for K=11 is 0.4558100651343312
Silhouette score for K=12 is 0.4560913921533013
Silhouette score for K=13 is 0.4465442939482821
Silhouette score for K=14 is 0.45642466077390337
Silhouette score for K=15 is 0.43530010664284186


In [16]:
cluster_scores=pd.DataFrame(sil)
k_frame = pd.Series(k)

# Create figure
fig = px.bar(data_frame=cluster_scores,  
             x=k, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

In [17]:
kmeans = KMeans(n_clusters=13, random_state=0, init='k-means++')

kmeans.fit(X)

kmeans.cluster_centers_

array([[ -0.03775953,  -0.3474695 ],
       [  2.43693975,   0.81386897],
       [ -2.83463546,   4.72152728],
       [  0.7692248 ,   2.8029245 ],
       [ -0.63104815,  -5.07994342],
       [ -0.92977583,   0.75101967],
       [  9.21011618,   8.98024857],
       [ -0.69819029,  -0.52212   ],
       [ -2.17518172,  -0.02176642],
       [  1.01203178,   0.43426529],
       [  0.49693327,   0.03642106],
       [  4.46090796,   3.02525844],
       [ -8.38032062, -10.16216441]])

In [18]:
kmeans.labels_

array([ 0, 10,  0, ...,  8, 10,  0])

In [19]:
data['cluster_kmeans'] = kmeans.labels_
data.groupby('cluster_kmeans').count().sort_values('hour',ascending=False).head(12)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Date/Time,Lat,Lon,Base,Dayofweek,hour
cluster_kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,2606,2606,2606,2606,2606,2606
0,1980,1980,1980,1980,1980,1980
7,1699,1699,1699,1699,1699,1699
9,479,479,479,479,479,479
8,294,294,294,294,294,294
5,237,237,237,237,237,237
3,168,168,168,168,168,168
1,133,133,133,133,133,133
2,111,111,111,111,111,111
4,34,34,34,34,34,34


In [20]:
fig = px.scatter_mapbox(
        data, 
        lat="Lat", 
        lon="Lon",
        color='cluster_kmeans',
        mapbox_style="carto-positron"
)

fig.show()

In [21]:
db = DBSCAN(eps=0.1, min_samples=100, metric="haversine", algorithm="brute")

# Fit on data 
## No need to normalize data, it already is! 
db.fit(X)

# Visualize with plotly 

## Import go to build our own figure
import plotly.graph_objects as go

fig = go.Figure()

# Loop through each label for our cluster
for i in np.unique(db.labels_):
    label = X[db.labels_ == i]
    fig.add_trace(go.Scatter(x=label[:, 0], y=label[:, 1], mode="markers", name="Cluster {}".format(i)))
    
fig.show()

In [22]:
data['cluster_db'] = db.labels_
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Date/Time,Lat,Lon,Base,Dayofweek,hour,cluster_kmeans,cluster_db
2134,4/2/2014 20:00:00,40.7327,-74.0024,B02512,2,20,0,0
2135,4/2/2014 20:01:00,40.7559,-73.9748,B02512,2,20,10,0
2136,4/2/2014 20:02:00,40.7403,-73.9962,B02512,2,20,0,0
2137,4/2/2014 20:02:00,40.7739,-73.8716,B02512,2,20,3,-1
2138,4/2/2014 20:02:00,40.7413,-73.9952,B02512,2,20,0,0
...,...,...,...,...,...,...,...,...
564439,4/30/2014 20:55:00,40.7552,-73.9756,B02764,2,20,10,0
564440,4/30/2014 20:55:00,40.7199,-74.0080,B02764,2,20,7,0
564441,4/30/2014 20:58:00,40.6699,-73.9591,B02764,2,20,8,-1
564442,4/30/2014 20:58:00,40.7518,-73.9867,B02764,2,20,10,0


In [23]:
data['cluster_db'].value_counts()

 0    5960
-1    1823
Name: cluster_db, dtype: int64

In [24]:
fig = px.scatter_mapbox(
        data, 
        lat="Lat", 
        lon="Lon",
        color="cluster_db",
        mapbox_style="carto-positron"
)

fig.show()