In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans

### EDA and Preprocessing with Pandas

In [2]:
# Importing data from S3 bucket
url = 'https://uber-28-02-2023.s3.eu-west-3.amazonaws.com/uber-raw-data-apr14.csv'
df = pd.read_csv(url)

display(df.head())

print("Number of rows in the DataFrame: ", len(df))

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


Number of rows in the DataFrame:  564516


- 'Date/Time' column contains the time of a client was picked up by a Uber driver
- 'Lat' and 'Lon' are the geographical coordinates of the place where the client was picked up
- 'Base': TLC base company code affiliated with the Uber pickup. In fact, unlike in other cities, a driver needs to have a licence from the TLC (Taxi and Limousin Company) to drive in New York. I will consider that this is purely administrative information and will not take this column into account for further analysis.

In [3]:
# Checking if there are missing values in the dataframe
100*df.isnull().sum()/df.shape[0]

Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [4]:
# Checking data types in the dataframe
df.dtypes

Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object

The 'Date/Time' column contains data in object (string) format. We will convert this column to datetime data type in order to extract useful features from this columns.

In [5]:
df["Date/Time"] = pd.to_datetime(df["Date/Time"], format='%m/%d/%Y %H:%M:%S')

print(df.dtypes)

df.head()

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object


Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-04-01 00:11:00,40.769,-73.9549,B02512
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512


It looks like the time of a pickup is precise up to a minute. 
Let us check whether there are only one or more Uber pickups for a given timestamp.

In [6]:
print("Top 10 timestamps with maximum pickups: ")
display(df['Date/Time'].value_counts().head(10))

print()
print("Ten timestamps with the minimum pickups: ")
display(df['Date/Time'].value_counts().tail(10))

Top 10 timestamps with maximum pickups: 


2014-04-07 20:21:00    97
2014-04-07 20:22:00    87
2014-04-30 17:45:00    78
2014-04-30 18:43:00    70
2014-04-30 19:00:00    70
2014-04-30 16:55:00    67
2014-04-30 20:03:00    65
2014-04-30 19:02:00    65
2014-04-30 17:10:00    64
2014-04-07 20:23:00    62
Name: Date/Time, dtype: int64


Ten timestamps with the minimum pickups: 


2014-04-10 00:00:00    1
2014-04-29 05:18:00    1
2014-04-10 00:51:00    1
2014-04-10 01:19:00    1
2014-04-10 01:21:00    1
2014-04-29 04:08:00    1
2014-04-27 09:26:00    1
2014-04-29 03:27:00    1
2014-04-29 03:16:00    1
2014-04-30 02:55:00    1
Name: Date/Time, dtype: int64

At a first sight, it looks like the maximum number of pickups take place in the evening from about 17h00 to 20h30, and the least pickups take place late at night from midnight to around 5h in the morning.

Intuitively, it seems that we might be able to see patterns in pickup points depending on the day of the week and on the hour of the day. Let us use time series to see if the number of pickups is cyclical in nature. If it is, it might be useful for further analysis of the data.

In [7]:
pickup_time_df = df.groupby(by = "Date/Time").size().to_frame().reset_index()
pickup_time_df.rename(columns = {0:'nb_pickups'}, inplace = True)
pickup_time_df.head()

Unnamed: 0,Date/Time,nb_pickups
0,2014-04-01 00:00:00,3
1,2014-04-01 00:01:00,1
2,2014-04-01 00:02:00,3
3,2014-04-01 00:03:00,2
4,2014-04-01 00:04:00,1


In [8]:
fig = px.line(x=pickup_time_df["Date/Time"], y = pickup_time_df["nb_pickups"])
fig.update_layout(
    title="Evolution of number of Uber pickups over a month",
    xaxis_title="Time",
    yaxis_title="Nb pickups"
    )
fig.show()

30 cycles are clearly seen for the month of April, one cycle per day. If we zoom in to look at what happens during a single day, we can see the morning peak at approximately 8h to 10h in the morning and the evening peak that lasts from about 17h to 22h. If we zoom in to look at the evening peak, we can see that it is in its turn consists of two peaks separated by a trough at around 20h-20h30.

For performance reasons, we will take a sample of 10000 rows from the dataframe for further analysis.

In [9]:
df = df.sample(n=10000)
df = df.reset_index(drop=True)

Let us visualize what these pickup points look like on a map.

In [10]:
fig = px.scatter_mapbox(
        df, 
        lat="Lat", 
        lon="Lon",
        mapbox_style="open-street-map"
)

fig.show()

As we have displayed all 10 000 points of the dataframe at once, they are very close to each other and it's difficult to see any pattern. For visualisation purposes, it might be more interesting not to visualize all pickup points known for the whole month, but to look at the daily or even hourly slots.

### Preprocessing with Pandas and feature engineering

In [11]:
# Dropping the 'Base' column as explained before
df = df.drop('Base', axis=1)

# Sorting dataframe in chronological order
df = df.sort_values("Date/Time").reset_index(drop=True)

# Extracting features from timestamps for future use
df["Day_of_Month"] = pd.DatetimeIndex(df["Date/Time"]).day
df["Week_Number"] = df["Date/Time"].dt.isocalendar().week
df['Day_of_Week'] = pd.to_datetime(df["Date/Time"]).dt.day_name()
df['Hour'] = df["Date/Time"].dt.hour
#df['Minute'] = df["Date/Time"].dt.minute


In [12]:
# Because of presumed daily and weekly cycles in activity, let us create a column 
# that will contain combined information on the day of the week and the hour
df["Day_of_Week_and_Hour"] = df["Day_of_Week"] + "_" + df["Hour"].astype(str)+"_hrs"

In [13]:
"""day_of_week_df = df.groupby(by = "Day_of_Week").size().to_frame().reset_index()
day_of_week_df"""

'day_of_week_df = df.groupby(by = "Day_of_Week").size().to_frame().reset_index()\nday_of_week_df'

In [14]:
hourly_df = df.groupby(by = "Day_of_Week_and_Hour").size().to_frame().reset_index()
hourly_df.rename(columns = {0:'nb_pickups'}, inplace = True)
hourly_df.head(5)

Unnamed: 0,Day_of_Week_and_Hour,nb_pickups
0,Friday_0_hrs,34
1,Friday_10_hrs,46
2,Friday_11_hrs,51
3,Friday_12_hrs,56
4,Friday_13_hrs,54


In [15]:
hourly_df = hourly_df.sort_values(by=['nb_pickups'], ascending=False)
hourly_df.reset_index(inplace=True, drop=True)

print("Ten hours in a week with maximum pickups: ")
display(hourly_df.head(10))

print()
print("Ten hours in a week with the fewest pickups: ")
display(hourly_df.tail(10))

Ten hours in a week with maximum pickups: 


Unnamed: 0,Day_of_Week_and_Hour,nb_pickups
0,Wednesday_17_hrs,188
1,Wednesday_16_hrs,178
2,Wednesday_18_hrs,166
3,Wednesday_20_hrs,141
4,Tuesday_16_hrs,138
5,Wednesday_19_hrs,131
6,Tuesday_17_hrs,131
7,Friday_19_hrs,130
8,Thursday_20_hrs,130
9,Friday_22_hrs,125



Ten hours in a week with the fewest pickups: 


Unnamed: 0,Day_of_Week_and_Hour,nb_pickups
158,Monday_3_hrs,9
159,Thursday_1_hrs,7
160,Wednesday_2_hrs,7
161,Thursday_3_hrs,7
162,Wednesday_1_hrs,6
163,Monday_1_hrs,5
164,Monday_2_hrs,4
165,Thursday_2_hrs,4
166,Tuesday_2_hrs,3
167,Tuesday_1_hrs,3


We can see that late weekday nights have the fewest pickups, and weekday evening rush hours have the most pickups.

In [16]:
#df["Year"] = pd.DatetimeIndex(df["Date/Time"]).year
#df["Month"] = pd.DatetimeIndex(df["Date/Time"]).month

In [17]:
# Lets us look at an animated map where each frame shows pickup points in a given hour of a day
fig = px.scatter_mapbox(df, 
                        lat="Lat", lon="Lon", animation_frame='Hour', zoom=10,
                        mapbox_style="open-street-map")
fig.show()

In [18]:
# Lets us look at an animated map where each frame shows pickup points in a given hour of a day
fig = px.scatter_mapbox(df, 
                        lat="Lat", lon="Lon", animation_frame='Day_of_Week_and_Hour', zoom=10,
                        mapbox_style="open-street-map")
fig.show()

### Preprocessing with Scikit-Learn

In [19]:
#numeric_features = ['Lat', 'Lon', 'Day_of_Month', 'Hour', 'Minute']
numeric_features = ['Lat', 'Lon', 'Day_of_Month']
numeric_transformer = StandardScaler()

categorical_features = ['Day_of_Week', "Day_of_Week_and_Hour", 'Hour']
categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [20]:
X = preprocessor.fit_transform(df)
X[:5]

<5x199 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

### Applying K-Means

In [21]:
# Instanciating KMeans with k=3 and initialisation with k-means++
kmeans = KMeans(n_clusters=3, random_state=0)

# Fitting K_means to our dataset
kmeans.fit(X)

prediction = kmeans.labels_

prediction

array([2, 2, 2, ..., 1, 1, 1])

In [25]:
df["cluster"] = prediction

df.tail()

Unnamed: 0,Date/Time,Lat,Lon,Day_of_Month,Week_Number,Day_of_Week,Hour,Day_of_Week_and_Hour,cluster
9995,2014-04-30 23:36:00,40.6951,-74.1783,30,18,Wednesday,23,Wednesday_23_hrs,1
9996,2014-04-30 23:37:00,40.7603,-73.9885,30,18,Wednesday,23,Wednesday_23_hrs,1
9997,2014-04-30 23:40:00,40.7741,-73.8724,30,18,Wednesday,23,Wednesday_23_hrs,1
9998,2014-04-30 23:51:00,40.7506,-73.9919,30,18,Wednesday,23,Wednesday_23_hrs,1
9999,2014-04-30 23:52:00,40.7282,-73.982,30,18,Wednesday,23,Wednesday_23_hrs,1


In [26]:
fig = px.scatter_mapbox(df, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="open-street-map")
fig.show()

In [27]:
# Lets us look at an animated map where each frame shows pickup points in a given day
fig = px.scatter_mapbox(df, 
                        lat="Lat", lon="Lon", animation_frame='Day_of_Month', color="cluster",
                        mapbox_style="open-street-map")
fig.show()

In [28]:
# Lets us look at an animated map where each frame shows pickup points in a given day
fig = px.scatter_mapbox(df, 
                        lat="Lat", lon="Lon", animation_frame='Day_of_Week', color="cluster",
                        mapbox_style="open-street-map")

fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=10),
)

fig.show()

In [29]:
# Lets us look at an animated map where each frame shows pickup points in a given hour of a day
fig = px.scatter_mapbox(df, 
                        lat="Lat", lon="Lon", animation_frame='Day_of_Week_and_Hour', zoom=10,
                        color="cluster",
                        mapbox_style="open-street-map")
fig.show()