In [2]:
# NOTE(review): later cells use pd, np, glob and px without importing them, so this
# wildcard import presumably supplies those names as well as Voyager — confirm
# against utils.py.
from utils import *
# Voyager instance; used below through v.copernicus_to_dataframe(...) to load the
# NetCDF weather files.
v = Voyager()

In [3]:
# Load the Chicago taxi trips table.
df = pd.read_csv('./data/chicago_taxi/chicago_taxi_trips.csv')
# Collect the NetCDF weather files. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them to make the order in which the files
# are processed and concatenated reproducible across runs and machines.
cw_files_list = sorted(glob.glob('./data/chicago_taxi/chicago_weather/*.nc'))

In [4]:
# Display the raw trips table exactly as loaded from the CSV.
df

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_miles,fare,tips,tolls,extras,trip_total,payment_type,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,2022-09-30 07:00:00+00:00,2022-09-30 08:00:00+00:00,22.80,61.50,0.00,0.0,0.0,61.50,Cash,41.761578,-87.572782,41.741243,-87.551428
1,2022-10-26 13:45:00+00:00,2022-10-26 13:45:00+00:00,0.00,3.25,0.00,0.0,0.0,3.25,Cash,41.829922,-87.672503,41.829922,-87.672503
2,2022-10-14 21:45:00+00:00,2022-10-14 21:45:00+00:00,0.00,30.00,0.00,0.0,0.0,30.00,Cash,42.009623,-87.670167,42.009623,-87.670167
3,2022-10-16 13:00:00+00:00,2022-10-16 13:15:00+00:00,1.71,9.45,2.16,0.0,0.0,11.61,Mobile,41.986712,-87.663416,41.986712,-87.663416
4,2022-10-12 13:15:00+00:00,2022-10-12 13:15:00+00:00,0.00,3.25,0.00,0.0,0.0,3.25,Cash,41.946511,-87.806020,41.946511,-87.806020
...,...,...,...,...,...,...,...,...,...,...,...,...,...
254454,2022-05-01 19:00:00+00:00,2022-05-01 19:00:00+00:00,1.75,8.75,0.00,0.0,0.0,8.75,Cash,41.898332,-87.620763,41.877406,-87.621972
254455,2022-05-05 10:00:00+00:00,2022-05-05 10:00:00+00:00,1.90,8.75,0.00,0.0,0.0,8.75,Cash,41.898332,-87.620763,41.880994,-87.632746
254456,2022-05-13 11:00:00+00:00,2022-05-13 11:45:00+00:00,18.10,46.50,9.60,0.0,1.0,57.60,Credit Card,41.898332,-87.620763,41.979071,-87.903040
254457,2022-05-08 12:30:00+00:00,2022-05-08 12:45:00+00:00,2.31,9.75,2.00,0.0,1.0,13.25,Credit Card,41.898332,-87.620763,41.880994,-87.632746


# Data engineering: trip time features and weather data


In [3]:
# Derive calendar/time features from each listed timestamp column.
# For a prefix such as 'trip_start' the source column is '<prefix>_timestamp' and
# every derived column is named '<prefix>_<feature>'.
for column in ['trip_start']:
    # source column
    col = column + '_timestamp'
    # parse to datetime and drop the timezone, keeping the wall-clock values
    df[col] = pd.to_datetime(df[col]).dt.tz_localize(None)
    ts = df[col].dt
    # timestamp truncated to the hour (YYYY-MM-DD HH:00:00); floor() avoids the
    # slow format-to-string / re-parse round trip
    df[column + '_datetime'] = ts.floor('h')
    df[column + '_fhour'] = ts.strftime('%H:00')  # hour label as 'HH:00' string
    df[column + '_year'] = ts.year.astype(np.int64)  # year
    df[column + '_month'] = ts.month.astype(np.int64)  # month (1-12)
    df[column + '_day'] = ts.day.astype(np.int64)  # day of month
    df[column + '_time'] = ts.strftime('%H:%M')  # time of day as 'HH:MM' string
    df[column + '_hour'] = ts.hour.astype(np.int64)  # hour (0-23)
    df[column + '_weekday'] = ts.dayofweek.astype(np.int64)  # Monday=0 ... Sunday=6
    # 1 for Saturday (5) / Sunday (6), 0 otherwise; vectorised instead of row-wise apply
    df[column + '_weekend_dummy'] = df[column + '_weekday'].isin([5, 6]).astype(np.int64)

In [4]:
# Build a single Chicago weather dataframe: go through the weather files and, for
# each one, take the median of every variable grouped by time.
# Ideally the grouping would also use a geohash of latitude and longitude, but that
# operation is computationally expensive (even this time-only grouping takes a
# long time).

# weather variables requested from each file and aggregated below
_weather_vars = (
    '10m_u_component_of_wind',
    '10m_v_component_of_wind',
    '2m_temperature',
    'precipitation_type',
    'total_precipitation',
)

dfs = []
for file in cw_files_list:
    # load one file (a fresh list is passed on every call)
    _d = v.copernicus_to_dataframe(file_path=file, variables=list(_weather_vars))
    # one row per time value holding the median of each variable
    _t = _d.groupby('time', as_index=False)[list(_weather_vars)].median()
    dfs.append(_t)

# final Chicago weather dataframe
wdf = pd.concat(dfs)

['u10', 'v10', 't2m', 'ptype', 'tp']
['u10', 'v10', 't2m', 'ptype', 'tp']
['u10', 'v10', 't2m', 'ptype', 'tp']


# Data visualization


In [17]:
# Pickup density: number of trips (non-null start timestamps) recorded at each
# distinct pickup coordinate, busiest locations first.
pu_density = (
    df.groupby(['pickup_latitude', 'pickup_longitude'], as_index=False)['trip_start_timestamp']
    .count()
    .rename(columns={'trip_start_timestamp': 'count'})
    .sort_values('count', ascending=False)
)

# Density heat map of the pickup counts on an OpenStreetMap base layer.
fig = px.density_mapbox(
    pu_density,
    lat='pickup_latitude',
    lon='pickup_longitude',
    z='count',
    radius=15,
    center={'lat': 41.87746956242519, 'lon': -87.62943539259409},
    zoom=9,
    mapbox_style='open-street-map',
)
fig.show()