# 11 Tier Part 1

In [6]:
# import libraries
import pandas as pd
import numpy as np
import sklearn
from datetime import datetime
import os
import geopandas as gpd
import matplotlib.pyplot as plt
import folium

In [7]:
data_month_single = '7'
data_month_double = '01'

In [8]:
# load tier trip data
tier_trips = pd.DataFrame(pd.read_csv("../Data/tier_trips_bonn.csv"))
tier_trips

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,tier_trips_start_geometry,tier_trips_end_geometry,tier_trips_start_battery_state,tier_trips_end_battery_state
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (7.046557903289795 50.70532989501953),POINT (7.046558 50.70533),,
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (7.057607 50.737762),POINT (7.057603 50.737762),,
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (7.057603 50.737762),POINT (7.057433 50.737625),,
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (7.057433 50.737625),POINT (7.057462 50.737667),,
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (7.057462 50.737667),POINT (7.057459 50.737667),,
...,...,...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (7.156961 50.75051),POINT (7.156948 50.750496),,
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (7.156948 50.750496),POINT (7.156934 50.750484),,
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (7.156934 50.750484),POINT (7.156935 50.750484),,
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (7.156935 50.750484),POINT (7.156932 50.75048),,


In [9]:
# check data types
tier_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4807636 entries, 0 to 4807635
Data columns (total 7 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   tier_trips_id                   int64  
 1   tier_trips_start_time           object 
 2   tier_trips_end_time             object 
 3   tier_trips_start_geometry       object 
 4   tier_trips_end_geometry         object 
 5   tier_trips_start_battery_state  float64
 6   tier_trips_end_battery_state    float64
dtypes: float64(2), int64(1), object(4)
memory usage: 256.8+ MB


In [10]:
gdf = gpd.GeoDataFrame(tier_trips)

In [11]:
# get lat an long values from string
def extract_coordinates(point_str):
    point_str = point_str.replace("POINT (", "").replace(")", "")  # Remove "POINT (" and ")"
    x_start, y_start = point_str.split()  # Split the string by whitespace
    return float(x_start), float(y_start)

In [12]:
# add x and y columns to gdf
gdf[['x_start', 'y_start']] = gdf['tier_trips_start_geometry'].apply(lambda point_str: pd.Series(extract_coordinates(point_str)))


In [13]:
# set geometry
gdf = gpd.GeoDataFrame(gdf, geometry=gpd.points_from_xy(gdf.y_start, gdf.x_start), crs="EPSG:4326")

In [14]:
# drop columns
gdf.drop(columns=['tier_trips_start_geometry', 'tier_trips_start_battery_state', 'tier_trips_end_battery_state', 'x_start', 'y_start'], inplace=True)

In [15]:
# rename
gdf = gdf.rename(columns={'geometry': 'start_location'})

In [16]:
gdf[['x_end', 'y_end']] = gdf['tier_trips_end_geometry'].apply(lambda point_str: pd.Series(extract_coordinates(point_str)))

In [17]:
gdf = gpd.GeoDataFrame(gdf, geometry=gpd.points_from_xy(gdf.y_end, gdf.x_end), crs="EPSG:4326")

In [18]:
gdf = gdf.rename(columns={'geometry': 'end_location'})

In [19]:
gdf.drop(columns=['x_end', 'y_end'], inplace=True)

In [20]:
gdf.drop(columns=['tier_trips_end_geometry'], inplace=True)

In [21]:
gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (50.70533 7.04656),POINT (50.70533 7.04656)
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (50.73776 7.05761),POINT (50.73776 7.05760)
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (50.73776 7.05760),POINT (50.73763 7.05743)
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (50.73763 7.05743),POINT (50.73767 7.05746)
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (50.73767 7.05746),POINT (50.73767 7.05746)
...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (50.75051 7.15696),POINT (50.75050 7.15695)
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (50.75050 7.15695),POINT (50.75048 7.15693)
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693)
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693)


In [22]:
# convert to datetime
gdf['tier_trips_start_time'] = pd.to_datetime(gdf['tier_trips_start_time'])
gdf['tier_trips_end_time'] = pd.to_datetime(gdf['tier_trips_end_time'])

In [23]:
#check if all month are included in dataset
months = gdf['tier_trips_start_time'].dt.month.unique()
print(months)

[12  1  3  2  9 11  7  6  8  4  5]


In [24]:
# set new column with vehicle time - important if bikes are added later on 
gdf['vehicle type'] = 'e-scooter'

In [25]:
# sort dataframe by start times
gdf_sorted= gdf.sort_values('tier_trips_start_time')
gdf_sorted

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type
879715,497177281,2023-01-01 00:05:00,2023-01-01 00:10:00,POINT (50.70719 7.11625),POINT (50.70720 7.11626),e-scooter
828255,497115854,2023-01-01 00:05:00,2023-01-01 00:10:00,POINT (50.73803 7.10418),POINT (50.73803 7.10419),e-scooter
850172,497142938,2023-01-01 00:05:00,2023-01-01 00:15:00,POINT (50.71849 7.08594),POINT (50.72638 7.09367),e-scooter
908450,497213521,2023-01-01 00:05:00,2023-01-01 00:10:00,POINT (50.71442 7.16382),POINT (50.71444 7.16383),e-scooter
802021,497085737,2023-01-01 00:05:00,2023-01-01 00:10:00,POINT (50.68890 7.16929),POINT (50.68889 7.16920),e-scooter
...,...,...,...,...,...,...
799499,1092441752,2024-03-31 23:50:00,2024-03-31 23:55:00,POINT (50.68398 7.15888),POINT (50.68398 7.15888),e-scooter
779950,1092422203,2024-03-31 23:50:00,2024-03-31 23:55:00,POINT (50.67822 7.16737),POINT (50.67818 7.16738),e-scooter
790007,1092432260,2024-03-31 23:50:00,2024-03-31 23:55:00,POINT (50.73136 7.09753),POINT (50.73128 7.09742),e-scooter
657926,1092395044,2024-03-31 23:50:00,2024-03-31 23:55:00,POINT (50.65916 7.18279),POINT (50.65918 7.18285),e-scooter


In [26]:
# Check for duplicates in the 'tier' DataFrame
duplicates_exist = gdf.duplicated().any()

if duplicates_exist:
    print("Duplicates exist in the DataFrame")
else:
    print("No duplicates found in the DataFrame")

No duplicates found in the DataFrame


In [27]:
# check if trips are starting and ending at the same location
gdf_same_location = gdf[gdf['start_location'] == gdf['end_location']]
gdf_same_location

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type


There are no rows with exact the same start and end location

In [28]:
#check if there are trips with the same start and end time
gdf_same_time = gdf[gdf['tier_trips_start_time'] == gdf['tier_trips_end_time']]
gdf_same_time

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type
10756,1013973993,2023-12-31 21:15:00,2023-12-31 21:15:00,POINT (50.70521 7.11109),POINT (50.70521 7.11109),e-scooter
22755,1013985511,2023-09-03 10:20:00,2023-09-03 10:20:00,POINT (50.73751 7.10380),POINT (50.73750 7.10390),e-scooter
124302,1028488653,2023-08-29 15:05:00,2023-08-29 15:05:00,POINT (50.75104 7.12923),POINT (50.75103 7.12924),e-scooter
133433,1028493228,2024-01-14 12:25:00,2024-01-14 12:25:00,POINT (50.66806 7.18915),POINT (50.66806 7.18915),e-scooter
160818,1043732559,2024-01-29 07:05:00,2024-01-29 07:05:00,POINT (50.75166 7.11982),POINT (50.75166 7.11982),e-scooter
...,...,...,...,...,...,...
4186568,944097047,2023-09-30 22:35:00,2023-09-30 22:35:00,POINT (50.72536 7.13492),POINT (50.72535 7.13492),e-scooter
4595461,981983063,2023-11-29 20:55:00,2023-11-29 20:55:00,POINT (50.66740 7.16187),POINT (50.66740 7.16188),e-scooter
4604202,981991804,2023-11-24 16:50:00,2023-11-24 16:50:00,POINT (50.75415 7.14698),POINT (50.75415 7.14698),e-scooter
4604203,981991805,2023-11-24 16:50:00,2023-11-24 16:50:00,POINT (50.75415 7.14698),POINT (50.75415 7.14698),e-scooter


There are no rows, where the trip start time and end time are identical

In [29]:
# calculate trip duration
gdf['trip_duration'] = gdf['tier_trips_end_time'] - gdf['tier_trips_start_time']

In [30]:
gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (50.70533 7.04656),POINT (50.70533 7.04656),e-scooter,0 days 00:05:00
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (50.73776 7.05761),POINT (50.73776 7.05760),e-scooter,0 days 00:05:00
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (50.73776 7.05760),POINT (50.73763 7.05743),e-scooter,0 days 00:05:00
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (50.73763 7.05743),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (50.73767 7.05746),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00
...,...,...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (50.75051 7.15696),POINT (50.75050 7.15695),e-scooter,0 days 00:05:00
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (50.75050 7.15695),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00


In [31]:
#look at trip durations
trip_durations = gdf['tier_trips_end_time'] - gdf['tier_trips_start_time']
unique_durations = trip_durations.unique()
print(unique_durations)

<TimedeltaArray>
[ '0 days 00:05:00',  '0 days 00:10:00',  '0 days 00:50:00',
  '1 days 08:45:00',  '0 days 00:15:00',  '0 days 01:00:00',
  '0 days 00:30:00',  '0 days 00:25:00',  '0 days 00:20:00',
  '0 days 07:45:00',
 ...
  '6 days 08:30:00', '37 days 13:20:00',  '3 days 22:25:00',
  '5 days 13:40:00', '17 days 22:50:00',  '3 days 17:40:00',
 '28 days 12:55:00',  '6 days 00:25:00', '21 days 14:45:00',
 '10 days 14:30:00']
Length: 5510, dtype: timedelta64[ns]


We can see, that the shortest trips are 5 minutes long, while the longest trips are over 10 days long

In [32]:
# only keep trips with rental duration of less than 2 hours
gdf = gdf[gdf['trip_duration'] <= pd.Timedelta(hours=2)]


In [33]:
# only keep rentals longer than 1 minute
gdf = gdf[gdf['trip_duration'] >= pd.Timedelta(minutes=1)]

In [34]:
gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (50.70533 7.04656),POINT (50.70533 7.04656),e-scooter,0 days 00:05:00
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (50.73776 7.05761),POINT (50.73776 7.05760),e-scooter,0 days 00:05:00
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (50.73776 7.05760),POINT (50.73763 7.05743),e-scooter,0 days 00:05:00
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (50.73763 7.05743),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (50.73767 7.05746),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00
...,...,...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (50.75051 7.15696),POINT (50.75050 7.15695),e-scooter,0 days 00:05:00
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (50.75050 7.15695),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00


In [35]:
#calculate driven distance
# code from ChatGPT
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Haversine formula
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = math.sin(dlat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = 6371 * c  # Earth's radius in kilometers
    return distance

# Calculate driven distance for each trip
gdf['trip_distance_kilometers'] = gdf.apply(lambda row: haversine_distance(row['start_location'].y, row['start_location'].x, row['end_location'].y, row['end_location'].x), axis=1)

gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (50.70533 7.04656),POINT (50.70533 7.04656),e-scooter,0 days 00:05:00,0.000016
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (50.73776 7.05761),POINT (50.73776 7.05760),e-scooter,0 days 00:05:00,0.000445
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (50.73776 7.05760),POINT (50.73763 7.05743),e-scooter,0 days 00:05:00,0.024205
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (50.73763 7.05743),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00,0.005646
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (50.73767 7.05746),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00,0.000334
...,...,...,...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (50.75051 7.15696),POINT (50.75050 7.15695),e-scooter,0 days 00:05:00,0.002116
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (50.75050 7.15695),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.002044
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.000111
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.000553


In [36]:
# Calculate driven distance in meters
gdf['trip_distance_meters'] = gdf['trip_distance_kilometers']*1000

In [37]:
gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (50.70533 7.04656),POINT (50.70533 7.04656),e-scooter,0 days 00:05:00,0.000016,0.015807
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (50.73776 7.05761),POINT (50.73776 7.05760),e-scooter,0 days 00:05:00,0.000445,0.444780
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (50.73776 7.05760),POINT (50.73763 7.05743),e-scooter,0 days 00:05:00,0.024205,24.205188
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (50.73763 7.05743),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00,0.005646,5.646219
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (50.73767 7.05746),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00,0.000334,0.333585
...,...,...,...,...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (50.75051 7.15696),POINT (50.75050 7.15695),e-scooter,0 days 00:05:00,0.002116,2.115504
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (50.75050 7.15695),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.002044,2.043583
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.000111,0.111195
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.000553,0.553206


In [38]:
# delete all trips with a distance of more than 15 km according to Reck et al. (2021)
gdf = gdf[gdf['trip_distance_meters'] <= 15000]

In [39]:
gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
0,1013935111,2023-12-31 23:55:00,2024-01-01 00:00:00,POINT (50.70533 7.04656),POINT (50.70533 7.04656),e-scooter,0 days 00:05:00,0.000016,0.015807
1,1013963589,2024-01-04 04:20:00,2024-01-04 04:25:00,POINT (50.73776 7.05761),POINT (50.73776 7.05760),e-scooter,0 days 00:05:00,0.000445,0.444780
2,1013963590,2024-01-05 19:10:00,2024-01-05 19:15:00,POINT (50.73776 7.05760),POINT (50.73763 7.05743),e-scooter,0 days 00:05:00,0.024205,24.205188
3,1013963591,2024-01-05 19:30:00,2024-01-05 19:35:00,POINT (50.73763 7.05743),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00,0.005646,5.646219
4,1013963592,2024-01-05 19:35:00,2024-01-05 19:40:00,POINT (50.73767 7.05746),POINT (50.73767 7.05746),e-scooter,0 days 00:05:00,0.000334,0.333585
...,...,...,...,...,...,...,...,...,...
4807631,1006547272,2023-12-29 01:55:00,2023-12-29 02:00:00,POINT (50.75051 7.15696),POINT (50.75050 7.15695),e-scooter,0 days 00:05:00,0.002116,2.115504
4807632,1006547273,2023-12-29 02:00:00,2023-12-29 02:05:00,POINT (50.75050 7.15695),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.002044,2.043583
4807633,1006547274,2023-12-29 02:05:00,2023-12-29 02:10:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.000111,0.111195
4807634,1006547275,2023-12-29 02:10:00,2023-12-29 02:15:00,POINT (50.75048 7.15693),POINT (50.75048 7.15693),e-scooter,0 days 00:05:00,0.000553,0.553206


In [40]:
# only keep trips with distance of more than 100 meters
gdf = gdf[gdf['trip_distance_meters'] >= 100]
gdf

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
17,1013963605,2024-01-03 11:15:00,2024-01-03 12:05:00,POINT (50.70353 7.05537),POINT (50.69986 7.05557),e-scooter,0 days 00:50:00,0.405942,405.942039
18,1013963606,2024-01-05 12:35:00,2024-01-05 12:40:00,POINT (50.69986 7.05557),POINT (50.70002 7.05410),e-scooter,0 days 00:05:00,0.164455,164.455390
30,1013963618,2024-01-08 00:10:00,2024-01-08 00:25:00,POINT (50.69900 7.05433),POINT (50.68361 7.07479),e-scooter,0 days 00:15:00,2.838183,2838.183051
35,1013963623,2024-01-08 19:05:00,2024-01-08 19:15:00,POINT (50.68352 7.07513),POINT (50.68842 7.06835),e-scooter,0 days 00:10:00,0.928300,928.300419
43,1013963631,2024-01-09 08:55:00,2024-01-09 09:00:00,POINT (50.68864 7.06831),POINT (50.68497 7.07008),e-scooter,0 days 00:05:00,0.449935,449.935302
...,...,...,...,...,...,...,...,...,...
4807582,1006547223,2023-12-28 15:15:00,2023-12-28 15:20:00,POINT (50.72758 7.14229),POINT (50.72803 7.14323),e-scooter,0 days 00:05:00,0.115253,115.252520
4807597,1006547238,2023-12-28 16:00:00,2023-12-28 16:10:00,POINT (50.74919 7.15428),POINT (50.74152 7.15354),e-scooter,0 days 00:10:00,0.849810,849.809943
4807604,1006547245,2023-12-30 18:35:00,2023-12-30 19:00:00,POINT (50.74147 7.15350),POINT (50.73831 7.11442),e-scooter,0 days 00:25:00,4.358833,4358.832576
4807609,1006547250,2023-12-31 00:10:00,2023-12-31 00:40:00,POINT (50.73841 7.11403),POINT (50.70677 7.11513),e-scooter,0 days 00:30:00,3.492918,3492.918454


In [41]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 427029 entries, 17 to 4807616
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype          
---  ------                    --------------   -----          
 0   tier_trips_id             427029 non-null  int64          
 1   tier_trips_start_time     427029 non-null  datetime64[ns] 
 2   tier_trips_end_time       427029 non-null  datetime64[ns] 
 3   start_location            427029 non-null  geometry       
 4   end_location              427029 non-null  geometry       
 5   vehicle type              427029 non-null  object         
 6   trip_duration             427029 non-null  timedelta64[ns]
 7   trip_distance_kilometers  427029 non-null  float64        
 8   trip_distance_meters      427029 non-null  float64        
dtypes: datetime64[ns](2), float64(2), geometry(2), int64(1), object(1), timedelta64[ns](1)
memory usage: 32.6+ MB


In [42]:
gdf.isnull().sum()

tier_trips_id               0
tier_trips_start_time       0
tier_trips_end_time         0
start_location              0
end_location                0
vehicle type                0
trip_duration               0
trip_distance_kilometers    0
trip_distance_meters        0
dtype: int64

In [43]:

# Only 2023 data
gdf_2023 = gdf[gdf['tier_trips_start_time'] <= '2023-12-31']
gdf_2023 = gdf_2023[gdf_2023['tier_trips_end_time'] <= '2023-12-31']
gdf_2023

Unnamed: 0,tier_trips_id,tier_trips_start_time,tier_trips_end_time,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
801621,497085150,2023-01-02 16:10:00,2023-01-02 16:20:00,POINT (50.72230 7.15637),POINT (50.71792 7.15981),e-scooter,0 days 00:10:00,0.617222,617.221763
801629,497085158,2023-01-02 21:25:00,2023-01-02 21:40:00,POINT (50.71801 7.15996),POINT (50.72203 7.15509),e-scooter,0 days 00:15:00,0.699559,699.558708
801631,497085160,2023-01-02 21:50:00,2023-01-02 22:00:00,POINT (50.72208 7.15508),POINT (50.71844 7.16095),e-scooter,0 days 00:10:00,0.766417,766.417469
801637,497085166,2023-01-03 06:50:00,2023-01-03 07:40:00,POINT (50.71841 7.16090),POINT (50.76291 7.07626),e-scooter,0 days 00:50:00,10.615185,10615.184930
801643,497085172,2023-01-03 14:50:00,2023-01-03 15:15:00,POINT (50.76284 7.07624),POINT (50.73918 7.12050),e-scooter,0 days 00:25:00,5.571567,5571.567347
...,...,...,...,...,...,...,...,...,...
4807552,1006547193,2023-12-29 10:30:00,2023-12-29 11:00:00,POINT (50.71135 7.16819),POINT (50.74229 7.09758),e-scooter,0 days 00:30:00,8.561807,8561.806757
4807558,1006547199,2023-12-29 11:35:00,2023-12-29 12:05:00,POINT (50.74206 7.09716),POINT (50.71347 7.16496),e-scooter,0 days 00:30:00,8.172812,8172.812192
4807582,1006547223,2023-12-28 15:15:00,2023-12-28 15:20:00,POINT (50.72758 7.14229),POINT (50.72803 7.14323),e-scooter,0 days 00:05:00,0.115253,115.252520
4807597,1006547238,2023-12-28 16:00:00,2023-12-28 16:10:00,POINT (50.74919 7.15428),POINT (50.74152 7.15354),e-scooter,0 days 00:10:00,0.849810,849.809943


In [44]:
gdf_2023.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 354421 entries, 801621 to 4807604
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype          
---  ------                    --------------   -----          
 0   tier_trips_id             354421 non-null  int64          
 1   tier_trips_start_time     354421 non-null  datetime64[ns] 
 2   tier_trips_end_time       354421 non-null  datetime64[ns] 
 3   start_location            354421 non-null  geometry       
 4   end_location              354421 non-null  geometry       
 5   vehicle type              354421 non-null  object         
 6   trip_duration             354421 non-null  timedelta64[ns]
 7   trip_distance_kilometers  354421 non-null  float64        
 8   trip_distance_meters      354421 non-null  float64        
dtypes: datetime64[ns](2), float64(2), geometry(2), int64(1), object(1), timedelta64[ns](1)
memory usage: 27.0+ MB


In [45]:
gdf_2023.to_parquet('../Data/11_tier_part1.parquet') 