# 13: Nextbike part 1

*Note: The Nextbike Notebooks are identical to the Tier notebooks*

In this notebook, we will prepare the Nextbike dataset. This includes
- checking for duplicates
- checking for anomalies in rides e.g. same start and end coordinates, checking same start and end times
- calcualte trip durations
- calculate trip distances
- create "vehicle type" colum

In [1]:
# import libraries
import pandas as pd
import numpy as np
import sklearn
from datetime import datetime
import os
import geopandas as gpd
import matplotlib.pyplot as plt
import folium

In [2]:
data_month_single = '7'
data_month_double = '07'

## load data

In [3]:
# load tier trip data
nextbike = pd.DataFrame(pd.read_csv("/Volumes/T7/Master/Raw Data/nextbike_trips_bonn.csv"))
nextbike

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_geometry,nextbike_trips_end_geometry,nextbike_trips_start_at_station,nextbike_trips_end_at_station
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,POINT (7.041189193725586 50.74813461303711),POINT (7.041189 50.748135),False,False
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,POINT (7.041189 50.748135),POINT (7.052102 50.74607),False,False
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,POINT (7.052102 50.74607),POINT (7.070557 50.73197),False,False
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,POINT (7.070557 50.73197),POINT (7.070447 50.731934),False,False
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,POINT (7.070447 50.731934),POINT (7.062779 50.72267),False,False
...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,POINT (7.092047 50.726757),POINT (7.091891 50.726734),False,False
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,POINT (7.091891 50.726734),POINT (7.0717 50.718464),False,False
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,POINT (7.0717 50.718464),POINT (7.071739 50.71841),False,False
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,POINT (7.071739 50.71841),POINT (7.07183 50.718456),False,False


In [4]:
# check data types
nextbike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2911125 entries, 0 to 2911124
Data columns (total 7 columns):
 #   Column                           Dtype 
---  ------                           ----- 
 0   nextbike_trips_id                int64 
 1   nextbike_trips_start_time        object
 2   nextbike_trips_end_time          object
 3   nextbike_trips_start_geometry    object
 4   nextbike_trips_end_geometry      object
 5   nextbike_trips_start_at_station  bool  
 6   nextbike_trips_end_at_station    bool  
dtypes: bool(2), int64(1), object(4)
memory usage: 116.6+ MB


In [5]:
gdf = gpd.GeoDataFrame(nextbike)

In [6]:
# get lat an long values from string
def extract_coordinates(point_str):
    point_str = point_str.replace("POINT (", "").replace(")", "")  # Remove "POINT (" and ")"
    x_start, y_start = point_str.split()  # Split the string by whitespace
    return float(x_start), float(y_start)

In [7]:
# add x and y columns to gdf
gdf[['x_start', 'y_start']] = gdf['nextbike_trips_start_geometry'].apply(lambda point_str: pd.Series(extract_coordinates(point_str)))


In [8]:
# set geometry
gdf = gpd.GeoDataFrame(gdf, geometry=gpd.points_from_xy(gdf.y_start, gdf.x_start), crs="EPSG:4326")

In [9]:
# drop columns
gdf.drop(columns=['nextbike_trips_start_geometry', 'x_start', 'y_start'], inplace=True)

In [10]:
# rename
gdf = gdf.rename(columns={'geometry': 'start_location'})

In [11]:
gdf[['x_end', 'y_end']] = gdf['nextbike_trips_end_geometry'].apply(lambda point_str: pd.Series(extract_coordinates(point_str)))

In [12]:
gdf = gpd.GeoDataFrame(gdf, geometry=gpd.points_from_xy(gdf.y_end, gdf.x_end), crs="EPSG:4326")

In [13]:
gdf = gdf.rename(columns={'geometry': 'end_location'})

In [14]:
gdf.drop(columns=['x_end', 'y_end'], inplace=True)

In [15]:
gdf.drop(columns=['nextbike_trips_end_geometry'], inplace=True)

In [16]:
gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,False,False,POINT (50.74813 7.04119),POINT (50.74813 7.04119)
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210)
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056)
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,False,False,POINT (50.73197 7.07056),POINT (50.73193 7.07045)
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278)
...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,False,False,POINT (50.72676 7.09205),POINT (50.72673 7.09189)
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,False,False,POINT (50.72673 7.09189),POINT (50.71846 7.07170)
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,False,False,POINT (50.71846 7.07170),POINT (50.71841 7.07174)
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,False,False,POINT (50.71841 7.07174),POINT (50.71846 7.07183)


In [17]:
# convert to datetime
gdf['nextbike_trips_start_time'] = pd.to_datetime(gdf['nextbike_trips_start_time'])
gdf['nextbike_trips_end_time'] = pd.to_datetime(gdf['nextbike_trips_end_time'])

In [18]:
#check if all month are included in dataset
months = gdf['nextbike_trips_start_time'].dt.month.unique()
print(months)

[ 2  3 12  1 10 11  4  5  6  7  8  9]


In [19]:
# set new column with vehicle time - important if bikes are added later on 
gdf['vehicle type'] = 'bike'

In [20]:
# sort dataframe by start times
gdf_sorted= gdf.sort_values('nextbike_trips_start_time')
gdf_sorted

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type
215401,21395455,2023-01-01 00:01:00,2023-01-01 00:06:00,False,False,POINT (50.72434 7.06619),POINT (50.72424 7.06615),bike
215360,21395444,2023-01-01 00:01:00,2023-01-01 00:06:00,False,False,POINT (50.70086 7.15791),POINT (50.70104 7.15789),bike
215358,21395442,2023-01-01 00:01:00,2023-01-01 00:06:00,False,False,POINT (50.74684 7.10197),POINT (50.74703 7.10190),bike
215357,21395441,2023-01-01 00:01:00,2023-01-01 00:16:00,False,False,POINT (50.73756 7.08446),POINT (50.74553 7.06709),bike
215356,21395440,2023-01-01 00:01:00,2023-01-01 00:26:00,False,False,POINT (50.75950 7.12540),POINT (50.74019 7.11529),bike
...,...,...,...,...,...,...,...,...
27176,59258232,2024-03-31 23:51:00,2024-03-31 23:56:00,False,False,POINT (50.73521 7.06793),POINT (50.73502 7.06800),bike
123380,59325319,2024-03-31 23:51:00,2024-03-31 23:56:00,False,False,POINT (50.73911 7.10428),POINT (50.73919 7.10423),bike
125342,59381105,2024-03-31 23:51:00,2024-03-31 23:56:00,False,False,POINT (50.73769 7.10262),POINT (50.73728 7.10271),bike
165451,59401146,2024-03-31 23:51:00,2024-03-31 23:56:00,False,False,POINT (50.67583 7.17318),POINT (50.67571 7.17319),bike


In [21]:
# Check for duplicates in the 'tier' DataFrame
duplicates_exist = gdf.duplicated().any()

if duplicates_exist:
    print("Duplicates exist in the DataFrame")
else:
    print("No duplicates found in the DataFrame")

No duplicates found in the DataFrame


In [22]:
# check if trips are starting and ending at the same location
gdf_same_location = gdf[gdf['start_location'] == gdf['end_location']]
gdf_same_location

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type


In [23]:
# only keep entries where start_location and end_location are not identical
gdf = gdf[gdf['start_location'] != gdf['end_location']]
gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,False,False,POINT (50.74813 7.04119),POINT (50.74813 7.04119),bike
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210),bike
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056),bike
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,False,False,POINT (50.73197 7.07056),POINT (50.73193 7.07045),bike
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278),bike
...,...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,False,False,POINT (50.72676 7.09205),POINT (50.72673 7.09189),bike
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,False,False,POINT (50.72673 7.09189),POINT (50.71846 7.07170),bike
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,False,False,POINT (50.71846 7.07170),POINT (50.71841 7.07174),bike
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,False,False,POINT (50.71841 7.07174),POINT (50.71846 7.07183),bike


In [24]:
#check if there are trips with the same start and end time
gdf_same_time = gdf[gdf['nextbike_trips_start_time'] == gdf['nextbike_trips_end_time']]
gdf_same_time

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type
167744,59345787,2024-02-28 19:56:00,2024-02-28 19:56:00,False,True,POINT (50.72538 7.08078),POINT (50.72702 7.08124),bike
901995,27069920,2023-02-27 15:51:00,2023-02-27 15:51:00,True,False,POINT (50.75523 7.07653),POINT (50.75384 7.07697),bike
1120182,32362052,2023-05-17 06:46:00,2023-05-17 06:46:00,False,False,POINT (50.75367 7.16374),POINT (50.75371 7.16378),bike
1541049,38089964,2023-06-05 16:46:00,2023-06-05 16:46:00,True,False,POINT (50.75885 7.04705),POINT (50.75831 7.04636),bike
2208034,50814528,2023-10-05 18:06:00,2023-10-05 18:06:00,True,False,POINT (50.73952 7.11777),POINT (50.73901 7.11853),bike
2631046,55335009,2023-12-31 22:21:00,2023-12-31 22:21:00,False,False,POINT (50.70346 7.15924),POINT (50.70343 7.15929),bike
2696183,55403394,2023-12-31 23:01:00,2023-12-31 23:01:00,False,True,POINT (50.74912 7.04625),POINT (50.74976 7.04615),bike
2889766,53245048,2023-11-27 13:06:00,2023-11-27 13:06:00,False,True,POINT (50.72966 7.10910),POINT (50.73009 7.10828),bike


In [27]:
# only keep entries where start and end tieme are not identical
gdf = gdf[gdf['nextbike_trips_start_time'] != gdf['nextbike_trips_end_time']]
gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,False,False,POINT (50.74813 7.04119),POINT (50.74813 7.04119),bike
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210),bike
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056),bike
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,False,False,POINT (50.73197 7.07056),POINT (50.73193 7.07045),bike
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278),bike
...,...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,False,False,POINT (50.72676 7.09205),POINT (50.72673 7.09189),bike
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,False,False,POINT (50.72673 7.09189),POINT (50.71846 7.07170),bike
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,False,False,POINT (50.71846 7.07170),POINT (50.71841 7.07174),bike
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,False,False,POINT (50.71841 7.07174),POINT (50.71846 7.07183),bike


In [28]:
# calculate trip duration
gdf['trip_duration'] = gdf['nextbike_trips_end_time'] - gdf['nextbike_trips_start_time']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [29]:
gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type,trip_duration
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,False,False,POINT (50.74813 7.04119),POINT (50.74813 7.04119),bike,0 days 00:06:00
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210),bike,0 days 00:10:00
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056),bike,0 days 00:20:00
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,False,False,POINT (50.73197 7.07056),POINT (50.73193 7.07045),bike,0 days 00:05:00
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278),bike,0 days 00:30:00
...,...,...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,False,False,POINT (50.72676 7.09205),POINT (50.72673 7.09189),bike,0 days 00:05:00
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,False,False,POINT (50.72673 7.09189),POINT (50.71846 7.07170),bike,0 days 00:15:00
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,False,False,POINT (50.71846 7.07170),POINT (50.71841 7.07174),bike,0 days 00:05:00
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,False,False,POINT (50.71841 7.07174),POINT (50.71846 7.07183),bike,0 days 00:05:00


In [30]:
#look at trip durations
trip_durations = gdf['nextbike_trips_end_time'] - gdf['nextbike_trips_start_time']
unique_durations = trip_durations.unique()
print(unique_durations)

<TimedeltaArray>
[ '0 days 00:06:00',  '0 days 00:10:00',  '0 days 00:20:00',
  '0 days 00:05:00',  '0 days 00:30:00',  '0 days 14:45:00',
  '0 days 00:15:00',  '7 days 11:40:00',  '0 days 19:25:00',
  '0 days 00:16:00',
 ...
  '1 days 04:27:00', '69 days 22:45:00',  '4 days 06:15:00',
 '16 days 00:35:00',  '5 days 20:37:00',  '6 days 22:00:00',
 '14 days 08:45:00', '18 days 23:57:00', '12 days 07:00:00',
 '27 days 16:30:00']
Length: 4398, dtype: timedelta64[ns]


We can see, that the shortest trips are 5 minutes long, while the longest trips are over 20 days long

In [31]:
# only keep trips with rental duration of less than 2 hours
gdf = gdf[gdf['trip_duration'] <= pd.Timedelta(hours=2)]


In [32]:
# only keep rentals longer than 1 minute
gdf = gdf[gdf['trip_duration'] >= pd.Timedelta(minutes=1)]

In [33]:
gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type,trip_duration
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,False,False,POINT (50.74813 7.04119),POINT (50.74813 7.04119),bike,0 days 00:06:00
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210),bike,0 days 00:10:00
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056),bike,0 days 00:20:00
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,False,False,POINT (50.73197 7.07056),POINT (50.73193 7.07045),bike,0 days 00:05:00
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278),bike,0 days 00:30:00
...,...,...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,False,False,POINT (50.72676 7.09205),POINT (50.72673 7.09189),bike,0 days 00:05:00
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,False,False,POINT (50.72673 7.09189),POINT (50.71846 7.07170),bike,0 days 00:15:00
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,False,False,POINT (50.71846 7.07170),POINT (50.71841 7.07174),bike,0 days 00:05:00
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,False,False,POINT (50.71841 7.07174),POINT (50.71846 7.07183),bike,0 days 00:05:00


In [34]:
#calculate driven distance
# code from ChatGPT
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Haversine formula
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = math.sin(dlat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = 6371 * c  # Earth's radius in kilometers
    return distance

# Calculate driven distance for each trip
gdf['trip_distance_kilometers'] = gdf.apply(lambda row: haversine_distance(row['start_location'].y, row['start_location'].x, row['end_location'].y, row['end_location'].x), axis=1)

gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers
0,59231056,2024-02-29 23:56:00,2024-03-01 00:02:00,False,False,POINT (50.74813 7.04119),POINT (50.74813 7.04119),bike,0 days 00:06:00,0.000048
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210),bike,0 days 00:10:00,1.234682
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056),bike,0 days 00:20:00,2.575291
3,59231059,2024-03-01 11:11:00,2024-03-01 11:16:00,False,False,POINT (50.73197 7.07056),POINT (50.73193 7.07045),bike,0 days 00:05:00,0.012860
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278),bike,0 days 00:30:00,1.331190
...,...,...,...,...,...,...,...,...,...,...
2911120,55593866,2024-02-02 09:21:00,2024-02-02 09:26:00,False,False,POINT (50.72676 7.09205),POINT (50.72673 7.09189),bike,0 days 00:05:00,0.017531
2911121,55593867,2024-02-02 10:56:00,2024-02-02 11:11:00,False,False,POINT (50.72673 7.09189),POINT (50.71846 7.07170),bike,0 days 00:15:00,2.423513
2911122,55593868,2024-02-02 11:11:00,2024-02-02 11:16:00,False,False,POINT (50.71846 7.07170),POINT (50.71841 7.07174),bike,0 days 00:05:00,0.007370
2911123,55593869,2024-02-02 14:16:00,2024-02-02 14:21:00,False,False,POINT (50.71841 7.07174),POINT (50.71846 7.07183),bike,0 days 00:05:00,0.011321


In [35]:
# Calculate driven distance in meters
gdf['trip_distance_meters'] = gdf['trip_distance_kilometers']*1000

In [36]:
# delete all trips with a distance of more than 15 km according to Reck et al. (2021)
gdf = gdf[gdf['trip_distance_meters'] <= 15000]

In [37]:
# only keep trips with distance of more than 100 meters
gdf = gdf[gdf['trip_distance_meters'] >= 100]
gdf

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
1,59231057,2024-03-01 07:11:00,2024-03-01 07:21:00,False,False,POINT (50.74813 7.04119),POINT (50.74607 7.05210),bike,0 days 00:10:00,1.234682,1234.682442
2,59231058,2024-03-01 10:51:00,2024-03-01 11:11:00,False,False,POINT (50.74607 7.05210),POINT (50.73197 7.07056),bike,0 days 00:20:00,2.575291,2575.291224
4,59231060,2024-03-01 11:46:00,2024-03-01 12:16:00,False,False,POINT (50.73193 7.07045),POINT (50.72267 7.06278),bike,0 days 00:30:00,1.331190,1331.189668
9,59231065,2024-03-01 17:31:00,2024-03-01 18:01:00,False,False,POINT (50.72264 7.06276),POINT (50.73168 7.09631),bike,0 days 00:30:00,3.861848,3861.848449
11,59231067,2024-03-01 18:21:00,2024-03-01 18:41:00,False,False,POINT (50.73174 7.09640),POINT (50.72510 7.06968),bike,0 days 00:20:00,3.059976,3059.976315
...,...,...,...,...,...,...,...,...,...,...,...
2911112,55593858,2024-02-02 06:36:00,2024-02-02 06:51:00,False,False,POINT (50.71994 7.09390),POINT (50.72931 7.09763),bike,0 days 00:15:00,1.114556,1114.556463
2911114,55593860,2024-02-02 07:41:00,2024-02-02 08:21:00,False,False,POINT (50.72932 7.09759),POINT (50.71917 7.10460),bike,0 days 00:40:00,1.364884,1364.884013
2911115,55593861,2024-02-02 08:36:00,2024-02-02 08:46:00,False,False,POINT (50.71917 7.10460),POINT (50.72667 7.08653),bike,0 days 00:10:00,2.173216,2173.215782
2911117,55593863,2024-02-02 08:51:00,2024-02-02 09:01:00,False,False,POINT (50.72660 7.08649),POINT (50.72661 7.09170),bike,0 days 00:10:00,0.579550,579.550321


In [38]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 940538 entries, 1 to 2911121
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype          
---  ------                           --------------   -----          
 0   nextbike_trips_id                940538 non-null  int64          
 1   nextbike_trips_start_time        940538 non-null  datetime64[ns] 
 2   nextbike_trips_end_time          940538 non-null  datetime64[ns] 
 3   nextbike_trips_start_at_station  940538 non-null  bool           
 4   nextbike_trips_end_at_station    940538 non-null  bool           
 5   start_location                   940538 non-null  geometry       
 6   end_location                     940538 non-null  geometry       
 7   vehicle type                     940538 non-null  object         
 8   trip_duration                    940538 non-null  timedelta64[ns]
 9   trip_distance_kilometers         940538 non-null  float64        
 10  trip_distance_meters        

In [39]:
gdf.isnull().sum()

nextbike_trips_id                  0
nextbike_trips_start_time          0
nextbike_trips_end_time            0
nextbike_trips_start_at_station    0
nextbike_trips_end_at_station      0
start_location                     0
end_location                       0
vehicle type                       0
trip_duration                      0
trip_distance_kilometers           0
trip_distance_meters               0
dtype: int64

In [40]:

# Only 2023 data
gdf_2023 = gdf[gdf['nextbike_trips_start_time'] <= '2023-12-31']
gdf_2023 = gdf_2023[gdf_2023['nextbike_trips_end_time'] <= '2023-12-31']
gdf_2023

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
215356,21395440,2023-01-01 00:01:00,2023-01-01 00:26:00,False,False,POINT (50.75950 7.12540),POINT (50.74019 7.11529),bike,0 days 00:25:00,2.408602,2408.601747
215357,21395441,2023-01-01 00:01:00,2023-01-01 00:16:00,False,False,POINT (50.73756 7.08446),POINT (50.74553 7.06709),bike,0 days 00:15:00,2.122154,2122.154442
215371,21413775,2023-01-04 22:56:00,2023-01-04 23:11:00,False,False,POINT (50.70325 7.15425),POINT (50.69928 7.16349),bike,0 days 00:15:00,1.117285,1117.284795
215373,21413777,2023-01-04 23:01:00,2023-01-04 23:21:00,False,False,POINT (50.72477 7.06972),POINT (50.71345 7.11151),bike,0 days 00:20:00,4.812307,4812.306771
215374,21413778,2023-01-04 23:01:00,2023-01-04 23:11:00,False,True,POINT (50.73168 7.10039),POINT (50.72495 7.09908),bike,0 days 00:10:00,0.756691,756.691454
...,...,...,...,...,...,...,...,...,...,...,...
2899324,53247874,2023-12-15 15:16:00,2023-12-15 15:26:00,False,True,POINT (50.69743 7.12931),POINT (50.71059 7.12687),bike,0 days 00:10:00,1.476760,1476.759686
2899327,53247875,2023-12-15 15:31:00,2023-12-15 15:36:00,True,False,POINT (50.71059 7.12687),POINT (50.70707 7.12412),bike,0 days 00:05:00,0.494814,494.813946
2899336,53247878,2023-12-15 17:41:00,2023-12-15 17:51:00,False,True,POINT (50.70710 7.12413),POINT (50.71059 7.12687),bike,0 days 00:10:00,0.491459,491.458771
2899339,53247879,2023-12-15 18:16:00,2023-12-15 18:31:00,True,False,POINT (50.71059 7.12687),POINT (50.69735 7.12717),bike,0 days 00:15:00,1.461322,1461.321822


In [41]:
gdf_2023.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 771657 entries, 215356 to 2899345
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype          
---  ------                           --------------   -----          
 0   nextbike_trips_id                771657 non-null  int64          
 1   nextbike_trips_start_time        771657 non-null  datetime64[ns] 
 2   nextbike_trips_end_time          771657 non-null  datetime64[ns] 
 3   nextbike_trips_start_at_station  771657 non-null  bool           
 4   nextbike_trips_end_at_station    771657 non-null  bool           
 5   start_location                   771657 non-null  geometry       
 6   end_location                     771657 non-null  geometry       
 7   vehicle type                     771657 non-null  object         
 8   trip_duration                    771657 non-null  timedelta64[ns]
 9   trip_distance_kilometers         771657 non-null  float64        
 10  trip_distance_meters   

In [42]:
gdf_2023.to_parquet('/Volumes/T7/Master/Processed Data/'+data_month_double+'/14_nextbike_part1.parquet') 

In [43]:
gdf_2023

Unnamed: 0,nextbike_trips_id,nextbike_trips_start_time,nextbike_trips_end_time,nextbike_trips_start_at_station,nextbike_trips_end_at_station,start_location,end_location,vehicle type,trip_duration,trip_distance_kilometers,trip_distance_meters
215356,21395440,2023-01-01 00:01:00,2023-01-01 00:26:00,False,False,POINT (50.75950 7.12540),POINT (50.74019 7.11529),bike,0 days 00:25:00,2.408602,2408.601747
215357,21395441,2023-01-01 00:01:00,2023-01-01 00:16:00,False,False,POINT (50.73756 7.08446),POINT (50.74553 7.06709),bike,0 days 00:15:00,2.122154,2122.154442
215371,21413775,2023-01-04 22:56:00,2023-01-04 23:11:00,False,False,POINT (50.70325 7.15425),POINT (50.69928 7.16349),bike,0 days 00:15:00,1.117285,1117.284795
215373,21413777,2023-01-04 23:01:00,2023-01-04 23:21:00,False,False,POINT (50.72477 7.06972),POINT (50.71345 7.11151),bike,0 days 00:20:00,4.812307,4812.306771
215374,21413778,2023-01-04 23:01:00,2023-01-04 23:11:00,False,True,POINT (50.73168 7.10039),POINT (50.72495 7.09908),bike,0 days 00:10:00,0.756691,756.691454
...,...,...,...,...,...,...,...,...,...,...,...
2899324,53247874,2023-12-15 15:16:00,2023-12-15 15:26:00,False,True,POINT (50.69743 7.12931),POINT (50.71059 7.12687),bike,0 days 00:10:00,1.476760,1476.759686
2899327,53247875,2023-12-15 15:31:00,2023-12-15 15:36:00,True,False,POINT (50.71059 7.12687),POINT (50.70707 7.12412),bike,0 days 00:05:00,0.494814,494.813946
2899336,53247878,2023-12-15 17:41:00,2023-12-15 17:51:00,False,True,POINT (50.70710 7.12413),POINT (50.71059 7.12687),bike,0 days 00:10:00,0.491459,491.458771
2899339,53247879,2023-12-15 18:16:00,2023-12-15 18:31:00,True,False,POINT (50.71059 7.12687),POINT (50.69735 7.12717),bike,0 days 00:15:00,1.461322,1461.321822
