In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import geopandas as gpd
import contextily as cx
import matplotlib.pyplot as plt
from matplotlib import colormaps

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



/kaggle/input/tfl-cycle-hire-data/JourneyDataExtract27Dec2017-05Jan2021.csv


# Creating data frame & assessing data

The data comes from the TfL Cycle Hire scheme. I loaded it into a pandas DataFrame and specified some data types up front to make later processing easier.

In [2]:
# Location of the TfL journey extract, relative to the notebook's working directory.
csv_path = "../input/tfl-cycle-hire-data/JourneyDataExtract27Dec2017-05Jan2021.csv"

# Duration and both station IDs are read as pandas' nullable Int64 so that
# rows with missing values can still be held in integer columns.
bike_df = pd.read_csv(csv_path, dtype={'Duration': 'Int64','StartStation Id': 'Int64','EndStation Id': 'Int64'})
bike_df

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name
0,73667188,720,7315,06/03/2018 07:18,282,"Royal London Hospital, Whitechapel",06/03/2018 07:06,30,"Windsor Terrace, Hoxton"
1,73693268,540,3500,06/03/2018 23:14,95,"Aldersgate Street, Barbican",06/03/2018 23:05,751,"Newton Street, Covent Garden"
2,73659546,840,5646,05/03/2018 18:14,488,"Reardon Street, Wapping",05/03/2018 18:00,3,"Christopher Street, Liverpool Street"
3,73627763,1560,6139,03/03/2018 20:32,60,"Lisson Grove, St. John's Wood",03/03/2018 20:06,203,"West Smithfield Rotunda, Farringdon"
4,73638919,480,12516,04/03/2018 20:04,107,"Finsbury Leisure Centre, St. Luke's",04/03/2018 19:56,203,"West Smithfield Rotunda, Farringdon"
...,...,...,...,...,...,...,...,...,...
31396214,73752956,1260,6687,09/03/2018 10:55,13,"Scala Street, Fitzrovia",09/03/2018 10:34,326,"Graham Street, Angel"
31396215,73839092,300,10403,13/03/2018 16:56,30,"Windsor Terrace, Hoxton",13/03/2018 16:51,253,"Shoreditch Park, Hoxton"
31396216,73845410,780,2357,13/03/2018 18:55,183,"Riverlight North, Nine Elms",13/03/2018 18:42,196,"Union Street, The Borough"
31396217,73833185,240,2649,13/03/2018 11:56,64,"William IV Street, Strand",13/03/2018 11:52,228,"St. James's Square, St. James's"


Checking the datatypes

In [3]:
# Display the dtype of each column in the loaded frame.
bike_df.dtypes

Rental Id             int64
Duration              Int64
Bike Id               int64
End Date             object
EndStation Id         Int64
EndStation Name      object
Start Date           object
StartStation Id       Int64
StartStation Name    object
dtype: object

Here I renamed the columns to remove spaces and make the naming consistent, and used pandas to parse the hire start and end times as datetimes.

In [4]:
# Map the raw headers to space-free, consistently styled names.
# 'Bike model' is not among the columns shown for this extract; rename()
# skips mapping keys that are absent.
column_names = {
    'Rental Id': 'RentalID',
    'Bike Id': 'BikeID',
    'Bike model': 'BikeModel',
    'Start Date': 'StartDateTime',
    'StartStation Id': 'StartStationID',
    'StartStation Name': 'StartStation',
    'End Date': 'EndDateTime',
    'EndStation Id': 'EndStationID',
    'EndStation Name': 'EndStation',
    'Duration': 'Duration_Seconds',
}
bike_df = bike_df.rename(columns=column_names)

# Parse both timestamp columns from day/month/year hour:minute strings.
for date_col in ('StartDateTime', 'EndDateTime'):
    bike_df[date_col] = pd.to_datetime(bike_df[date_col], format='%d/%m/%Y %H:%M')
bike_df.head()

Unnamed: 0,RentalID,Duration_Seconds,BikeID,EndDateTime,EndStationID,EndStation,StartDateTime,StartStationID,StartStation
0,73667188,720,7315,2018-03-06 07:18:00,282,"Royal London Hospital, Whitechapel",2018-03-06 07:06:00,30,"Windsor Terrace, Hoxton"
1,73693268,540,3500,2018-03-06 23:14:00,95,"Aldersgate Street, Barbican",2018-03-06 23:05:00,751,"Newton Street, Covent Garden"
2,73659546,840,5646,2018-03-05 18:14:00,488,"Reardon Street, Wapping",2018-03-05 18:00:00,3,"Christopher Street, Liverpool Street"
3,73627763,1560,6139,2018-03-03 20:32:00,60,"Lisson Grove, St. John's Wood",2018-03-03 20:06:00,203,"West Smithfield Rotunda, Farringdon"
4,73638919,480,12516,2018-03-04 20:04:00,107,"Finsbury Leisure Centre, St. Luke's",2018-03-04 19:56:00,203,"West Smithfield Rotunda, Farringdon"


I sorted the rows by RentalID, on the assumption that RentalIDs are assigned in order from oldest to newest.

In [5]:
# Column layout: identifiers first, then start details, end details, duration.
column_order = [
    'RentalID',
    'BikeID',
    'StartDateTime',
    'StartStationID',
    'StartStation',
    'EndDateTime',
    'EndStationID',
    'EndStation',
    'Duration_Seconds',
]

# Select the columns in that order, then sort the rows by rental ID.
bike_df = bike_df[column_order].sort_values(by='RentalID')
bike_df.head(50)

Unnamed: 0,RentalID,BikeID,StartDateTime,StartStationID,StartStation,EndDateTime,EndStationID,EndStation,Duration_Seconds
9542246,57571323,1004,2016-08-24 00:00:00,210,"Hinde Street, Marylebone",2016-08-24 00:19:00,261,"Princes Square, Bayswater",1140
9542248,57571324,8925,2016-08-24 00:00:00,477,"Spindrift Avenue, Millwall",2016-08-24 00:21:00,554,"Aberfeldy Street, Poplar",1260
9542247,57571325,3100,2016-08-24 00:00:00,188,"Nutford Place, Marylebone",2016-08-24 00:18:00,265,"Southwick Street, Paddington",1080
9542250,57571326,11247,2016-08-24 00:00:00,273,"Belvedere Road, South Bank",2016-08-24 00:18:00,246,"Berry Street, Clerkenwell",1080
9542251,57571327,256,2016-08-24 00:00:00,578,"Hollybush Gardens, Bethnal Green",2016-08-24 00:42:00,146,"Vauxhall Bridge , Pimlico",2520
9542249,57571328,11493,2016-08-24 00:00:00,799,"Kings Gate House, Westminster",2016-08-24 00:30:00,515,"Russell Gardens, Olympia",1800
9542255,57571329,5103,2016-08-24 00:01:00,447,"Jubilee Crescent, Cubitt Town",2016-08-24 00:44:00,586,"Mudchute DLR, Cubitt Town",2580
9542254,57571330,2151,2016-08-24 00:01:00,104,"Crosswall, Tower",2016-08-24 00:16:00,125,"Borough High Street, The Borough",900
9542253,57571331,3407,2016-08-24 00:01:00,266,"Queen's Gate (North), Kensington",2016-08-24 00:14:00,599,"Manbre Road, Hammersmith",780
9542256,57571332,14145,2016-08-24 00:01:00,578,"Hollybush Gardens, Bethnal Green",2016-08-24 00:59:00,710,"Albert Bridge Road, Battersea Park",3480


# Missing Data
Before going further, I assessed how much data is missing for each column to see if the data could be considered reliable.

In [6]:
# Per-column tally of missing values, and the same tally as a percentage of
# all rows (rounded to two decimal places).
null_count = bike_df.isna().sum()
null_percentage = null_count.div(len(bike_df)).mul(100).round(2)
print(
    'Total null count:\n',
    null_count,
    '\n\n\nPercentage of values missing:\n',
    null_percentage,
)

Total null count:
 RentalID               0
BikeID                 0
StartDateTime          0
StartStationID         0
StartStation           0
EndDateTime         3557
EndStationID        3585
EndStation          3585
Duration_Seconds    3557
dtype: int64 


Percentage of values missing:
 RentalID            0.00
BikeID              0.00
StartDateTime       0.00
StartStationID      0.00
StartStation        0.00
EndDateTime         0.01
EndStationID        0.01
EndStation          0.01
Duration_Seconds    0.01
dtype: float64


The acceptable rate of missing values is 5%, so a missing rate of about 0.01% is acceptable for this dataset. I decided to drop these records:

In [7]:
# Discard every row that has a missing value in at least one column.
bike_df = bike_df.dropna(axis='index', how='any')
bike_df.head(20)

Unnamed: 0,RentalID,BikeID,StartDateTime,StartStationID,StartStation,EndDateTime,EndStationID,EndStation,Duration_Seconds
9542246,57571323,1004,2016-08-24 00:00:00,210,"Hinde Street, Marylebone",2016-08-24 00:19:00,261,"Princes Square, Bayswater",1140
9542248,57571324,8925,2016-08-24 00:00:00,477,"Spindrift Avenue, Millwall",2016-08-24 00:21:00,554,"Aberfeldy Street, Poplar",1260
9542247,57571325,3100,2016-08-24 00:00:00,188,"Nutford Place, Marylebone",2016-08-24 00:18:00,265,"Southwick Street, Paddington",1080
9542250,57571326,11247,2016-08-24 00:00:00,273,"Belvedere Road, South Bank",2016-08-24 00:18:00,246,"Berry Street, Clerkenwell",1080
9542251,57571327,256,2016-08-24 00:00:00,578,"Hollybush Gardens, Bethnal Green",2016-08-24 00:42:00,146,"Vauxhall Bridge , Pimlico",2520
9542249,57571328,11493,2016-08-24 00:00:00,799,"Kings Gate House, Westminster",2016-08-24 00:30:00,515,"Russell Gardens, Olympia",1800
9542255,57571329,5103,2016-08-24 00:01:00,447,"Jubilee Crescent, Cubitt Town",2016-08-24 00:44:00,586,"Mudchute DLR, Cubitt Town",2580
9542254,57571330,2151,2016-08-24 00:01:00,104,"Crosswall, Tower",2016-08-24 00:16:00,125,"Borough High Street, The Borough",900
9542253,57571331,3407,2016-08-24 00:01:00,266,"Queen's Gate (North), Kensington",2016-08-24 00:14:00,599,"Manbre Road, Hammersmith",780
9542256,57571332,14145,2016-08-24 00:01:00,578,"Hollybush Gardens, Bethnal Green",2016-08-24 00:59:00,710,"Albert Bridge Road, Battersea Park",3480


Showing the final null count:

In [8]:
# Recount the missing values in each column and print the tally.
null_count = bike_df.isna().sum()
print('Final null count:\n', null_count)

Final null count:
 RentalID            0
BikeID              0
StartDateTime       0
StartStationID      0
StartStation        0
EndDateTime         0
EndStationID        0
EndStation          0
Duration_Seconds    0
dtype: int64


# Duplicates
I wanted to see if any rental IDs are duplicates:

In [9]:
# Count rows whose RentalID repeats that of an earlier row. Comparing whole
# rows (drop_duplicates() with no subset) would miss two records that share a
# RentalID but differ in any other column.
num_duplicates = int(bike_df.duplicated(subset='RentalID').sum())
num_duplicates

0

Lastly, I set the RentalID to be the index for the dataframe.

In [10]:
# Make RentalID the row index and keep the rows ordered by it.
bike_df = bike_df.set_index('RentalID').sort_index()
bike_df.head(20)

Unnamed: 0_level_0,BikeID,StartDateTime,StartStationID,StartStation,EndDateTime,EndStationID,EndStation,Duration_Seconds
RentalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
57571323,1004,2016-08-24 00:00:00,210,"Hinde Street, Marylebone",2016-08-24 00:19:00,261,"Princes Square, Bayswater",1140
57571324,8925,2016-08-24 00:00:00,477,"Spindrift Avenue, Millwall",2016-08-24 00:21:00,554,"Aberfeldy Street, Poplar",1260
57571325,3100,2016-08-24 00:00:00,188,"Nutford Place, Marylebone",2016-08-24 00:18:00,265,"Southwick Street, Paddington",1080
57571326,11247,2016-08-24 00:00:00,273,"Belvedere Road, South Bank",2016-08-24 00:18:00,246,"Berry Street, Clerkenwell",1080
57571327,256,2016-08-24 00:00:00,578,"Hollybush Gardens, Bethnal Green",2016-08-24 00:42:00,146,"Vauxhall Bridge , Pimlico",2520
57571328,11493,2016-08-24 00:00:00,799,"Kings Gate House, Westminster",2016-08-24 00:30:00,515,"Russell Gardens, Olympia",1800
57571329,5103,2016-08-24 00:01:00,447,"Jubilee Crescent, Cubitt Town",2016-08-24 00:44:00,586,"Mudchute DLR, Cubitt Town",2580
57571330,2151,2016-08-24 00:01:00,104,"Crosswall, Tower",2016-08-24 00:16:00,125,"Borough High Street, The Borough",900
57571331,3407,2016-08-24 00:01:00,266,"Queen's Gate (North), Kensington",2016-08-24 00:14:00,599,"Manbre Road, Hammersmith",780
57571332,14145,2016-08-24 00:01:00,578,"Hollybush Gardens, Bethnal Green",2016-08-24 00:59:00,710,"Albert Bridge Road, Battersea Park",3480


# Output

I wrote the cleaned data to a new CSV file so that I can use it in other notebooks.

In [11]:
# Write the frame to a CSV file in /kaggle/working; to_csv includes the index
# as the first column by default.
bike_df.to_csv('/kaggle/working/bike_df.csv')