In [1]:
import requests
import zipfile
import pandas as pd
import io
from sqlalchemy import create_engine

In [2]:
# generate list of file names to download and clean
years = ['2013','2014','2015','2016','2017', '2018','2019', '2020']
months = ['01','02','03','04','05','06','07','08','09','10','11','12']
save_directory = '../../../Data_Sets/citibike/'
db_name = '201306-202101-citibike-tripdata.sqlite'
engine = create_engine(f'sqlite:///{save_directory}{db_name}')
file_names = [f'{year}{month}-citibike-tripdata' for year in years for month in months]
file_names.append('202101-citibike-tripdata')

In [3]:
def extract_clean(save_path, file_list):
    header_format = {
        'tripduration':'trip_duration',
        'bikeid':'ride_id',
        'bike_id':'ride_id',
        'starttime':'started_at',
        'stoptime':'ended_at',
        'start station name':'start_station_name',
        'start station id':'start_station_id',
        'end station name':'end_station_name',
        'end station id':'end_station_id',
        'start station latitude':'start_lat',
        'start station longitude':'start_lng',
        'end station latitude':'end_lat',
        'end station longitude':'end_lng',
        'usertype':'member_casual',
        'birth year':'Birth Year',
        'gender':'Gender',
        'trip duration':'trip_duration',
        'Bike ID':'ride_id',
        'Start Time':'started_at',
        'Stop Time':'ended_at',
        'Start Station Name':'start_station_name',
        'Start Station ID':'start_station_id',
        'End Station Name':'end_station_name',
        'End Station ID':'end_station_id',
        'Start Station Latitude':'start_lat',
        'Start Station Longitude':'start_lng',
        'End Station Latitude':'end_lat',
        'End Station Longitude':'end_lng',
        'User Type':'member_casual',
        'Birth Year':'Birth Year',
        'Trip Duration':'trip_duration'
    }
    for file in file_list:
        url1 = 'https://s3.amazonaws.com/tripdata/' + file + '.zip'
        url2 = 'https://s3.amazonaws.com/tripdata/' + file + '.csv.zip'
        
        #attempt to locate the specified file
        response = requests.get(url1)
        if response.status_code != 200:
            response = requests.get(url2)
            if response.status_code != 200:
                print(f'{file} is unavailable')
                continue

        # rename the column headers, standardize entry formats, and insert the file contents into a sqlite database
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_contents:
            file_list = zip_contents.namelist()
            with zip_contents.open(file_list[0]) as tempfile:
                bike_df = pd.read_csv(tempfile)
                bike_df = bike_df.rename(columns=header_format)
                bike_df['started_at'] = (pd.to_datetime(bike_df['started_at'])
                                         .dt.strftime('%Y-%m-%d %H:%M:%S'))
                bike_df['ended_at'] = (pd.to_datetime(bike_df['ended_at'])
                                       .dt.strftime('%Y-%m-%d %H:%M:%S'))
                bike_df['member_casual'] = bike_df['member_casual'].replace({
                    'Subscriber':'member',
                    'Customer':'casual'
                })
                bike_df.sort_values(by=['started_at'],ignore_index=True)    
                bike_df.to_sql('bikedata', con = engine, if_exists='append',index=False)
            tempfile.close()
        zip_contents.close()
        response.close()
        del url1, url2, response 
        del file_list, tempfile, bike_df, zip_contents
        print(f'Successfully extracted and cleaned {file}')

In [4]:
extract_clean(f'{save_directory}{db_name}', file_names)

201301-citibike-tripdata is unavailable
201302-citibike-tripdata is unavailable
201303-citibike-tripdata is unavailable
201304-citibike-tripdata is unavailable
201305-citibike-tripdata is unavailable
Successfully extracted and cleaned 201306-citibike-tripdata
Successfully extracted and cleaned 201307-citibike-tripdata
Successfully extracted and cleaned 201308-citibike-tripdata
Successfully extracted and cleaned 201309-citibike-tripdata
Successfully extracted and cleaned 201310-citibike-tripdata
Successfully extracted and cleaned 201311-citibike-tripdata
Successfully extracted and cleaned 201312-citibike-tripdata
Successfully extracted and cleaned 201401-citibike-tripdata
Successfully extracted and cleaned 201402-citibike-tripdata
Successfully extracted and cleaned 201403-citibike-tripdata
Successfully extracted and cleaned 201404-citibike-tripdata
Successfully extracted and cleaned 201405-citibike-tripdata
Successfully extracted and cleaned 201406-citibike-tripdata
Successfully extract

# Statistics on Bikes

In [16]:
# the WHERE statement here is used to remove rows where the bike_id is not tracked
bikeinfo_df = pd.read_sql("""
SELECT 
    ride_id AS "Bike ID",
    MIN(started_at) AS "First Ride",
    MAX(started_at) AS "Last Ride",
    SUM(ROUND((JULIANDAY(ended_at) - JULIANDAY(started_at)) * 86400)) AS "Total Ride Time (Seconds)",
    COUNT(started_at) AS "Total Rides"
FROM bikedata
WHERE strftime('%Y-%m', started_at) < '2021-02'
GROUP BY ride_id;
    """,con = engine)

In [17]:
bikeinfo_df['Life Span (Years)'] = pd.to_datetime(bikeinfo_df['Last Ride']).dt.year-pd.to_datetime(bikeinfo_df['First Ride']).dt.year

In [18]:
display(bikeinfo_df)
display(bikeinfo_df.info())

Unnamed: 0,Bike ID,First Ride,Last Ride,Total Ride Time (Seconds),Total Rides,Life Span (Years)
0,14529,2013-06-05 00:09:05,2020-11-09 08:59:36,8781904.0,8753,7
1,14530,2014-01-14 19:18:18,2020-12-11 20:06:11,7225321.0,7272,6
2,14531,2013-06-01 11:24:44,2020-11-09 15:42:37,7559708.0,7830,7
3,14532,2013-06-02 08:57:20,2019-05-26 08:03:36,7052137.0,6810,6
4,14533,2013-06-06 11:09:23,2020-12-01 16:59:28,7939666.0,8695,7
...,...,...,...,...,...,...
33102,50105,2020-12-12 14:15:19,2020-12-16 08:48:07,9040.0,12,0
33103,50106,2020-12-21 07:54:39,2020-12-24 17:17:11,5280.0,7,0
33104,50107,2020-12-18 10:27:38,2020-12-18 10:27:38,668.0,1,0
33105,50123,2021-01-18 12:07:01,2021-01-30 15:39:34,48260.0,61,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33107 entries, 0 to 33106
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Bike ID                    33107 non-null  int64  
 1   First Ride                 33107 non-null  object 
 2   Last Ride                  33107 non-null  object 
 3   Total Ride Time (Seconds)  33107 non-null  float64
 4   Total Rides                33107 non-null  int64  
 5   Life Span (Years)          33107 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 1.5+ MB


None

In [19]:
bikeinfo_df.to_csv(f'{save_directory}bike_lifespan.csv')

# Statistics on Stations

In [5]:
# retrieve statistics on the number of trips starting at each station
start_stationinfo = pd.read_sql("""
SELECT 
    start_station_name AS "Station Name",
    strftime('%Y-%m', started_at) AS Month,
    start_station_id AS "Station ID",
    start_lat AS "Station Latitude",
    start_lng AS "Station Longitude",
    COUNT(start_station_id) AS "Trips From",
    MIN(started_at) AS "First Station Departure",
    MAX(started_at) AS "Last Station Departure",
    COUNT(DISTINCT end_station_name) AS "Stations Visited"
FROM bikedata
GROUP BY Month, "Station Name";
    """,con = engine)

In [6]:
display(start_stationinfo)
display(start_stationinfo.info())

Unnamed: 0,Station Name,Month,Station ID,Station Latitude,Station Longitude,Trips From,First Station Departure,Last Station Departure,Stations Visited
0,1 Ave & E 16 St,2013-06,504.0,40.732219,-73.981656,2331,2013-06-01 00:35:14,2013-06-30 23:37:45,274
1,1 Ave & E 18 St,2013-06,2003.0,40.733812,-73.980544,1690,2013-06-01 09:05:23,2013-06-30 23:52:15,261
2,1 Ave & E 30 St,2013-06,536.0,40.741444,-73.975361,1692,2013-06-01 08:25:12,2013-06-30 22:45:54,248
3,1 Ave & E 44 St,2013-06,455.0,40.750020,-73.969053,1728,2013-06-01 07:57:29,2013-06-30 23:46:16,254
4,10 Ave & W 28 St,2013-06,489.0,40.750664,-74.001768,2094,2013-06-01 00:08:47,2013-06-30 23:20:56,251
...,...,...,...,...,...,...,...,...,...
57130,Woodward Ave & Harman St,2021-01,3888.0,40.707930,-73.910920,128,2021-01-01 18:50:17,2021-01-31 18:13:16,61
57131,Wyckoff Av & Jefferson St,2021-01,3763.0,40.707165,-73.923711,279,2021-01-01 02:41:55,2021-01-31 21:37:14,113
57132,Wyckoff Av & Stanhope St,2021-01,3780.0,40.703545,-73.917775,424,2021-01-01 01:17:43,2021-01-31 20:25:22,140
57133,Wyckoff St & Nevins St,2021-01,3911.0,40.683426,-73.984275,457,2021-01-01 09:32:41,2021-01-31 17:11:54,155


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57135 entries, 0 to 57134
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Station Name             57125 non-null  object 
 1   Month                    57135 non-null  object 
 2   Station ID               57125 non-null  float64
 3   Station Latitude         57135 non-null  float64
 4   Station Longitude        57135 non-null  float64
 5   Trips From               57135 non-null  int64  
 6   First Station Departure  57135 non-null  object 
 7   Last Station Departure   57135 non-null  object 
 8   Stations Visited         57135 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.9+ MB


None

In [65]:
start_stationinfo = start_stationinfo.dropna(axis=0)
start_stationinfo

Unnamed: 0,Station Name,Month,Station ID,Station Latitude,Station Longitude,Trips From,First Station Departure,Last Station Departure,Stations Visited
0,1 Ave & E 16 St,2013-06,504.0,40.732219,-73.981656,2331,2013-06-01 00:35:14,2013-06-30 23:37:45,274
1,1 Ave & E 18 St,2013-06,2003.0,40.733812,-73.980544,1690,2013-06-01 09:05:23,2013-06-30 23:52:15,261
2,1 Ave & E 30 St,2013-06,536.0,40.741444,-73.975361,1692,2013-06-01 08:25:12,2013-06-30 22:45:54,248
3,1 Ave & E 44 St,2013-06,455.0,40.750020,-73.969053,1728,2013-06-01 07:57:29,2013-06-30 23:46:16,254
4,10 Ave & W 28 St,2013-06,489.0,40.750664,-74.001768,2094,2013-06-01 00:08:47,2013-06-30 23:20:56,251
...,...,...,...,...,...,...,...,...,...
57130,Woodward Ave & Harman St,2021-01,3888.0,40.707930,-73.910920,128,2021-01-01 18:50:17,2021-01-31 18:13:16,61
57131,Wyckoff Av & Jefferson St,2021-01,3763.0,40.707165,-73.923711,279,2021-01-01 02:41:55,2021-01-31 21:37:14,113
57132,Wyckoff Av & Stanhope St,2021-01,3780.0,40.703545,-73.917775,424,2021-01-01 01:17:43,2021-01-31 20:25:22,140
57133,Wyckoff St & Nevins St,2021-01,3911.0,40.683426,-73.984275,457,2021-01-01 09:32:41,2021-01-31 17:11:54,155


In [14]:
# retrieve statistics on the number of trips starting at each station
end_stationinfo = pd.read_sql("""
SELECT 
    end_station_name AS "Station Name",
    strftime('%Y-%m', ended_at) AS Month,
    end_station_id AS "Station ID",
    end_lat AS "Station Latitude",
    end_lng AS "Station Longitude",
    COUNT(end_station_id) AS "Trips To",
    MIN(ended_at) AS "First Station Arrival",
    MAX(ended_at) AS "Last Station Arrival",
    COUNT(DISTINCT end_station_name) AS "Stations Departed From"
FROM bikedata
GROUP BY Month, "Station Name";
    """,con = engine)

In [15]:
display(end_stationinfo)
display(end_stationinfo.info())

Unnamed: 0,Station Name,Month,Station ID,Station Latitude,Station Longitude,Trips To,First Station Arrival,Last Station Arrival,Stations Departed From
0,,2013-06,,,,0,2013-06-01 00:20:11,2013-06-30 23:53:10,0
1,1 Ave & E 16 St,2013-06,504.0,40.732219,-73.981656,2328,2013-06-01 00:40:09,2013-06-30 23:55:16,1
2,1 Ave & E 18 St,2013-06,2003.0,40.733812,-73.980544,1636,2013-06-01 11:03:58,2013-06-30 23:17:24,1
3,1 Ave & E 30 St,2013-06,536.0,40.741444,-73.975361,1588,2013-06-01 09:12:14,2013-06-30 21:38:17,1
4,1 Ave & E 44 St,2013-06,455.0,40.750020,-73.969053,1729,2013-06-01 06:59:01,2013-06-30 22:56:48,1
...,...,...,...,...,...,...,...,...,...
57813,W 4 St & 7 Ave S,2021-02,380.0,40.734011,-74.002939,1,2021-02-04 12:01:27,2021-02-04 12:01:27,1
57814,W 95 St & Broadway,2021-02,3314.0,40.793770,-73.971888,1,2021-02-06 20:39:05,2021-02-06 20:39:05,1
57815,Washington Ave & Park Ave,2021-02,313.0,40.696102,-73.967510,1,2021-02-01 16:39:53,2021-02-01 16:39:53,1
57816,Water - Whitehall Plaza,2021-02,534.0,40.702551,-74.012723,1,2021-02-03 14:36:17,2021-02-03 14:36:17,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57818 entries, 0 to 57817
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Station Name            57806 non-null  object 
 1   Month                   57818 non-null  object 
 2   Station ID              57806 non-null  float64
 3   Station Latitude        57816 non-null  float64
 4   Station Longitude       57816 non-null  float64
 5   Trips To                57818 non-null  int64  
 6   First Station Arrival   57818 non-null  object 
 7   Last Station Arrival    57818 non-null  object 
 8   Stations Departed From  57818 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 4.0+ MB


None

In [64]:
end_stationinfo = end_stationinfo.dropna(axis=0)
end_stationinfo

Unnamed: 0,Station Name,Month,Station ID,Station Latitude,Station Longitude,Trips To,First Station Arrival,Last Station Arrival,Stations Departed From
1,1 Ave & E 16 St,2013-06,504.0,40.732219,-73.981656,2328,2013-06-01 00:40:09,2013-06-30 23:55:16,1
2,1 Ave & E 18 St,2013-06,2003.0,40.733812,-73.980544,1636,2013-06-01 11:03:58,2013-06-30 23:17:24,1
3,1 Ave & E 30 St,2013-06,536.0,40.741444,-73.975361,1588,2013-06-01 09:12:14,2013-06-30 21:38:17,1
4,1 Ave & E 44 St,2013-06,455.0,40.750020,-73.969053,1729,2013-06-01 06:59:01,2013-06-30 22:56:48,1
5,10 Ave & W 28 St,2013-06,489.0,40.750664,-74.001768,2046,2013-06-01 07:45:23,2013-06-30 23:28:05,1
...,...,...,...,...,...,...,...,...,...
57813,W 4 St & 7 Ave S,2021-02,380.0,40.734011,-74.002939,1,2021-02-04 12:01:27,2021-02-04 12:01:27,1
57814,W 95 St & Broadway,2021-02,3314.0,40.793770,-73.971888,1,2021-02-06 20:39:05,2021-02-06 20:39:05,1
57815,Washington Ave & Park Ave,2021-02,313.0,40.696102,-73.967510,1,2021-02-01 16:39:53,2021-02-01 16:39:53,1
57816,Water - Whitehall Plaza,2021-02,534.0,40.702551,-74.012723,1,2021-02-03 14:36:17,2021-02-03 14:36:17,1


In [72]:
# Retrieve statistics on number of trips departing from each station
merged_stationinfo = pd.merge(start_stationinfo, end_stationinfo, on=['Station Name','Month','Station ID','Station Latitude','Station Longitude'], how = 'outer')

In [74]:
display(merged_stationinfo)
display(merged_stationinfo.info())
display(merged_stationinfo.isna().any())

Unnamed: 0,Station Name,Month,Station ID,Station Latitude,Station Longitude,Trips From,First Station Departure,Last Station Departure,Stations Visited,Trips To,First Station Arrival,Last Station Arrival,Stations Departed From
0,1 Ave & E 16 St,2013-06,504.0,40.732219,-73.981656,2331.0,2013-06-01 00:35:14,2013-06-30 23:37:45,274.0,2328.0,2013-06-01 00:40:09,2013-06-30 23:55:16,1.0
1,1 Ave & E 18 St,2013-06,2003.0,40.733812,-73.980544,1690.0,2013-06-01 09:05:23,2013-06-30 23:52:15,261.0,1636.0,2013-06-01 11:03:58,2013-06-30 23:17:24,1.0
2,1 Ave & E 30 St,2013-06,536.0,40.741444,-73.975361,1692.0,2013-06-01 08:25:12,2013-06-30 22:45:54,248.0,1588.0,2013-06-01 09:12:14,2013-06-30 21:38:17,1.0
3,1 Ave & E 44 St,2013-06,455.0,40.750020,-73.969053,1728.0,2013-06-01 07:57:29,2013-06-30 23:46:16,254.0,1729.0,2013-06-01 06:59:01,2013-06-30 22:56:48,1.0
4,10 Ave & W 28 St,2013-06,489.0,40.750664,-74.001768,2094.0,2013-06-01 00:08:47,2013-06-30 23:20:56,251.0,2046.0,2013-06-01 07:45:23,2013-06-30 23:28:05,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57811,W 4 St & 7 Ave S,2021-02,380.0,40.734011,-74.002939,,,,,1.0,2021-02-04 12:01:27,2021-02-04 12:01:27,1.0
57812,W 95 St & Broadway,2021-02,3314.0,40.793770,-73.971888,,,,,1.0,2021-02-06 20:39:05,2021-02-06 20:39:05,1.0
57813,Washington Ave & Park Ave,2021-02,313.0,40.696102,-73.967510,,,,,1.0,2021-02-01 16:39:53,2021-02-01 16:39:53,1.0
57814,Water - Whitehall Plaza,2021-02,534.0,40.702551,-74.012723,,,,,1.0,2021-02-03 14:36:17,2021-02-03 14:36:17,1.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 57816 entries, 0 to 57815
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Station Name             57816 non-null  object 
 1   Month                    57816 non-null  object 
 2   Station ID               57816 non-null  float64
 3   Station Latitude         57816 non-null  float64
 4   Station Longitude        57816 non-null  float64
 5   Trips From               57125 non-null  float64
 6   First Station Departure  57125 non-null  object 
 7   Last Station Departure   57125 non-null  object 
 8   Stations Visited         57125 non-null  float64
 9   Trips To                 57806 non-null  float64
 10  First Station Arrival    57806 non-null  object 
 11  Last Station Arrival     57806 non-null  object 
 12  Stations Departed From   57806 non-null  float64
dtypes: float64(7), object(6)
memory usage: 6.2+ MB


None

Station Name               False
Month                      False
Station ID                 False
Station Latitude           False
Station Longitude          False
Trips From                  True
First Station Departure     True
Last Station Departure      True
Stations Visited            True
Trips To                    True
First Station Arrival       True
Last Station Arrival        True
Stations Departed From      True
dtype: bool

# Exploration


In [9]:
test_df = pd.read_sql("""
SELECT * 
FROM bikedata
WHERE strftime('%Y', started_at) = '2016' and strftime('%m', started_at) = '03'
limit 5;
""", con = engine)

In [10]:
test_df

Unnamed: 0,trip_duration,started_at,ended_at,start_station_id,start_station_name,start_lat,start_lng,end_station_id,end_station_name,end_lat,end_lng,ride_id,member_casual,Birth Year,Gender
0,1491,2016-03-01 06:52:42,2016-03-01 07:17:33,72,W 52 St & 11 Ave,40.767272,-73.993929,427.0,Bus Slip & State St,40.701907,-74.013942,23914,member,1982.0,1
1,1044,2016-03-01 07:05:50,2016-03-01 07:23:15,72,W 52 St & 11 Ave,40.767272,-73.993929,254.0,W 11 St & 6 Ave,40.735324,-73.998004,23697,member,1978.0,1
2,714,2016-03-01 07:15:05,2016-03-01 07:26:59,72,W 52 St & 11 Ave,40.767272,-73.993929,493.0,W 45 St & 6 Ave,40.7568,-73.982912,21447,member,1960.0,2
3,329,2016-03-01 07:26:04,2016-03-01 07:31:34,72,W 52 St & 11 Ave,40.767272,-73.993929,478.0,11 Ave & W 41 St,40.760301,-73.998842,22351,member,1986.0,1
4,1871,2016-03-01 07:31:30,2016-03-01 08:02:41,72,W 52 St & 11 Ave,40.767272,-73.993929,151.0,Cleveland Pl & Spring St,40.722104,-73.997249,20985,member,1978.0,1


In [80]:
# retrieve statistics on the number of trips starting at each station
start_stationinfo_df = pd.read_sql("""
SELECT * FROM bikedata
limit 10000;
    """,con = engine1)

In [81]:
start_stationinfo_df

Unnamed: 0,trip_duration,started_at,ended_at,start_station_id,start_station_name,start_lat,start_lng,end_station_id,end_station_name,end_lat,end_lng,ride_id,member_casual,Birth Year,Gender
0,1346,1/1/2015 0:01,1/1/2015 0:24,455,1 Ave & E 44 St,40.750020,-73.969053,265,Stanton St & Chrystie St,40.722293,-73.991475,18660,member,1960.0,2
1,363,1/1/2015 0:02,1/1/2015 0:08,434,9 Ave & W 18 St,40.743174,-74.003664,482,W 15 St & 7 Ave,40.739355,-73.999318,16085,member,1963.0,1
2,346,1/1/2015 0:04,1/1/2015 0:10,491,E 24 St & Park Ave S,40.740964,-73.986022,505,6 Ave & W 33 St,40.749013,-73.988484,20845,member,1974.0,1
3,182,1/1/2015 0:04,1/1/2015 0:07,384,Fulton St & Waverly Ave,40.683178,-73.965964,399,Lafayette Ave & St James Pl,40.688515,-73.964763,19610,member,1969.0,1
4,969,1/1/2015 0:05,1/1/2015 0:21,474,5 Ave & E 29 St,40.745168,-73.986831,432,E 7 St & Avenue A,40.726218,-73.983799,20197,member,1977.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,316,1/2/2015 13:36,1/2/2015 13:41,491,E 24 St & Park Ave S,40.740964,-73.986022,536,1 Ave & E 30 St,40.741444,-73.975361,16896,member,1974.0,1
9996,298,1/2/2015 13:36,1/2/2015 13:41,404,9 Ave & W 14 St,40.740583,-74.005509,383,Greenwich Ave & Charles St,40.735238,-74.000271,14817,member,1969.0,1
9997,198,1/2/2015 13:36,1/2/2015 13:39,504,1 Ave & E 15 St,40.732219,-73.981656,297,E 15 St & 3 Ave,40.734232,-73.986923,16693,member,1998.0,1
9998,124,1/2/2015 13:36,1/2/2015 13:38,128,MacDougal St & Prince St,40.727103,-74.002971,347,W Houston St & Hudson St,40.728739,-74.007488,16845,member,1982.0,1
