In [1]:
import requests
import zipfile
import pandas as pd
from contextlib import closing
import io
import pyarrow
import os
from sqlalchemy import create_engine
import sqlite3

In [2]:
def extract_clean(save_path, file_list):
    header_format = {
        'tripduration':'trip_duration',
        'bikeid':'ride_id',
        'bike_id':'ride_id',
        'starttime':'started_at',
        'stoptime':'ended_at',
        'start station name':'start_station_name',
        'start station id':'start_staton_id',
        'end station name':'end_station_name',
        'end station id':'end_station_id',
        'start station latitude':'start_lat',
        'start station longitude':'start_lng',
        'end station latitude':'end_lat',
        'end station longitude':'end_lng',
        'usertype':'member_casual',
        'birth year':'Birth Year',
        'gender':'Gender',
        'trip duration':'trip_duration',
        'Bike ID':'ride_id',
        'Start Time':'started_at',
        'Stop Time':'ended_at',
        'Start Station Name':'start_station_name',
        'Start Station ID':'start_station_id',
        'End Station Name':'end_station_name',
        'End Station ID':'end_station_id',
        'Start Station Latitude':'start_lat',
        'Start Station Longitude':'start_lng',
        'End Station Latitude':'end_lat',
        'End Station Longitude':'end_lng',
        'User Type':'member_casual',
        'Birth Year':'Birth Year',
        'Gender':'Gender',
        'Trip Duration':'trip_duration'
    }
    data_types = {
        'rideable_type':'category',
        'start_station_name':'category',
        'start_station_id':'category',
        'end_station_name':'category',
        'end_station_id':'category',
        'start_lat':'category',
        'start_lng':'category',
        'end_lat':'category',
        'end_lng':'category',
        'member_casual':'category',
        'start station name':'category',
        'start station id':'category',
        'end station name':'category',
        'end station id':'category',
        'start station latitude':'category',
        'start station longitude':'category',
        'end station latitude':'category',
        'end station longitude':'category',
        'usertype':'category',
        'birth year':'category',
        'gender':'category',
        'Start Station Name':'category',
        'Start Station ID':'category',
        'End Station Name':'category',
        'End Station ID':'category',
        'Start Station Latitude':'category',
        'Start Station Longitude':'category',
        'End Station Latitude':'category',
        'End Station Longitude':'category',
        'User Type':'category',
        'Birth Year':'category',
        'Gender':'category'
    }
    for file in file_list:
        url1 = 'https://s3.amazonaws.com/tripdata/' + file + '.zip'
        url2 = 'https://s3.amazonaws.com/tripdata/' + file + '.csv.zip'
        
        #attempt to locate the specified file
        response = requests.get(url1)
        if response.status_code != 200:
            response = requests.get(url2)
            if response.status_code != 200:
                print(f'{file} is unavailable')
                continue

        # rename the column headers and replace entries to match the 2021 format, at the end save the data as a csv file
        with closing(response), zipfile.ZipFile(io.BytesIO(response.content)) as zip_contents:
            file_list = zip_contents.namelist()
            with zip_contents.open(file_list[0]) as tempfile:
                bike_df = pd.read_csv(tempfile, dtype = data_types)
                bike_df = bike_df.rename(columns=header_format)     
                bike_df['member_casual'] = bike_df['member_casual'].replace({
                    'Subscriber':'member',
                    'Customer':'casual'
                })
                bike_df.sort_values(by=['started_at'],ignore_index=True)    
                bike_df.to_sql('bikedata', con = engine, if_exists='append',index=False)
            tempfile.close()
        zip_contents.close()
        response.close()
        del url1, url2, response 
        del file_list, tempfile, bike_df, zip_contents
        print(f'Successfully extracted and cleaned {file}')

In [6]:
connection = sqlite3.connect(save_directory)
db_cursor = connection.cursor()

db_cursor.execute("""
ALTER TABLE bikedata
RENAME COLUMN 'start_staton_id' TO 'start_station_id'
""")

In [3]:
# generate list of file names to download and clean
years = ['2013','2014','2015','2016','2017', '2018','2019', '2020', '2021']
months = ['01','02','03','04','05','06','07','08','09','10','11','12']
save_directory = '../../../Data_Sets/citibike/'
db_name = '201306-202106-citibike-tripdata.sqlite'
engine = create_engine(f'sqlite:///{save_directory}{db_name}')

In [10]:
for year in years:
    file_names = [f'{year}{month}-citibike-tripdata' for month in months]
    save_name = f'{year}-citibike-tripdata.feather'
    extract_clean(f'{save_directory}{save_name}', file_names)

Successfully extracted and cleaned 202102-citibike-tripdata
Successfully extracted and cleaned 202103-citibike-tripdata
Successfully extracted and cleaned 202104-citibike-tripdata
Successfully extracted and cleaned 202105-citibike-tripdata
Successfully extracted and cleaned 202106-citibike-tripdata
202107-citibike-tripdata is unavailable
202108-citibike-tripdata is unavailable
202109-citibike-tripdata is unavailable
202110-citibike-tripdata is unavailable
202111-citibike-tripdata is unavailable
202112-citibike-tripdata is unavailable


In [6]:
bikeinfo_df = pd.read_sql("""
SELECT 
    ride_id, MIN(started_at), MAX(started_at),
    SUM(ROUND((JULIANDAY(ended_at) - JULIANDAY(started_at)) * 86400)),
    COUNT(started_at)
FROM bikedata
GROUP BY ride_id
WHERE started_at < DATE(2021-02-01);
    """,con = engine)

OperationalError: (sqlite3.OperationalError) near "ride_id": syntax error
[SQL: SELECT 
    ride_id, MIN(started_at), MAX(started_at),
    SUM(ROUND((JULIANDAY(ended_at) - JULIANDAY(started_at)) * 86400)),
    COUNT(started_at)
    FROM bikedata
    GROUPBY ride_id
    WHERE started_at < DATE(2021-02-01);
    ]
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [None]:
bikeinfo_df = pd.read_sql(
    """SELECT 
   ride_id, MIN(started_at), MAX(started_at),
    SUM(ROUND((JULIANDAY(ended_at) - JULIANDAY(started_at)) * 86400)),
    COUNT(started_at)
    FROM bikedata
    """,con = engine)

In [8]:
test_df = test_df.groupby('ride_id').agg(
    first_ride=('started_at','first'),
    last_ride=('started_at','last')
)

In [34]:
test_df['lifespan']= pd.to_datetime(test_df['last_ride']).dt.year-pd.to_datetime(test_df['first_ride']).dt.year

In [None]:
# filter data to remove bikes rides where bike_id is not logged
test_df = test_df.loc[test_df['lifespan'] > pd.to_timedelta('0 days 00:00:00')]

In [40]:
test_df.count()

first_ride    32926
last_ride     32926
lifespan      32926
dtype: int64

In [26]:
test_df = pd.read_csv(f'{save_directory}bike_lifespan.csv')

TypeError: unsupported Type Index

In [41]:
test_df.to_csv(f'{save_directory}bike_lifespan.csv')

In [20]:
display(test_df)
display(test_df.info())

Unnamed: 0_level_0,first_ride,last_ride,lifespan
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2021-02-15 17:30:50,2021-06-13 09:48:57,118 days
14529.0,2013-06-05 00:09:05,2020-11-09 08:59:36.0430,2714 days
14530.0,2014-01-14 19:18:18,2020-12-11 20:06:11.6900,2523 days
14531.0,2013-06-01 11:24:44,2020-11-09 15:42:37.0330,2718 days
14532.0,2013-06-02 08:57:20,2019-05-26 08:03:36.1530,2184 days
...,...,...,...
50105.0,2020-12-12 14:15:19.2010,2020-12-16 08:48:07.8760,4 days
50106.0,2020-12-21 07:54:39.5760,2020-12-24 17:17:11.8490,3 days
50123.0,2021-01-18 12:07:01.4490,2021-01-30 15:39:34.4650,12 days
50124.0,2021-01-18 06:44:33.0390,2021-01-31 08:37:50.8730,13 days


<class 'pandas.core.frame.DataFrame'>
Index: 32926 entries, 0 to inf
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype          
---  ------      --------------  -----          
 0   first_ride  32926 non-null  object         
 1   last_ride   32926 non-null  object         
 2   lifespan    32926 non-null  timedelta64[ns]
dtypes: object(2), timedelta64[ns](1)
memory usage: 1.0+ MB


None

In [30]:
test_df = pd.read_sql("""
SELECT * 
FROM bikedata
WHERE strftime('%Y', started_at) > 2015
limit 5;
""", con = engine)

In [31]:
test_df

Unnamed: 0,tripduration,started_at,ended_at,start_staton_id,start_station_name,start_lat,start_lng,end_station_id,end_station_name,end_lat,end_lng,ride_id,member_casual,Birth Year,Gender
0,695,2013-06-01 00:00:01,2013-06-01 00:11:36,444,Broadway & W 24 St,40.7423543,-73.98915076,434,9 Ave & W 18 St,40.74317449,-74.00366443,19678,member,1983.0,1
1,693,2013-06-01 00:00:08,2013-06-01 00:11:41,444,Broadway & W 24 St,40.7423543,-73.98915076,434,9 Ave & W 18 St,40.74317449,-74.00366443,16649,member,1984.0,1
2,2059,2013-06-01 00:00:44,2013-06-01 00:35:03,406,Hicks St & Montague St,40.69512845,-73.99595065,406,Hicks St & Montague St,40.69512845,-73.99595065,19599,casual,,0
3,123,2013-06-01 00:01:04,2013-06-01 00:03:07,475,E 15 St & Irving Pl,40.73524276,-73.98758561,262,Washington Park,40.6917823,-73.9737299,16352,member,1960.0,1
4,1521,2013-06-01 00:01:22,2013-06-01 00:26:43,2008,Little West St & 1 Pl,40.70569254,-74.01677685,310,State St & Smith St,40.68926942,-73.98912867,15567,member,1983.0,1


In [6]:
test_df

Unnamed: 0,ride_id,started_at
0,19678,2013-06-01 00:00:01
1,16649,2013-06-01 00:00:08
2,19599,2013-06-01 00:00:44
3,16352,2013-06-01 00:01:04
4,15567,2013-06-01 00:01:22
...,...,...
122697047,5CCF02508D6C4209,2021-06-30 13:13:05
122697048,C4D61E2B06185BC1,2021-06-23 22:51:46
122697049,B58C37145FC92C4B,2021-06-19 16:18:58
122697050,C166DBE51A54FDAE,2021-06-23 11:29:37


In [13]:
del test_df