In [13]:
import requests
import zipfile
import os
import shutil
import pandas as pd
from contextlib import closing
import io

In [57]:
save_directory = '../../../Data_Sets/citibike/'

def extract_clean(save_directory, file_list):
    header_format1 = {
        'bike_id':'ride_id',
        'starttime':'started_at',
        'stoptime':'ended_at',
        'start station name':'start_station_name',
        'start station id':'start_staton_id',
        'end station name':'end_station_name',
        'end station id':'end_station_id',
        'start station latitude':'start_lat',
        'start station longitude':'start_lng',
        'end station latitude':'end_lat',
        'end station longitude':'end_lng',
        'usertype':'member_casual',
        'birth year':'Birth Year',
        'gender':'Gender',
        'trip duration':'Trip Duration'
        }
    header_format2 = {
        'Bike ID':'ride_id',
        'Start Time':'started_at',
        'Stoptime':'ended_at',
        'Start Station Name':'start_station_name',
        'Start Station ID':'start_staton_id',
        'End Station Name':'end_station_name',
        'End Station ID':'end_station_id',
        'Start Station Latitude':'start_lat',
        'Start Station Longitude':'start_lng',
        'End Station Latitude':'end_lat',
        'End Station Longitude':'end_lng',
        'User Type':'member_casual',
        'Birth Year':'Birth Year',
        'Gender':'Gender',
        'trip duration':'Trip Duration'
        }
    
    for file in file_list:
        url1 = 'https://s3.amazonaws.com/tripdata/' + file + '.zip'
        url2 = 'https://s3.amazonaws.com/tripdata/' + file + '.csv.zip'
        csv_path =  save_directory + file + '.csv'
        
        #attempt to locate the specified file
        response = requests.get(url1)
        if response.status_code == 404:
            response = requests.get(url2)
            if response.status_code == 404:
                print(f'{file} is unavailable')
                continue

        # rename the column headers and replace entries to match the 2021 format, at the end save the data as a csv file
        with closing(response), zipfile.ZipFile(io.BytesIO(response.content)) as zip_contents:
            file_list = zip_contents.namelist()
            with zip_contents.open(file_list[0]) as tempfile:
                bike_df = pd.read_csv(tempfile)
                if 'usertype' in bike_df.columns:
                    bike_df = bike_df.rename(columns=header_format1)     
                elif 'User Type' in bike_df.columns:
                    bike_df = bike_df.rename(columns=header_format2)
                else:
                    print(f'{file}has an unknown format, unable to clean')
                    continue
                bike_df['member_casual'] = bike_df['member_casual'].replace({
                    'Subscriber':'member',
                    'Customer':'casual'
                })
                bike_df.to_csv(csv_path)
            tempfile.close()
        print(f'Successfully extracted and cleaned {file}')

In [62]:
# generate list of file names to download and clean
years = ['2017','2018', '2019',
         '2020','2021']
months = ['01','02','03','04',
          '05','06','07','08',
          '09','10','11','12']

file_names = [f'{year}{month}-citibike-tripdata' for year in years for month in months]
file_names

['201701-citibike-tripdata',
 '201702-citibike-tripdata',
 '201703-citibike-tripdata',
 '201704-citibike-tripdata',
 '201705-citibike-tripdata',
 '201706-citibike-tripdata',
 '201707-citibike-tripdata',
 '201708-citibike-tripdata',
 '201709-citibike-tripdata',
 '201710-citibike-tripdata',
 '201711-citibike-tripdata',
 '201712-citibike-tripdata',
 '201801-citibike-tripdata',
 '201802-citibike-tripdata',
 '201803-citibike-tripdata',
 '201804-citibike-tripdata',
 '201805-citibike-tripdata',
 '201806-citibike-tripdata',
 '201807-citibike-tripdata',
 '201808-citibike-tripdata',
 '201809-citibike-tripdata',
 '201810-citibike-tripdata',
 '201811-citibike-tripdata',
 '201812-citibike-tripdata',
 '201901-citibike-tripdata',
 '201902-citibike-tripdata',
 '201903-citibike-tripdata',
 '201904-citibike-tripdata',
 '201905-citibike-tripdata',
 '201906-citibike-tripdata',
 '201907-citibike-tripdata',
 '201908-citibike-tripdata',
 '201909-citibike-tripdata',
 '201910-citibike-tripdata',
 '201911-citib

In [63]:
extract_clean(save_directory, file_names)

Successfully extracted and cleaned 201701-citibike-tripdata
Successfully extracted and cleaned 201702-citibike-tripdata
Successfully extracted and cleaned 201703-citibike-tripdata
Successfully extracted and cleaned 201704-citibike-tripdata
Successfully extracted and cleaned 201705-citibike-tripdata
Successfully extracted and cleaned 201706-citibike-tripdata
Successfully extracted and cleaned 201707-citibike-tripdata
Successfully extracted and cleaned 201708-citibike-tripdata
Successfully extracted and cleaned 201709-citibike-tripdata
Successfully extracted and cleaned 201710-citibike-tripdata
Successfully extracted and cleaned 201711-citibike-tripdata
Successfully extracted and cleaned 201712-citibike-tripdata
Successfully extracted and cleaned 201801-citibike-tripdata
Successfully extracted and cleaned 201802-citibike-tripdata
Successfully extracted and cleaned 201803-citibike-tripdata
Successfully extracted and cleaned 201804-citibike-tripdata
Successfully extracted and cleaned 20180

BadZipFile: File is not a zip file

In [12]:
csv_df.count()

tripduration               577703
starttime                  577703
stoptime                   577703
start station id           577703
start station name         577703
start station latitude     577703
start station longitude    577703
end station id             559644
end station name           559644
end station latitude       559644
end station longitude      559644
ride_id                    577703
usertype                   577703
birth year                 337382
gender                     577703
dtype: int64

In [13]:
test2_df = csv_df['end station id'].isnull()

test3_df = csv_df[test2_df]

In [14]:
test3_df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,ride_id,usertype,birth year,gender
56,547,2013-06-01 00:11:04,2013-06-01 00:20:11,432,E 7 St & Avenue A,40.726218,-73.983799,,,,,17470,Subscriber,1980.0,1
102,537,2013-06-01 00:40:27,2013-06-01 00:49:24,482,W 15 St & 7 Ave,40.739355,-73.999318,,,,,15090,Subscriber,1981.0,1
120,472,2013-06-01 00:47:51,2013-06-01 00:55:43,528,2 Ave & E 31 St,40.742909,-73.977061,,,,,16257,Subscriber,1965.0,1
211,153,2013-06-01 01:32:55,2013-06-01 01:35:28,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,,,,,20106,Subscriber,1987.0,1
289,841,2013-06-01 02:28:10,2013-06-01 02:42:11,509,9 Ave & W 22 St,40.745497,-74.001971,,,,,18792,Customer,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577559,357,2013-06-30 23:34:43,2013-06-30 23:40:40,174,E 25 St & 1 Ave,40.738177,-73.977387,,,,,19258,Subscriber,1986.0,1
577598,702,2013-06-30 23:41:28,2013-06-30 23:53:10,237,E 11 St & 2 Ave,40.730473,-73.986724,,,,,17208,Customer,,0
577622,472,2013-06-30 23:44:15,2013-06-30 23:52:07,439,E 4 St & 2 Ave,40.726281,-73.989780,,,,,17565,Subscriber,1963.0,1
577635,1263,2013-06-30 23:47:47,2013-07-01 00:08:50,472,E 32 St & Park Ave,40.745712,-73.981948,,,,,16576,Subscriber,1956.0,1


good


In [31]:
response_test.status_code

200