In [13]:
import requests
import zipfile
import os
import shutil
import pandas as pd
from contextlib import closing
import io

In [78]:
save_directory = '../../../Data_Sets/citibike/'

def extract_clean(save_directory, file_list):
    header_format1 = {
        'bike_id':'ride_id',
        'starttime':'started_at',
        'stoptime':'ended_at',
        'start station name':'start_station_name',
        'start station id':'start_staton_id',
        'end station name':'end_station_name',
        'end station id':'end_station_id',
        'start station latitude':'start_lat',
        'start station longitude':'start_lng',
        'end station latitude':'end_lat',
        'end station longitude':'end_lng',
        'usertype':'member_casual',
        'birth year':'Birth Year',
        'gender':'Gender',
        'trip duration':'Trip Duration'
        }
    header_format2 = {
        'Bike ID':'ride_id',
        'Start Time':'started_at',
        'Stoptime':'ended_at',
        'Start Station Name':'start_station_name',
        'Start Station ID':'start_staton_id',
        'End Station Name':'end_station_name',
        'End Station ID':'end_station_id',
        'Start Station Latitude':'start_lat',
        'Start Station Longitude':'start_lng',
        'End Station Latitude':'end_lat',
        'End Station Longitude':'end_lng',
        'User Type':'member_casual',
        'Birth Year':'Birth Year',
        'Gender':'Gender',
        'trip duration':'Trip Duration'
        }
    final_df = pd.DataFrame
    for file in file_list:
        url1 = 'https://s3.amazonaws.com/tripdata/' + file + '.zip'
        url2 = 'https://s3.amazonaws.com/tripdata/' + file + '.csv.zip'
        csv_path =  save_directory + file + '.csv'
        
        #attempt to locate the specified file
        response = requests.get(url1)
        if response.status_code != 200:
            response = requests.get(url2)
            if response.status_code != 200:
                print(f'{file} is unavailable')
                continue

        # rename the column headers and replace entries to match the 2021 format, at the end save the data as a csv file
        with closing(response), zipfile.ZipFile(io.BytesIO(response.content)) as zip_contents:
            file_list = zip_contents.namelist()
            with zip_contents.open(file_list[0]) as tempfile:
                bike_df = pd.read_csv(tempfile)
                if 'usertype' in bike_df.columns:
                    bike_df = bike_df.rename(columns=header_format1)     
                elif 'User Type' in bike_df.columns:
                    bike_df = bike_df.rename(columns=header_format2)
                elif 'member_casual' in bike_df.columns:
                    print(f'{file} is in the correct format, saving...')
                else:
                    print(f'{file} has an unknown format, unable to clean')
                    continue
                bike_df['member_casual'] = bike_df['member_casual'].replace({
                    'Subscriber':'member',
                    'Customer':'casual'
                })
                bike_df.to_csv(csv_path)
            tempfile.close()
        print(f'Successfully extracted and cleaned {file}')

In [79]:
# generate list of file names to download and clean
years = ['2021']
months = ['02','03','04',
          '05','06','07','08',
          '09','10','11','12']

file_names = [f'{year}{month}-citibike-tripdata' for year in years for month in months]
file_names

['202102-citibike-tripdata',
 '202103-citibike-tripdata',
 '202104-citibike-tripdata',
 '202105-citibike-tripdata',
 '202106-citibike-tripdata',
 '202107-citibike-tripdata',
 '202108-citibike-tripdata',
 '202109-citibike-tripdata',
 '202110-citibike-tripdata',
 '202111-citibike-tripdata',
 '202112-citibike-tripdata']

In [80]:
extract_clean(save_directory, file_names)

202102-citibike-tripdata is in the correct format, saving...
Successfully extracted and cleaned 202102-citibike-tripdata
202103-citibike-tripdata is in the correct format, saving...
Successfully extracted and cleaned 202103-citibike-tripdata
202104-citibike-tripdata is in the correct format, saving...
Successfully extracted and cleaned 202104-citibike-tripdata
202105-citibike-tripdata is in the correct format, saving...
Successfully extracted and cleaned 202105-citibike-tripdata


  if (await self.run_code(code, result,  async_=asy)):


202106-citibike-tripdata is in the correct format, saving...
Successfully extracted and cleaned 202106-citibike-tripdata
202107-citibike-tripdata is unavailable
202108-citibike-tripdata is unavailable
202109-citibike-tripdata is unavailable
202110-citibike-tripdata is unavailable
202111-citibike-tripdata is unavailable
202112-citibike-tripdata is unavailable


In [12]:
csv_df.count()

tripduration               577703
starttime                  577703
stoptime                   577703
start station id           577703
start station name         577703
start station latitude     577703
start station longitude    577703
end station id             559644
end station name           559644
end station latitude       559644
end station longitude      559644
ride_id                    577703
usertype                   577703
birth year                 337382
gender                     577703
dtype: int64

In [13]:
test2_df = csv_df['end station id'].isnull()

test3_df = csv_df[test2_df]

In [14]:
test3_df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,ride_id,usertype,birth year,gender
56,547,2013-06-01 00:11:04,2013-06-01 00:20:11,432,E 7 St & Avenue A,40.726218,-73.983799,,,,,17470,Subscriber,1980.0,1
102,537,2013-06-01 00:40:27,2013-06-01 00:49:24,482,W 15 St & 7 Ave,40.739355,-73.999318,,,,,15090,Subscriber,1981.0,1
120,472,2013-06-01 00:47:51,2013-06-01 00:55:43,528,2 Ave & E 31 St,40.742909,-73.977061,,,,,16257,Subscriber,1965.0,1
211,153,2013-06-01 01:32:55,2013-06-01 01:35:28,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,,,,,20106,Subscriber,1987.0,1
289,841,2013-06-01 02:28:10,2013-06-01 02:42:11,509,9 Ave & W 22 St,40.745497,-74.001971,,,,,18792,Customer,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577559,357,2013-06-30 23:34:43,2013-06-30 23:40:40,174,E 25 St & 1 Ave,40.738177,-73.977387,,,,,19258,Subscriber,1986.0,1
577598,702,2013-06-30 23:41:28,2013-06-30 23:53:10,237,E 11 St & 2 Ave,40.730473,-73.986724,,,,,17208,Customer,,0
577622,472,2013-06-30 23:44:15,2013-06-30 23:52:07,439,E 4 St & 2 Ave,40.726281,-73.989780,,,,,17565,Subscriber,1963.0,1
577635,1263,2013-06-30 23:47:47,2013-07-01 00:08:50,472,E 32 St & Park Ave,40.745712,-73.981948,,,,,16576,Subscriber,1956.0,1


In [92]:
df1 = pd.read_csv(save_directory + '202102-citibike-tripdata.csv').drop(columns='Unnamed: 0')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [93]:
df2 = pd.read_csv(save_directory + '202103-citibike-tripdata.csv').drop(columns='Unnamed: 0')

In [94]:
df3 = pd.concat([df1,df2])

In [100]:
df3 = pd.concat([df3,df1])

In [101]:
df3

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BBA33D73DECE976F,docked_bike,2021-02-26 16:38:54,2021-02-26 16:44:37,E 84 St & Park Ave,7243.04,E 78 St & 2 Ave,7057.07,40.778626,-73.957720,40.772797,-73.955778,casual
1,B63D7AFF9AC5B6D4,docked_bike,2021-02-17 11:09:11,2021-02-17 11:26:47,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
2,52B829195C469D99,docked_bike,2021-02-26 18:33:29,2021-02-26 19:05:41,Macon St & Nostrand Ave,4214.03,Lefferts Pl & Franklin Ave,4222.02,40.680983,-73.950047,40.680342,-73.955769,casual
3,19C84ECA2B468476,docked_bike,2021-02-26 12:48:35,2021-02-26 13:07:24,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
4,C0DDB771E70D9DF5,docked_bike,2021-02-25 17:23:22,2021-02-25 17:28:20,Madison Ave & E 26 St,6131.12,W 37 St & 5 Ave,6398.06,40.742685,-73.986713,40.750380,-73.983390,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
649978,84DA58E73B2DF079,docked_bike,2021-02-12 16:01:17,2021-02-12 16:13:53,Irving Ave & DeKalb Ave,4898.02,Irving Ave & DeKalb Ave,4898.02,40.702700,-73.920950,40.702700,-73.920950,member
649979,3A14299B4F7FA1CF,docked_bike,2021-02-17 18:03:24,2021-02-17 18:06:39,E 47 St & Park Ave,6584.12,E 47 St & Park Ave,6584.12,40.755102,-73.974986,40.755103,-73.974987,member
649980,7B86EE3DC7E026BC,docked_bike,2021-02-28 18:58:31,2021-02-28 19:12:51,Cedar St & Evergreen Ave,4721.01,Cedar St & Evergreen Ave,4721.01,40.696710,-73.928070,40.696710,-73.928070,casual
649981,D665B8623FC01285,docked_bike,2021-02-25 09:00:41,2021-02-25 09:13:56,Bedford Ave & Bergen St,4066.15,Cedar St & Evergreen Ave,4721.01,40.676368,-73.952918,40.696710,-73.928070,casual


In [102]:
df4 = pd.DataFrame()

In [103]:
df4

In [104]:
df4 = pd.concat([df1,df4]

TypeError: 'function' object is not subscriptable