#### Downloading Historical Data

In [9]:
import requests
import boto3
import io
import zipfile
import csv
import pandas as pd
import re
import os
from bs4 import BeautifulSoup

##### Downloading all files 

In [None]:
# Download every trip-data archive from the public Capital Bikeshare bucket and
# unpack its members into ../data. The archives are listed through the S3 API
# rather than by scraping the bucket's index.html page.
# NOTE(review): boto3.client('s3') signs requests with the locally configured
# AWS credentials -- confirm these are available wherever this cell runs.

bucket_name = 'capitalbikeshare-data'
prefix = ''
data_dir = "../data"

s3 = boto3.client('s3')

# The destination folder may not exist yet on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)

# A single list_objects_v2 call returns at most 1,000 keys; the paginator
# follows the continuation tokens so no archive is silently missed.
paginator = s3.get_paginator('list_objects_v2')

for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):

    # 'Contents' is absent from a page that holds no keys.
    for obj in page.get('Contents', []):

        if not obj['Key'].endswith('.zip'):
            continue

        archive = s3.get_object(Bucket=bucket_name, Key=obj['Key'])

        with zipfile.ZipFile(io.BytesIO(archive['Body'].read())) as zipfile_content:

            for member in zipfile_content.infolist():

                # Directory entries carry no data of their own.
                if member.is_dir():
                    continue

                # Keep only the final path component: ../data stays flat and a
                # member name such as "../../x" cannot write outside of it.
                filename = os.path.basename(member.filename)

                # Hidden entries (e.g. macOS "._*" resource forks) are not trip data.
                if not filename or filename.startswith('.'):
                    continue

                target_path = os.path.join(data_dir, filename)

                # Check if file exists already
                if os.path.exists(target_path):
                    continue

                # Extract the member as bytes and write it out unchanged
                with open(target_path, "wb") as fp:
                    fp.write(zipfile_content.read(member))

In [None]:
# Add up the size in bytes of every file found under ../data/ (subfolders included).
total_size = sum(
    os.path.getsize(os.path.join(folder, name))
    for folder, _subfolders, names in os.walk('../data/')
    for name in names
)

total_size # ~4.818 GB

##### Concatenating Files

Before we concatenate, we have to determine which schema (header row) each file uses.

In [131]:
def _parse_period(filename):
    """Split a trip-data file name into its period components.

    Returns a ``(year, month, quarter)`` tuple of strings, with ``None`` for a
    component the name does not contain, or ``None`` when the name does not
    start with a four-digit year.

    Examples:
        "201801_capitalbikeshare_tripdata.csv" -> ("2018", "01", None)
        "2012Q1-capitalbikeshare-tripdata.csv" -> ("2012", None, "1")
        "2010-capitalbikeshare-tripdata.csv"   -> ("2010", None, None)
    """
    year_match = re.match(r'[0-9]{4}', filename)
    if year_match is None:
        return None

    # Quarter: the single digit right after a "Q".
    quarter_match = re.search(r'(?<=Q)[0-9]', filename)
    # Month: the two digits right after a run of four digits.
    month_match = re.search(r'(?<=\d{4})\d{2}', filename)

    return (
        year_match[0],
        month_match[0] if month_match else None,
        quarter_match[0] if quarter_match else None,
    )


# Collect the period and the header row of every trip-data CSV in ../data.
file_metadata = []
for file in os.listdir("../data"):

    period = _parse_period(file)

    # Ignore stray entries (hidden files, notes, ...) instead of crashing on them.
    if period is None or not file.lower().endswith(".csv"):
        continue
    year, month, quarter = period

    # Only the first line is needed; quotes are dropped so that quoted and
    # unquoted versions of the same header compare equal.
    with open(os.path.join("../data", file), "r") as fp:
        header = fp.readline().strip().replace('"', "")

    file_metadata.append({'year':year, 'month':month, 'quarter':quarter, 'header':header})

# Explicit columns keep the sort valid even when no file was found.
df = pd.DataFrame(
    file_metadata, columns=["year", "month", "quarter", "header"]
).sort_values(["year", "month", "quarter"])
df

Unnamed: 0,year,month,quarter,header
28,2010,,,"Duration,Start date,End date,Start station num..."
85,2011,,,"Duration,Start date,End date,Start station num..."
10,2012,,1,"Duration,Start date,End date,Start station num..."
0,2012,,2,"Duration,Start date,End date,Start station num..."
87,2012,,3,"Duration,Start date,End date,Start station num..."
...,...,...,...,...
18,2022,10,,"ride_id,rideable_type,started_at,ended_at,star..."
42,2022,11,,"ride_id,rideable_type,started_at,ended_at,star..."
3,2022,12,,"ride_id,rideable_type,started_at,ended_at,star..."
81,2023,01,,"ride_id,rideable_type,started_at,ended_at,star..."


In [132]:
# Count how many files share each distinct header row.
df["header"].value_counts()


Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type                                             53
ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual    35
Name: header, dtype: int64

In [137]:
# Header row of the newer of the two schemas found above.
new_header = "ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual"

# df is sorted by year/month/quarter, so the first matching row is the
# earliest file that uses the new header.
uses_new_header = df['header'] == new_header
df.loc[uses_new_header].iloc[0] # New header first used 04/2020

year                                                    2020
month                                                     04
quarter                                                 None
header     ride_id,rideable_type,started_at,ended_at,star...
Name: 72, dtype: object

`rideable_type`, `start_lat`, `start_lng`, `end_lat`, and `end_lng` are notable new fields in the new schema format. Let's concatenate the files but keep them separate according to the different schema versions. 

In [74]:
# Show the names of all entries currently in ../data.
os.listdir("../data")

['202002-capitalbikeshare-tripdata.csv',
 '202207-capitalbikeshare-tripdata.csv',
 '202212-capitalbikeshare-tripdata.csv',
 '201810-capitalbikeshare-tripdata.csv',
 '202010-capitalbikeshare-tripdata.csv',
 '201802-capitalbikeshare-tripdata.csv',
 '202007-capitalbikeshare-tripdata.csv',
 '201811-capitalbikeshare-tripdata.csv',
 '2012Q1-capitalbikeshare-tripdata.csv',
 '202302-captialbikeshare-tripdata.csv',
 '201807-capitalbikeshare-tripdata.csv',
 '202101-capitalbikeshare-tripdata.csv',
 '202204-capitalbikeshare-tripdata.csv',
 '202210-capitalbikeshare-tripdata.csv',
 '201904-capitalbikeshare-tripdata.csv',
 '201909-capitalbikeshare-tripdata.csv',
 '202202-capitalbikeshare-tripdata.csv',
 '2015Q1-capitalbikeshare-tripdata.csv',
 '201911-capitalbikeshare-tripdata.csv',
 '2017Q1-capitalbikeshare-tripdata.csv',
 '202201-capitalbikeshare-tripdata.csv',
 '2010-capitalbikeshare-tripdata.csv',
 '2016Q1-capitalbikeshare-tripdata.csv',
 '201801_capitalbikeshare_tripdata.csv',
 '202006-capitalbi