#### Downloading Historical Data

In [6]:
import requests
import boto3
import io
import zipfile
import csv
import pandas as pd
import re
import os
from bs4 import BeautifulSoup

##### Downloading all files 

In [None]:
!mkdir data_temp

In [152]:
# Source: the Capital Bikeshare trip-data bucket, whose browsable index is
# https://s3.amazonaws.com/capitalbikeshare-data/index.html. The bucket is read through the
# S3 API rather than by scraping that page.
bucket_name = 'capitalbikeshare-data'
prefix = ''
download_dir = "./data_temp"

# NOTE(review): this client uses whatever AWS credentials are configured locally. The bucket
# looks publicly readable, so unsigned access may be possible -- confirm before relying on it.
s3 = boto3.client('s3')

# Make the cell self-sufficient: do not depend on the mkdir cell having been run first
os.makedirs(download_dir, exist_ok=True)

# A single list_objects_v2 call returns one page of keys only; the paginator follows the
# continuation tokens so every object in the bucket is visited.
paginator = s3.get_paginator('list_objects_v2')

for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):

    # 'Contents' is missing from the response when a page holds no keys
    for obj in page.get('Contents', []):

        if not obj['Key'].endswith('.zip'):
            continue

        # The whole archive is pulled into memory; member names are unknown until it is opened
        response = s3.get_object(Bucket=bucket_name, Key=obj['Key'])

        with zipfile.ZipFile(io.BytesIO(response['Body'].read())) as zipfile_content:

            for member in zipfile_content.infolist():
                filename = member.filename

                # Directory entries carry no data
                if member.is_dir():
                    continue

                # Keep the download directory flat (later cells list it non-recursively) and
                # never write outside it: skip members whose name has a directory component.
                if os.path.basename(filename) != filename:
                    print(f"Skipping nested archive member {filename!r} in {obj['Key']}")
                    continue

                target_path = os.path.join(download_dir, filename)

                # Check if file exists already
                if os.path.exists(target_path):
                    continue

                # Extract the content as bytes object
                csv_content = zipfile_content.read(member)

                # Write to csv
                with open(target_path, "wb") as fp:
                    fp.write(csv_content)

In [5]:
import os

# Add up the on-disk size (in bytes) of every file found under ./data_temp/, recursively
total_size = sum(
    os.path.getsize(os.path.join(folder, name))
    for folder, _subfolders, names in os.walk('./data_temp/')
    for name in names
)

total_size # ~4.82 GiB (~5.17 GB) when this was last run

5174216799

##### Concatenating Files

Before we concatenate, we have to determine which schema each file uses.

In [7]:
# Build one record per downloaded file: the period encoded in its name (year plus either a
# quarter or a month), its header row, and its path.
metadata_columns = ['year', 'month', 'quarter', 'header', 'filepath']

file_metadata = []
for file in os.listdir("./data_temp"):

    local_path = os.path.join("./data_temp", file)

    # Trip-data files start with a four-digit year; skip anything else (sub-directories,
    # hidden/system files) instead of failing on a missing regex match.
    year_match = re.match('[0-9]{4}', file)
    if year_match is None or not os.path.isfile(local_path):
        continue

    # Only the first line is needed to identify the schema
    with open(local_path, "r") as fp:
        header = fp.readline().strip()
        header = header.replace('"', "")

    year = year_match[0]

    # Quarterly files carry a "Q<digit>" marker; monthly files have two more digits right
    # after the year. A file has at most one of the two, the other stays empty.
    quarter_match = re.search(r'(?<=Q)[0-9]', file)
    quarter = quarter_match[0] if quarter_match else ""

    month_match = re.search(r'(?<=\d{4})\d{2}', file)
    month = month_match[0] if month_match else ""

    file_metadata.append({'year':year, 'month':month, 'quarter':quarter, 'header':header, 'filepath':'./data_temp/' + file})

# Passing the columns explicitly keeps the sort below valid even when no file was found
df = pd.DataFrame(file_metadata, columns=metadata_columns)
df = df.sort_values(["year","month", "quarter"])
df

Unnamed: 0,year,month,quarter,header,filepath
28,2010,,,"Duration,Start date,End date,Start station num...",./data/2010-capitalbikeshare-tripdata.csv
85,2011,,,"Duration,Start date,End date,Start station num...",./data/2011-capitalbikeshare-tripdata.csv
10,2012,,1,"Duration,Start date,End date,Start station num...",./data/2012Q1-capitalbikeshare-tripdata.csv
0,2012,,2,"Duration,Start date,End date,Start station num...",./data/2012Q2-capitalbikeshare-tripdata.csv
87,2012,,3,"Duration,Start date,End date,Start station num...",./data/2012Q3-capitalbikeshare-tripdata.csv
...,...,...,...,...,...
18,2022,10,,"ride_id,rideable_type,started_at,ended_at,star...",./data/202210-capitalbikeshare-tripdata.csv
42,2022,11,,"ride_id,rideable_type,started_at,ended_at,star...",./data/202211-capitalbikeshare-tripdata.csv
3,2022,12,,"ride_id,rideable_type,started_at,ended_at,star...",./data/202212-capitalbikeshare-tripdata.csv
81,2023,01,,"ride_id,rideable_type,started_at,ended_at,star...",./data/202301-capitalbikeshare-tripdata.csv


In [8]:
# How many files use each distinct header row
df['header'].value_counts()


Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type                                             53
ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual    35
Name: header, dtype: int64

In [9]:
# Header row of the newer schema, copied from the value counts above
new_header = "ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual"

# First row (in the sorted order) that carries the new header -- first used 04/2020
df.loc[df['header'].eq(new_header)].iloc[0]

year                                                     2020
month                                                      04
quarter                                                      
header      ride_id,rideable_type,started_at,ended_at,star...
filepath          ./data/202004-capitalbikeshare-tripdata.csv
Name: 72, dtype: object

`rideable_type`, `start_lat`, `start_lng`, `end_lat`, and `end_lng` are the notable fields added by the new schema. Let's concatenate the files, keeping the two schema versions in separate outputs.

In [11]:
# Split the file paths by schema: one list per header version
uses_new_schema = df['header'] == new_header
old_schema_files = df.loc[~uses_new_schema, 'filepath'].tolist()
new_schema_files = df.loc[uses_new_schema, 'filepath'].tolist()

In [2]:
import glob  # not used in this cell; left in place in case a later cell relies on it

# to_csv does not create missing directories, and no earlier cell creates ./data
os.makedirs("data", exist_ok=True)

# Lazily read every old-schema file so pd.concat pulls them in one at a time
file_gen = (pd.read_csv(file) for file in old_schema_files)

# NOTE: from here on `df` is the concatenated trip data, no longer the file-metadata table
df = pd.concat(file_gen, ignore_index=True)

# Normalise the old column names to snake_case, e.g. "Start date" -> "start_date"
df.columns = [col.lower().replace(" ", "_") for col in df.columns]
df.to_csv("data/bks_tripdata_v1.csv", index=False)

In [17]:
# to_csv does not create missing directories, and no earlier cell creates ./data
os.makedirs("data", exist_ok=True)

# NOTE(review): the stray `objs = list(objs)` output under this cell looks like the tail of a
# pandas warning raised while the files were read -- presumably mixed-type inference; confirm.
# low_memory=False makes read_csv infer each column's dtype from the whole file, not per chunk.
file_gen = (pd.read_csv(file, low_memory=False) for file in new_schema_files)

# NOTE: `df` is rebound to the concatenated new-schema trip data
df = pd.concat(file_gen, ignore_index=True)
df.to_csv("data/bks_tripdata_v2.csv", index=False)

  objs = list(objs)


Let's move our new files to the main `../data` directory and delete `./data_temp`: 

In [16]:
%%bash
# Stop at the first failing command so the raw downloads are only deleted after the
# concatenated files have actually been moved.
set -euo pipefail

# -p: do not fail when ../data already exists (keeps the cell re-runnable)
mkdir -p ../data

mv ./data/bks*.csv ../data/

rm -rf ./data_temp