In [1]:
# Import packages
# Our standard set:
import numpy as np
import pandas as pd
import os, os.path
# Additionally we will need:
import zipfile
import requests
import urllib.request
# Progress bar:
import tqdm

In [2]:
# 1.) Build the list of download URLs.
#     Every archive follows the same URL pattern; the year is the only part
#     that changes, so it is filled into the pattern for 2012 through 2015.

urls = [
    'https://s3.amazonaws.com/tripdata/{}-citibike-tripdata.zip'.format(year)
    for year in range(2012, 2016)
]

In [3]:
# Display the generated list so the URLs can be checked by eye
urls

['https://s3.amazonaws.com/tripdata/2012-citibike-tripdata.zip',
 'https://s3.amazonaws.com/tripdata/2013-citibike-tripdata.zip',
 'https://s3.amazonaws.com/tripdata/2014-citibike-tripdata.zip',
 'https://s3.amazonaws.com/tripdata/2015-citibike-tripdata.zip']

In [4]:
# Names of the plain-text log files that collect errors / problems:
# one for failed downloads, one for archives that could not be extracted.
# NOTE: the `uznip_log` spelling is kept because the extraction step refers
# to the variable under exactly this name.

uznip_log = 'log-uzip.txt'
download_log = 'log-download.txt'

In [5]:
# 2.) Download and unpack every archive
#
#     For each URL: skip it when the target path (archive name without
#     '.zip') already exists, otherwise download the zip archive and extract
#     it into the current directory. Failures are appended to the log files
#     instead of aborting the whole loop.

for url in tqdm.tqdm(urls):
    # The name of the zip archive is the last path component of the url
    filename = url.split('/')[-1]
    # Archive name without its '.zip' extension -- presumably the name of
    # the extracted data on disk
    extracted_name = os.path.splitext(filename)[0]

    # if it exists already -> we can skip this url
    if os.path.exists(extracted_name):
        print(extracted_name, ': done')
        continue

    try:  # try to download the data
        urllib.request.urlretrieve(url, filename)
    except Exception as e:  # download failed -> record url and error in the log
        with open(download_log, 'a') as f:
            print(url, file=f)
            print(e, file=f)
        # Nothing usable was downloaded: skip extraction, otherwise the unzip
        # step would fail as well and log a second, misleading error.
        continue

    # Extract all data in the zip archive into the current directory
    try:
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('.')
    except Exception as e:  # extraction failed -> record archive and error in the log
        with open(uznip_log, 'a') as f:
            print(filename, file=f)
            print(e, file=f)
        

100%|████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.66it/s]

2013-citibike-tripdata : done
2014-citibike-tripdata : done
2015-citibike-tripdata : done





In [8]:
# 3.) Clean up
#     Collect the zip archives found in the current working directory
#     (this cell only lists them; nothing is deleted here).

zipfiles = [
    entry
    for entry in os.listdir()
    if entry.endswith('.zip')
]
zipfiles

['2013-citibike-tripdata.zip',
 '2015-citibike-tripdata.zip',
 '2014-citibike-tripdata.zip']