Skip to content
Permalink
Browse files

Restartability and retry failed GETs

  • Loading branch information...
bmoscon committed Jan 25, 2019
1 parent a16d103 commit 28137ab47cc45ae73cad2a74d4d3e5d775fe0fb4
Showing with 42 additions and 14 deletions.
  1. +42 −14 scrape.py
@@ -4,6 +4,8 @@
import glob
import os
import shutil
import sys
import time

import requests

@@ -12,21 +14,25 @@
endpoint = 'https://s3-eu-west-1.amazonaws.com/public.bitmex.com/data/trade/{}.csv.gz'


def scrape(year):
if year == 2014:
# First date available
date = dt(2014, 11, 22)
else:
date = dt(year, 1, 1)

def scrape(year, date):
end_date = min(dt(year, 12, 31), dt.today() - timedelta(days=1))


while date <= end_date:
date_str = date.strftime('%Y%m%d')
print("Processing {}...".format(date))
r = requests.get(endpoint.format(date_str))
r.raise_for_status()
count = 0
while True:
r = requests.get(endpoint.format(date_str))
if r.status_code == 200:
break
else:
count += 1
if count == 10:
r.raise_for_status()
print("Error processing {} - {}, trying again".format(date, r.status_code))
time.sleep(10)


with open(date_str, 'wb') as fp:
fp.write(r.content)

@@ -55,7 +61,29 @@ def merge(year):


def scrape_plan(args, today):
    """Work out which years to scrape and the first day to fetch in each.

    Args:
        args: Command-line arguments without the program name. Exactly one
            argument is read as a restart date in ``YYYYMMDD`` form; any
            other number of arguments means "start from the first day of
            data".
        today: The current ``datetime``; its year is the last year planned.

    Returns:
        A list of ``(year, start)`` pairs in ascending year order. The first
        pair starts on the requested (or default) date, every later pair on
        January 1st of its year. The list is empty when the restart year
        lies after ``today.year``.

    Raises:
        ValueError: If the single argument is not a valid ``YYYYMMDD`` date.
    """
    if len(args) == 1:
        stamp = args[0]
        # strptime on its own tolerates one-digit months/days, so insist on
        # exactly eight characters: the same string is later used as the
        # name of the file to delete and must match the %Y%m%d naming.
        if len(stamp) != 8:
            raise ValueError(
                "expected a date in YYYYMMDD format, got {!r}".format(stamp))
        first = dt.strptime(stamp, '%Y%m%d')
    else:
        # 2014-11-22 is the first day of data
        first = dt(2014, 11, 22)

    plan = []
    # An empty range (restart year in the future) simply yields no work
    # instead of failing on an index into an empty list.
    for year in range(first.year, today.year + 1):
        plan.append((year, first if year == first.year else dt(year, 1, 1)))
    return plan


if __name__ == '__main__':
    cli_args = sys.argv[1:]

    try:
        plan = scrape_plan(cli_args, dt.now())
    except ValueError as err:
        sys.exit("Invalid start date: {}".format(err))

    if len(cli_args) == 1:
        # if arg is supplied must be in format YYYYMMDD
        # will attempt to remove that file, if exists
        # in case data is incomplete
        try:
            os.unlink(cli_args[0])
        except FileNotFoundError:
            pass

    for year, first_day in plan:
        scrape(year, first_day)
        merge(year)

0 comments on commit 28137ab

Please sign in to comment.
You can’t perform that action at this time.