In [1]:
import requests
import datetime
import pandas as pd
import pickle
import time
import os

# doc_urls is consumed further down as (description, url-path) pairs via .items().
# NOTE: unpickling can execute arbitrary code -- only load a doc_urls.pickle you trust.
with open('doc_urls.pickle', mode='rb') as pickle_file:
    doc_urls = pickle.load(pickle_file)

# Two hosting roots for the CSVs: the download loop tries `base_url` first and
# falls back to `alt_base_url`.
base_url = "https://files.zillowstatic.com/research/public_csvs/"
alt_base_url = "https://files.zillowstatic.com/research/public_v2/"

# Query-string suffix appended to every request: the current Unix time in whole
# seconds, computed once when this cell runs (presumably a cache-buster -- confirm).
end_url = "?t=" + str(int(datetime.datetime.now().timestamp()))

# Geography levels; each one is substituted for 'Metro' in a doc_urls path.
Geographies = ["Metro", "State", "County", "City", "Zip", "Neighborhood"]

# Ensure the download root exists before anything is written beneath it.
data_root_missing = not os.path.isdir('./data/')
if data_root_missing:
    print('Creating directory: ' + './data/')
    os.mkdir('./data/')

def download_file(url, path, timeout=60):
    """Download ``url`` to ``./data/<path>`` and discard the result if it has no rows.

    Parameters
    ----------
    url : str
        Full URL to fetch.
    path : str
        Destination relative to ``./data/``. Missing parent directories
        (including ``./data`` itself) are created; a bare file name with no
        directory component is written directly under ``./data/``.
    timeout : float, optional
        Passed to ``requests.get`` so a stalled server raises instead of
        blocking the notebook forever.

    Returns
    -------
    bool
        ``True`` when a CSV with at least one data row was saved. ``False``
        when the server answered with a non-200 status (nothing is written) or
        when the downloaded CSV was empty (the file is removed again).

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors or timeouts.
    pandas.errors.ParserError
        If the saved content cannot be parsed as CSV.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f'Failed to download file with response code {response.status_code}: ' + url)
        return False

    target_path = './data/' + path
    # dirname() handles both flat names ("x.csv" -> ./data) and arbitrarily
    # nested paths ("a/b/x.csv" -> ./data/a/b).
    target_dir = os.path.dirname(target_path)
    if not os.path.isdir(target_dir):
        print('Creating directory: ' + target_dir)
    os.makedirs(target_dir, exist_ok=True)

    with open(target_path, 'wb') as file:
        # Write the content of the response to the file
        file.write(response.content)

    print('Downloaded file: ./data/' + path)

    # A zero-byte body makes read_csv raise EmptyDataError rather than return
    # an empty frame; both cases mean "no data" and get the same cleanup.
    try:
        is_empty = len(pd.read_csv(target_path)) == 0
    except pd.errors.EmptyDataError:
        is_empty = True

    if is_empty:
        print('Empty file, Removing: data/' + path)
        os.remove(target_path)
        return False
    return True
    

In [None]:
# Fetch every dataset in doc_urls at every geography level, trying the primary
# hosting root first and the alternate root second.
for description, url in doc_urls.items():
    for geo in Geographies:
        # Substitute the current geography for 'Metro' in the doc_urls path.
        geo_path = url.replace('Metro', geo)

        for root_url in (base_url, alt_base_url):
            _url = root_url + geo_path + end_url
            print(f"Downloading {description} for {geo} with url {_url}")
            try:
                # download_file prints and returns (rather than raising) on a
                # non-200 response. An explicit False result is treated as a
                # failure; any other result, including None, counts as success.
                succeeded = download_file(_url, geo_path) is not False
            except Exception as e:
                # `except Exception` (not a bare except) so that interrupting
                # the kernel still stops this long-running loop.
                print(f"Failed to download {description} for {geo} with url {_url}")
                print(f"Error: {e}")
                succeeded = False

            if succeeded:
                break
    # Short pause after finishing all geographies for one doc_urls entry.
    time.sleep(0.01)