### Fetch Raw Data

Run this notebook once to download the raw data file from a remote URL, extract its contents if the file is a zip archive, and make them available in the raw data directory. The fetch URL (`RAW_DATA_URL`) and the raw data directory (`RAW_DATA_PATH`) are loaded from the `.env` file.

In [None]:
# Load the `dotenv` IPython extension, then use its `%dotenv` magic to read the
# project's .env file into this kernel's environment variables.
# Run once per kernel session. If the .env file is updated (rare), restart the
# kernel and rerun this cell.
# NOTE(review): the restart advice presumably exists because `%dotenv` does not
# override variables that are already set unless asked to - confirm.

%load_ext dotenv
%dotenv

In [None]:
import os
import requests

# Read the download URL and the target directory from the environment.
# Either value is None when the corresponding variable is not set.
RAW_DATA_URL, RAW_DATA_PATH = (
    os.environ.get(key) for key in ("RAW_DATA_URL", "RAW_DATA_PATH")
)

# Echo the configuration so a wrong or missing .env entry is visible immediately
print("Raw data fetch URL:", RAW_DATA_URL)
print("Raw data directory:", RAW_DATA_PATH)

Raw data fetch URL: https://archive.ics.uci.edu/static/public/352/online+retail.zip
Raw data directory: ../data/raw


In [3]:
# Utility function to fetch and extract data
def fetch_and_extract_data(url, save_path, timeout=60):
    """Download ``url`` to ``save_path`` and unpack it if it is a zip archive.

    Args:
        url: Address of the file to download.
        save_path: Destination file path. Missing parent directories are
            created. If the path ends with ``.zip``, the archive is extracted
            into the directory that contains it.
        timeout: Seconds to wait for the server before giving up; passed
            through to ``requests.get``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
            Nothing is written to disk in that case.
        zipfile.BadZipFile: If ``save_path`` ends with ``.zip`` but the
            downloaded content is not a valid zip archive.
    """
    import zipfile

    # dirname is '' for a bare file name; os.makedirs('') would raise, so
    # fall back to the current directory.
    target_dir = os.path.dirname(save_path) or "."

    print(f"Fetching data from {url}...")

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail here rather than saving an HTTP error page as if it were the data.
    response.raise_for_status()

    os.makedirs(target_dir, exist_ok=True)
    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Data fetched and saved to {save_path}")

    # Check if file is a zip and extract if so
    if save_path.endswith('.zip'):
        with zipfile.ZipFile(save_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
        print(f"Data files(s) extracted to {target_dir}")

In [4]:
# Download the archive into the raw data directory; because the target name
# ends in .zip, its contents are unpacked into that same directory.
fetch_and_extract_data(
    url=RAW_DATA_URL,
    save_path=os.path.join(RAW_DATA_PATH, 'online_retail.zip'),
)

Fetching data from https://archive.ics.uci.edu/static/public/352/online+retail.zip...
Data fetched and saved to ../data/raw\online_retail.zip
Data files(s) extracted to ../data/raw
