In [5]:
import os
import requests
from datetime import datetime, timezone
from tqdm import tqdm

from common import get_paths, hour_urls, ensure_dir

In [None]:
# Project root. The literal below is the original machine-specific default;
# set the BASEMODEL_DIR environment variable to run the notebook elsewhere
# without editing this cell. An unset or empty variable falls back to the default.
BASE_DIR = os.environ.get("BASEMODEL_DIR") or r"C:\Users\EL040\Desktop\MS_3rd-Project\basemodel"

# Inclusive date window (interpreted as UTC) and per-request timeout in seconds.
START_DATE = "2025-01-01"
END_DATE   = "2025-01-07"
TIMEOUT = 120

# Expand the dates to full-day ISO-8601 timestamps: first second of the start
# day through the last second of the end day.
START = f"{START_DATE}T00:00:00Z"
END   = f"{END_DATE}T23:59:59Z"

# get_paths comes from the project-local `common` module; its result is indexed
# by name here to obtain the raw train/test download directories.
paths = get_paths(BASE_DIR)
train_raw = paths["data_train_raw"]
test_raw  = paths["data_test_raw"]
ensure_dir(train_raw)
ensure_dir(test_raw)

# datetime.fromisoformat does not accept a trailing "Z" on older Python
# versions, so it is rewritten as an explicit "+00:00" offset before parsing.
start_utc = datetime.fromisoformat(START.replace("Z", "+00:00")).astimezone(timezone.utc)
end_utc   = datetime.fromisoformat(END.replace("Z", "+00:00")).astimezone(timezone.utc)

In [None]:
def silent_download_one(url: str, out_path: str, timeout: int = TIMEOUT) -> bool:
    """Download ``url`` to ``out_path`` without surfacing download errors.

    The body is streamed into a temporary ``<out_path>.part`` file and only
    renamed to ``out_path`` once the transfer has completed. A failure in the
    middle of the transfer therefore never leaves a truncated, non-empty file
    at ``out_path`` that a later run would mistake for a finished download.

    Args:
        url: Address of the file to fetch.
        out_path: Destination file path.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True only when a new file was downloaded. False when a non-empty file
        already exists at ``out_path`` (nothing is fetched), or when the
        download failed; on failure the URL and exception are written to
        ``<out_path>.ERR.txt``.
    """
    part_path = out_path + ".part"
    err_path = out_path + ".ERR.txt"
    try:
        if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
            return False
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Promote the finished download in one step; os.replace overwrites an
        # existing (e.g. zero-byte) destination.
        os.replace(part_path, out_path)
    except Exception as e:
        # Drop any partial data so the next run retries from scratch.
        try:
            os.remove(part_path)
        except OSError:
            pass  # the .part file was never created, or is already gone
        with open(err_path, "w", encoding="utf-8") as ef:
            ef.write(f"Failed to download {url}\n{repr(e)}\n")
        return False

    # A successful retry supersedes the error report from an earlier failure.
    try:
        os.remove(err_path)
    except OSError:
        pass  # no earlier error report to clean up
    return True

In [None]:
# Materialize every (hour, url) pair in the window so the total is known up
# front, then split by position.
hours = list(hour_urls(start_utc, end_utc))
n_total = len(hours)
mid_idx = n_total // 2   # first half → train, second half → test

print(f"총 {n_total}개 시간 파일 (train={mid_idx}, test={n_total-mid_idx}) 다운로드 시작")

ok_train = ok_test = skip_train = skip_test = fail_cnt = 0

with tqdm(total=n_total, ncols=30, desc="다운로드 진행률") as pbar:
    for i, (hour_dt, url) in enumerate(hours):
        is_train = i < mid_idx
        # File name pattern: <YYYY-MM-DD>-<hour, not zero-padded>.json.gz
        out_path = os.path.join(
            train_raw if is_train else test_raw,
            f"{hour_dt.strftime('%Y-%m-%d')}-{hour_dt.hour}.json.gz",
        )

        # A non-empty file on disk counts as already downloaded.
        already_present = os.path.exists(out_path) and os.path.getsize(out_path) > 0

        if already_present:
            if is_train:
                skip_train += 1
            else:
                skip_test += 1
        elif silent_download_one(url, out_path):
            if is_train:
                ok_train += 1
            else:
                ok_test += 1
        else:
            # Failures are tallied together, not per split.
            fail_cnt += 1

        pbar.update(1)

print(
    f"\n다운로드 완료\n"
    f"  train → 성공 {ok_train}, 건너뜀 {skip_train}\n"
    f"  test  → 성공 {ok_test}, 건너뜀 {skip_test}\n"
    f"  실패: {fail_cnt}"
)

총 168개 시간 파일 (train=84, test=84) 다운로드 시작


1주일 진행률: 100%|█| 168/168 

다운로드 완료
  train → 성공 84, 건너뜀 0
  test  → 성공 84, 건너뜀 0
  실패: 0



