In [1]:
import requests
import pandas as pd
from io import BytesIO
import zipfile
from datetime import datetime
import os
import logging



In [None]:
# Event date to fetch, formatted YYYYMMDD; it selects which daily export archive is requested.
DATE_STR = "20250101"

# Directory that receives the extracted file; created on first run.
RAW_DATA_PATH = "../data/raw"
os.makedirs(RAW_DATA_PATH, exist_ok=True)

GDELT_BASE_URL = "http://data.gdeltproject.org/events"
ZIP_NAME = f"{DATE_STR}.export.CSV.zip"
URL = f"{GDELT_BASE_URL}/{ZIP_NAME}"

print("Target DATE_STR (event date):", DATE_STR)
print("Downloading from:", URL)

# Bound the request: without a timeout a stalled connection blocks the kernel indefinitely.
response = requests.get(URL, timeout=60)

# Fail loudly instead of printing and carrying on, so a "Run All" stops at the
# real cause rather than at a later cell that expects the extracted file.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to download file. Status code: {response.status_code}. URL: {URL}"
    )

print("Download successful.")
# The archive is held entirely in memory; only the named CSV member is written to disk.
with zipfile.ZipFile(BytesIO(response.content)) as z:
    csv_filename = f"{DATE_STR}.export.CSV"
    print("Extracting:", csv_filename)
    z.extract(csv_filename, path=RAW_DATA_PATH)
    extracted_file_path = os.path.join(RAW_DATA_PATH, csv_filename)
    print("File extracted to:", extracted_file_path)

Target DATE_STR (event date): 20250101
Downloading from: http://data.gdeltproject.org/events/20250101.export.CSV.zip
Download successful.
Extracting: 20250101.export.CSV
File extracted to: ../data/raw/20250101.export.CSV


In [15]:
# Download the daily archive; the timeout keeps a stalled connection from hanging the kernel.
resp = requests.get(URL, timeout=60)

print("HTTP status:", resp.status_code)

if resp.status_code != 200:
    raise Exception(f"Download failed. Status={resp.status_code}. URL={URL}")

# Intentionally not wrapped in `with`: `z` is read from again by the parsing cell below.
z = zipfile.ZipFile(BytesIO(resp.content))
names = z.namelist()
print("Zip contains:", names)

# An empty archive would otherwise print a misleading "multiple files" warning
# and then die with an IndexError on names[0].
if not names:
    raise Exception(f"Zip archive contains no files. URL={URL}")
if len(names) > 1:
    print("Warning: zip contains multiple files. Will read the first one.")

csv_inside = names[0]
extracted_path = os.path.join(RAW_DATA_PATH, csv_inside)
print("Extracting to:", extracted_path)
# Last expression on purpose: the notebook displays the path returned by extract().
z.extract(csv_inside, path=RAW_DATA_PATH)

HTTP status: 200
Zip contains: ['20250101.export.CSV']
Extracting to: ../data/raw/20250101.export.CSV


'../data/raw/20250101.export.CSV'

In [None]:
# Parse the archive member directly from the in-memory zip. The file is
# tab-separated with no header row; every column is kept as a string so no
# value is coerced at load time.
with z.open(csv_inside) as raw_member:  # close the member handle once parsed
    df = pd.read_csv(
        raw_member,
        sep="\t",
        header=None,
        dtype=str,
        low_memory=False,
    )

print("Raw shape:", df.shape)

# Write back in the same layout the file was read with (tab-separated, no
# header). The previous default to_csv() call replaced the raw file with a
# comma-separated one carrying a synthetic 0..N-1 header row, so it could no
# longer be re-read with sep="\t", header=None.
df.to_csv(extracted_path, sep="\t", header=False, index=False)
print("Saved raw data to:", extracted_path)

# Must be the last expression for the notebook to render the preview.
df.head()

Raw shape: (71207, 58)
Saved raw data to: ../data/raw/20250101.export.CSV
