In [15]:
import requests
import os
import json
from datetime import datetime
import time

# Default HTTP request headers; GDELTRetriever uses them when the caller
# supplies none. The User-Agent value imitates a desktop Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

class GDELTRetriever:
    """Fetch results from the GDELT DOC API and save them to local files.

    Typical use is ``GDELTRetriever().retrieve("some query", timespan="2d")``,
    which builds the query parameters, performs the HTTP request (retrying when
    the server answers 429) and writes the result into ``save_path``.
    """

    # Translation table used to turn a query into a file-name fragment:
    # characters that are path separators or are rejected by common file
    # systems become "-", whitespace becomes "_".
    _FILENAME_TRANSLATION = str.maketrans(
        '\\/:*?"<>|' + " \t\r\n",
        "-" * 9 + "_" * 4,
    )

    def __init__(
        self,
        save_path="./gdelt_results",
        max_retries=3,
        backoff_factor=2,
        headers=None,
        timeout=30
    ):
        """
        Args:
            - save_path (str): Directory where results are written; created if missing.
            - max_retries (int): Maximum number of request attempts per fetch.
            - backoff_factor (int | float): Base of the exponential wait between
            rate-limited attempts (wait = backoff_factor ** attempt seconds).
            - headers (dict | None): HTTP headers to send. ``None`` selects the
            module-level ``HEADERS``; pass ``{}`` to send no custom headers.
            - timeout (float | tuple | None): Timeout in seconds handed to
            ``requests.get`` so a stalled connection cannot block forever.
        """
        self.api_url = "https://api.gdeltproject.org/api/v2/doc/doc"  # base API URL
        self.save_path = save_path
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        # Copy so that editing one instance's headers never alters the shared
        # module-level default or the caller's own dict.
        self.headers = dict(HEADERS if headers is None else headers)
        self.timeout = timeout
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(save_path, exist_ok=True)

    def build_query(
        self,
        query,
        mode="ArtList",
        timespan=None,
        startdatetime=None,
        enddatetime=None,
        format="JSON",
        sort="HybridRel",
        **kwargs
    ):
        """
        Assemble the request parameters for one API call (no network access).

        Args:
            - query (str): Search query.
            - mode (str): e.g. 'ArtList', 'TimelineVol', 'TimelineTone', 'ToneChart'. Defaults to 'ArtList'.
            - timespan (str): Timespan for search (e.g., '30d' for 30 days). (no need to pass if startdatetime
            is already passed)
            - startdatetime (str): in 'YYYYMMDDHHMMSS' format.
            - enddatetime (str): in 'YYYYMMDDHHMMSS' format.
            - format (str): 'JSON', 'CSV', etc. Defaults to 'JSON'.
            - sort (str): Sort by ... (e.g. HybridRel - relevance, DateDesc - by date in descending order,
            ToneDesc - by tone in descending order).
            - **kwargs: Additional GDELT API parameters. They are applied last and
            therefore override any of the named parameters above.

        Returns:
            dict: Query parameters.
        """
        params = {
            "query": query,
            "mode": mode,
            "format": format,
            "sort": sort
        }
        # Optional time filters are only sent when the caller provided them.
        optional = {
            "timespan": timespan,
            "startdatetime": startdatetime,
            "enddatetime": enddatetime,
        }
        params.update({key: value for key, value in optional.items() if value})
        params.update(kwargs)
        return params

    def fetch_results(self, params):
        """
        Perform the HTTP GET described by ``params``.

        A 429 (Too Many Requests) answer is retried up to ``max_retries``
        attempts with an exponential wait; any other failure aborts at once.
        All failures are reported with ``print`` rather than raised.

        Args:
            - params (dict): Query parameters, e.g. from ``build_query``.

        Returns:
            dict, list or str: Parsed JSON when ``params['format']`` is 'JSON'
            (case-insensitive, also the default when the key is absent),
            otherwise the raw response text. ``None`` when the request failed
            or the body could not be decoded as JSON.
        """
        wants_json = str(params.get("format", "JSON")).upper() == "JSON"

        for attempt in range(self.max_retries):
            try:
                response = requests.get(
                    self.api_url,
                    params=params,
                    headers=self.headers,
                    timeout=self.timeout,
                )
                response.raise_for_status()
            except requests.exceptions.HTTPError as e:
                status = e.response.status_code if e.response is not None else None
                if status != 429:
                    print(f"HTTP error: {e}")
                    return None
                # Rate limited: wait only if another attempt will actually follow.
                if attempt + 1 < self.max_retries:
                    wait_time = self.backoff_factor ** attempt
                    print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                continue
            except requests.exceptions.RequestException as e:
                print(f"Request error: {e}")
                return None

            if not wants_json:
                return response.text
            try:
                return response.json()
            except ValueError as e:
                # A successful status does not guarantee a JSON body; every
                # JSON decode error raised by requests is a ValueError.
                print(f"Could not decode JSON response: {e}")
                return None

        print("Max retries reached. Could not fetch data.")
        return None

    def save_results(
        self,
        data,
        query,
        format="JSON"
    ):
        """
        Write ``data`` to ``<save_path>/<sanitised query>_<timestamp>.<format>``.

        Args:
            - data: JSON-serialisable object when format is 'JSON', otherwise a str.
            - query (str): Search query; its first 50 sanitised characters
            become the start of the file name.
            - format (str): 'JSON' (written with ``json.dump``) or any other
            value (written verbatim). Also used, lower-cased, as the extension.

        Returns:
            str or None: Path of the written file, or None if writing failed.
        """
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        # Queries routinely contain quotes and colons, which are not valid in
        # file names on every platform, so map them to safe characters.
        query_normalized = query.translate(self._FILENAME_TRANSLATION)[:50]
        file_name = f"{query_normalized}_{timestamp}.{format.lower()}"
        file_path = os.path.join(self.save_path, file_name)

        try:
            with open(file_path, "w", encoding="utf-8") as f:
                if format.upper() == "JSON":
                    json.dump(data, f, ensure_ascii=False, indent=4)
                else:
                    f.write(data)
        except OSError as e:
            print(f"Error saving file: {e}")
            return None

        print(f"Results saved to {file_path}")
        return file_path

    def retrieve(
        self,
        query,
        mode="ArtList",
        format="JSON",
        **kwargs,
    ):
        """
        Build the query, fetch the results and, when something non-empty came
        back, save it to disk.

        Args:
            - query (str): Search query.
            - mode (str): e.g. 'ArtList', 'TimelineVol', 'TimelineTone', 'ToneChart'. Defaults to 'ArtList'.
            - timespan (str): Timespan for search (e.g., '30d' for 30 days). (no need to pass if startdatetime
            is already passed)
            - startdatetime (str): in 'YYYYMMDDHHMMSS' format.
            - enddatetime (str): in 'YYYYMMDDHHMMSS' format.
            - format (str): 'JSON', 'CSV', etc. Defaults to 'JSON'.
            - sort (str): Sort by ... (e.g. HybridRel - relevance, DateDesc - by date in descending order,
            ToneDesc - by tone in descending order).
            - **kwargs: Additional GDELT API parameters.

        Returns:
            dict or str: Retrieved data, or None when the fetch failed.
        """
        params = self.build_query(query, mode=mode, format=format, **kwargs)
        data = self.fetch_results(params)
        # Empty results (e.g. {} or "") are returned to the caller but not saved.
        if data:
            self.save_results(data, query, format)
        return data


In [16]:
# Demo: fetch a single recent article for the query and let the retriever save it.
parser = GDELTRetriever()
# Reuse the retriever's endpoint instead of retyping it; the hand-typed copy
# had lost its final character (".../doc/do").
api_url = parser.api_url

query = "(climate OR global warming)"
data = parser.retrieve(query, mode="ArtList", format="JSON", timespan="2d", maxrecords=1)
# retrieve() returns None when the request failed, so do not claim success then.
print("Retrieval complete." if data is not None else "Retrieval failed.")
# To inspect the raw HTTP exchange while debugging:
# params = parser.build_query(query)
# response = requests.get(api_url, params=params, headers=parser.headers)

Results saved to ./gdelt_results/(climate_OR_global_warming)_20250120180432.json
Retrieval complete.
