<a href="https://colab.research.google.com/github/atasevski/gdelt-echo-chambers/blob/main/notebooks/02_data_download_and_merge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!git clone https://github.com/atasevski/gdelt-echo-chambers.git
%cd gdelt-echo-chambers
!mkdir -p data src results

fatal: destination path 'gdelt-echo-chambers' already exists and is not an empty directory.
/content/gdelt-echo-chambers


In [3]:
%%writefile src/download_gkg.py
# ... (кодот од download_gkg.py што го имаш во canvas) ...
import os
import requests
import zipfile
import io
from datetime import datetime
from urllib.parse import urljoin

BASE_URL = "http://data.gdeltproject.org/gdeltv2/"
MASTER_LIST_URL = urljoin(BASE_URL, "masterfilelist.txt")

def download_gkg_data(start_date: str, end_date: str, save_dir: str = "data/"):
    """
    Download and extract GKG .csv.zip files from GDELT for a given inclusive date range.
    It reads the masterfilelist.txt to get exact timestamped filenames.

    Args:
        start_date (str): Start date in YYYYMMDD format, e.g., '20250701'.
        end_date (str):   End date in YYYYMMDD format, e.g., '20250707'.
        save_dir (str):   Directory path where data will be saved. Defaults to 'data/'.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Fetch master file list
    print(f"Fetching master file list from {MASTER_LIST_URL}...")
    resp = requests.get(MASTER_LIST_URL, timeout=60)
    resp.raise_for_status()
    lines = resp.text.strip().splitlines()

    # Parse dates for filtering
    fmt_full = "%Y%m%d%H%M%S"
    fmt_day = "%Y%m%d"
    start_dt = datetime.strptime(start_date, fmt_day).date()
    end_dt = datetime.strptime(end_date, fmt_day).date()

    selected_urls = []
    for entry in lines:
        # Each entry may be full URL or relative path
        filename = entry.split('/')[-1]
        if not filename.endswith('.gkg.csv.zip'):
            continue
        try:
            ts = filename.split('.gkg.csv.zip')[0]
            dt = datetime.strptime(ts, fmt_full)
        except ValueError:
            continue
        dt_date = dt.date()
        if start_dt <= dt_date <= end_dt:
            # Build full URL
            url = entry if entry.startswith('http') else urljoin(BASE_URL, filename)
            selected_urls.append(url)

    if not selected_urls:
        print(f"No GKG files found between {start_date} and {end_date}.")
        return

    # Download and extract
    for url in selected_urls:
        print(f"Downloading: {url}")
        try:
            r = requests.get(url, timeout=60)
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to download {url}: {e}")
            continue

        with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
            for member in zf.namelist():
                if member.endswith('.csv'):
                    # Save with timestamped filename
                    date_part = member.split('.gkg.csv')[0]
                    out_name = f"{date_part}.gkg.csv"
                    out_path = os.path.join(save_dir, out_name)
                    with open(out_path, 'wb') as f_out:
                        f_out.write(zf.read(member))
                    print(f"Saved: {out_path}")
if __name__ == '__main__':
  download_gkg_data('20250701', '20250707', save_dir='data/')

Overwriting src/download_gkg.py


In [1]:
%%writefile src/process_gkg.py

import os
import pandas as pd
from glob import glob

# Индекси на колони што ги селектираме од CSV (позиции во табелата без header)
SELECTED_IDX = {
    'GKGRECORDID': 0,
    'DATE': 1,
    'SourceCollectionIdentifier': 2,
    'SourceCommonName': 3,
    'EnhancedThemes': 17,
    'V2Tone': 15,
    'EnhancedPersons': 19,
    'EnhancedOrganizations': 21,
    'EnhancedLocations': 18,
    'EnhancedURLs': -1  # последна колона
}


def load_and_concat(data_dir: str = "data/") -> pd.DataFrame:
    """
    Вчитува, селектира и конкатенира GKG CSV фајлови во еден DataFrame.

    - Чита ги сите *.gkg.csv фајлови во data_dir без header.
    - Селектира колони според позиции (SELECTED_IDX) и ги ренамува.
    - Враќа обединет DataFrame.
    """
    pattern = os.path.join(data_dir, "*.gkg.csv")
    files = sorted(glob(pattern))
    if not files:
        raise FileNotFoundError(f"No GKG CSV files found in {data_dir}")

    dfs = []
    for fp in files:
        print(f"Loading {fp}")
        # Читање таб делимитед без header, користи latin-1 за да избегне UnicodeDecodeError
        df = pd.read_csv(
            fp,
            sep="\t",
            header=None,
            dtype=str,
            encoding='latin-1',
            engine='python',
            on_bad_lines='warn'
        )
        # Определи последна колона
        last_col = df.shape[1] - 1

        # Избери колони и именувај
        cols = []
        names = []
        for name, idx in SELECTED_IDX.items():
            actual = last_col if idx == -1 else idx
            cols.append(actual)
            names.append(name)

        sub = df.iloc[:, cols]
        sub.columns = names
        dfs.append(sub)

    combined = pd.concat(dfs, ignore_index=True)
    print(f"Combined DataFrame shape: {combined.shape}")
    return combined


def save_combined(df: pd.DataFrame, out_path: str = "results/combined_gkg.parquet"):
    """
    Сочува DataFrame во Parquet формат.
    """
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_parquet(out_path, index=False)
    print(f"Saved combined dataset to {out_path}")


if __name__ == '__main__':
    combined_df = load_and_concat("data/")
    save_combined(combined_df)

Writing src/process_gkg.py


In [4]:
from src.download_gkg import download_gkg_data
download_gkg_data('20250701', '20250707', save_dir='data/')

Fetching master file list from http://data.gdeltproject.org/gdeltv2/masterfilelist.txt...
Downloading: http://data.gdeltproject.org/gdeltv2/20250702021500.gkg.csv.zip
Saved: data/20250702021500.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702023000.gkg.csv.zip
Saved: data/20250702023000.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702024500.gkg.csv.zip
Saved: data/20250702024500.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702030000.gkg.csv.zip
Saved: data/20250702030000.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702031500.gkg.csv.zip
Saved: data/20250702031500.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702033000.gkg.csv.zip
Saved: data/20250702033000.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702034500.gkg.csv.zip
Saved: data/20250702034500.gkg.csv
Downloading: http://data.gdeltproject.org/gdeltv2/20250702040000.gkg.csv.zip
Saved: data/20250702040000.gkg.csv
Downloading: h

In [5]:
from src.process_gkg import load_and_concat, save_combined
combined_df = load_and_concat('data/')
combined_df.head()

Loading data/20250702021500.gkg.csv
Loading data/20250702023000.gkg.csv
Loading data/20250702024500.gkg.csv
Loading data/20250702030000.gkg.csv
Loading data/20250702031500.gkg.csv
Loading data/20250702033000.gkg.csv
Loading data/20250702034500.gkg.csv
Loading data/20250702040000.gkg.csv
Loading data/20250702041500.gkg.csv
Loading data/20250702043000.gkg.csv
Loading data/20250702044500.gkg.csv
Loading data/20250702050000.gkg.csv
Loading data/20250702051500.gkg.csv
Loading data/20250702053000.gkg.csv
Loading data/20250702054500.gkg.csv
Loading data/20250702060000.gkg.csv
Loading data/20250702061500.gkg.csv
Loading data/20250702063000.gkg.csv
Loading data/20250702064500.gkg.csv
Loading data/20250702070000.gkg.csv
Loading data/20250702071500.gkg.csv
Loading data/20250702073000.gkg.csv
Loading data/20250702074500.gkg.csv
Loading data/20250702080000.gkg.csv
Loading data/20250702081500.gkg.csv
Loading data/20250702083000.gkg.csv
Loading data/20250702084500.gkg.csv
Loading data/20250702090000.


  df = pd.read_csv(


Loading data/20250702153000.gkg.csv
Loading data/20250702154500.gkg.csv
Loading data/20250702160000.gkg.csv
Loading data/20250702161500.gkg.csv
Loading data/20250702163000.gkg.csv
Loading data/20250702164500.gkg.csv
Loading data/20250702170000.gkg.csv
Loading data/20250702171500.gkg.csv
Loading data/20250702173000.gkg.csv
Loading data/20250702174500.gkg.csv
Loading data/20250702180000.gkg.csv
Loading data/20250702181500.gkg.csv
Loading data/20250702183000.gkg.csv
Loading data/20250702184500.gkg.csv
Loading data/20250702190000.gkg.csv
Loading data/20250702191500.gkg.csv
Loading data/20250702193000.gkg.csv
Loading data/20250702194500.gkg.csv
Loading data/20250702200000.gkg.csv
Loading data/20250702201500.gkg.csv
Loading data/20250702203000.gkg.csv
Loading data/20250702204500.gkg.csv
Loading data/20250702210000.gkg.csv
Loading data/20250702211500.gkg.csv
Loading data/20250702213000.gkg.csv
Loading data/20250702214500.gkg.csv
Loading data/20250702220000.gkg.csv
Loading data/20250702221500.

Unnamed: 0,GKGRECORDID,DATE,SourceCollectionIdentifier,SourceCommonName,EnhancedThemes,V2Tone,EnhancedPersons,EnhancedOrganizations,EnhancedLocations,EnhancedURLs
0,20250702021500-0,20250702021500,1,yahoo.com,"wc:174,c12.1:9,c12.10:13,c12.12:8,c12.13:1,c12...","-8.2051282051282,0.512820512820513,8.717948717...",,https://youtube.com/c/yahoonews\;https://youtu...,https://media.zenfs.com/en/wreg_articles_784/6...,<PAGE_PRECISEPUBTIMESTAMP>20250702000600</PAGE...
1,20250702021500-1,20250702021500,1,justjared.com,"wc:50,c12.1:3,c12.10:6,c12.12:3,c12.13:1,c12.1...","-1.92307692307692,0,1.92307692307692,1.9230769...",,,https://cdn01.justjared.com/wp-content/uploads...,<PAGE_LINKS>https://www.justjared.com/2025/07/...
2,20250702021500-2,20250702021500,1,tvguide.co.uk,"wc:74,c1.3:1,c12.1:5,c12.10:12,c12.12:3,c12.13...",-404414174,,,https://tv.assets.pressassociation.io/ee701b40...,<PAGE_LINKS>https://www.tvguide.co.uk/channel/...
3,20250702021500-3,20250702021500,1,yahoo.com,"wc:335,c12.1:22,c12.10:30,c12.12:4,c12.13:14,c...","-2.1680216802168,1.3550135501355,3.52303523035...",,https://youtube.com/watch?v=ADbu_ifqSBA;https:...,https://media.zenfs.com/en/Benzinga/dd3b2c7dee...,<PAGE_LINKS>https://www.benzinga.com/24/11/421...
4,20250702021500-4,20250702021500,1,gamerant.com,"wc:867,c1.2:5,c12.1:94,c12.10:127,c12.12:30,c1...","2.2289766970618,5.77507598784195,3.54609929078...",https://static0.gamerantimages.com/wordpress/w...,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,https://static0.gamerantimages.com/wordpress/w...,<PAGE_LINKS>https://gamerant.com/db/video-game...
