In [None]:
import pandas as pd
import requests
import zipfile
import io
import os
from collections import defaultdict


In [None]:
# Directory that receives the filtered per-month CSV files; created if missing
output_dir = "filtered_data"
os.makedirs(output_dir, exist_ok=True)

# URL prefix under which the GDELT event export archives are published
gdelt_base_url = "http://data.gdeltproject.org/events/"

# Date range (year, month) of the data to download
start_year = 2016
start_month = 1
end_year = 2020
end_month = 12

In [None]:
def generate_urls(base_url=None, start=None, end=None):
    """Build the list of daily GDELT export URLs for a range of months.

    Only real calendar days are generated (28/29/30/31 days depending on the
    month and leap year), so no request is wasted on dates such as Feb 30.

    Parameters
    ----------
    base_url : str, optional
        Prefix to which ``YYYYMMDD.export.CSV.zip`` is appended. Defaults to
        the module-level ``gdelt_base_url``.
    start : tuple of (int, int), optional
        ``(year, month)`` of the first month, inclusive. Defaults to the
        module-level ``(start_year, start_month)``.
    end : tuple of (int, int), optional
        ``(year, month)`` of the last month, inclusive. Defaults to the
        module-level ``(end_year, end_month)``.

    Returns
    -------
    list of (str, str)
        ``(YYYYMM, url)`` pairs, one per calendar day, in chronological order.
        An empty list is returned when ``start`` is later than ``end``.
    """
    import calendar  # stdlib; imported locally so the block is self-contained

    # Fall back to the notebook-level configuration only for omitted arguments.
    if base_url is None:
        base_url = gdelt_base_url
    first_year, first_month = (start_year, start_month) if start is None else start
    last_year, last_month = (end_year, end_month) if end is None else end

    urls = []
    for year in range(first_year, last_year + 1):
        # Clip the month range in the first and last year of the period.
        month_lo = first_month if year == first_year else 1
        month_hi = last_month if year == last_year else 12
        for month in range(month_lo, month_hi + 1):
            days_in_month = calendar.monthrange(year, month)[1]
            for day in range(1, days_in_month + 1):
                date_str = f"{year}{month:02d}{day:02d}"
                url = f"{base_url}{date_str}.export.CSV.zip"
                urls.append((date_str[:6], url))  # YYYYMM is the grouping key
    return urls

# Build the (YYYYMM, url) pairs once at module level; download_and_filter()
# iterates over this `urls` list when it is called without arguments.
urls = generate_urls()

In [None]:
# Every column listed here is mapped to the float dtype
dtype_dict = dict.fromkeys(
    (
        "FractionDate",
        "GoldsteinScale",
        "NumMentions",
        "NumSources",
        "NumArticles",
        "AvgTone",
        "Actor1Geo_Lat",
        "Actor1Geo_Long",
        "Actor2Geo_Lat",
        "Actor2Geo_Long",
        "ActionGeo_Lat",
        "ActionGeo_Long",
    ),
    float,
)

# Country codes for Italy and China
italy_code, china_code = 'ITA', 'CHN'

In [None]:
# Column layout of the tab-separated GDELT event export files (no header row).
_GDELT_EVENT_COLUMNS = [
    "GlobalEventID", "SQLDATE", "MonthYear", "Year", "FractionDate", "Actor1Code", "Actor1Name", "Actor1CountryCode",
    "Actor1KnownGroupCode", "Actor1EthnicCode", "Actor1Religion1Code", "Actor1Religion2Code", "Actor1Type1Code",
    "Actor1Type2Code", "Actor1Type3Code", "Actor2Code", "Actor2Name", "Actor2CountryCode", "Actor2KnownGroupCode",
    "Actor2EthnicCode", "Actor2Religion1Code", "Actor2Religion2Code", "Actor2Type1Code", "Actor2Type2Code", "Actor2Type3Code",
    "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode", "QuadClass", "GoldsteinScale", "NumMentions",
    "NumSources", "NumArticles", "AvgTone", "Actor1Geo_Type", "Actor1Geo_FullName", "Actor1Geo_CountryCode",
    "Actor1Geo_ADM1Code", "Actor1Geo_Lat", "Actor1Geo_Long", "Actor1Geo_FeatureID", "Actor2Geo_Type",
    "Actor2Geo_FullName", "Actor2Geo_CountryCode", "Actor2Geo_ADM1Code", "Actor2Geo_Lat", "Actor2Geo_Long",
    "Actor2Geo_FeatureID", "ActionGeo_Type", "ActionGeo_FullName", "ActionGeo_CountryCode", "ActionGeo_ADM1Code",
    "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID", "DATEADDED", "SOURCEURL"
]

# Columns converted from text to float after filtering (unparseable -> NaN).
_GDELT_NUMERIC_COLUMNS = [
    "FractionDate", "GoldsteinScale", "NumMentions", "NumSources",
    "NumArticles", "AvgTone", "Actor1Geo_Lat", "Actor1Geo_Long",
    "Actor2Geo_Lat", "Actor2Geo_Long", "ActionGeo_Lat", "ActionGeo_Long"
]

# Seconds to wait on the server before abandoning a single download.
_DOWNLOAD_TIMEOUT_S = 60


def _fetch_zip(url):
    """Return the response body of ``url`` as bytes, or None on any failure."""
    try:
        response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_S)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported and treated like a
        # non-200 answer so one bad day does not abort the whole run.
        print(f"Request error for {url}: {exc}")
        return None
    if response.status_code != 200:
        return None
    return response.content


def _filter_actor_pair(df, code_a, code_b):
    """Keep rows whose two actors are ``code_a``/``code_b`` in either order."""
    actor1 = df["Actor1CountryCode"]
    actor2 = df["Actor2CountryCode"]
    mask = ((actor1 == code_a) & (actor2 == code_b)) | ((actor1 == code_b) & (actor2 == code_a))
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of `df` (which raises SettingWithCopyWarning).
    selected = df.loc[mask].copy()
    for col in _GDELT_NUMERIC_COLUMNS:
        selected[col] = pd.to_numeric(selected[col], errors='coerce')
    return selected


def _read_filtered_frames(content, code_a, code_b):
    """Parse every member of a zipped export and return the filtered frames.

    Raises ``zipfile.BadZipFile`` for a corrupt archive and ``ValueError``
    (including pandas parser errors, which subclass it) for unreadable data.
    """
    frames = []
    with zipfile.ZipFile(io.BytesIO(content)) as archive:
        for member in archive.namelist():
            with archive.open(member) as handle:
                # Everything is read as text first; numeric casting happens
                # only on the (much smaller) filtered frame.
                df = pd.read_csv(handle, sep='\t', header=None, encoding='latin1', low_memory=False, dtype=str)
            if df.shape[1] != len(_GDELT_EVENT_COLUMNS):
                raise ValueError(
                    f"{member}: expected {len(_GDELT_EVENT_COLUMNS)} columns, found {df.shape[1]}"
                )
            df.columns = _GDELT_EVENT_COLUMNS
            frames.append(_filter_actor_pair(df, code_a, code_b))
    return frames


def download_and_filter(url_pairs=None, out_dir=None, fetch=None):
    """Download GDELT daily exports and save China/Italy events per month.

    For every month key, all of that month's archives are downloaded, rows in
    which one actor's country code is ``CHN`` and the other's is ``ITA`` are
    kept, and the result is written to ``filtered_<YYYYMM>.csv``
    (tab-separated). Each month is written as soon as it is finished, so an
    interrupted run keeps the months already completed. A month whose output
    file already exists is skipped entirely.

    Parameters
    ----------
    url_pairs : iterable of (str, str), optional
        ``(YYYYMM, url)`` pairs to process. Defaults to the module-level
        ``urls`` list.
    out_dir : str, optional
        Directory for the output files. Defaults to the module-level
        ``output_dir``.
    fetch : callable, optional
        ``fetch(url)`` returning the archive as bytes, or None when the
        download failed. Defaults to an HTTP GET with a timeout.

    Returns
    -------
    None
        Results are written to disk; progress is reported with ``print``.

    Notes
    -----
    A month is saved even when some of its days failed to download; because
    the file then exists, those days are not retried on a later run unless the
    file is deleted. A warning is printed in that case.
    """
    if url_pairs is None:
        url_pairs = urls
    if out_dir is None:
        out_dir = output_dir
    if fetch is None:
        fetch = _fetch_zip

    # Define the country codes for Italy and China
    italy_code = 'ITA'
    china_code = 'CHN'

    # Group the URLs by month (insertion order is preserved) so each month can
    # be downloaded, combined and written before the next one starts.
    urls_by_month = defaultdict(list)
    for month, url in url_pairs:
        urls_by_month[month].append(url)

    for month, month_urls in urls_by_month.items():
        output_file = os.path.join(out_dir, f"filtered_{month}.csv")
        if os.path.exists(output_file):
            for url in month_urls:
                print(f"Skipping {url} for {month}, already processed.")
            continue

        frames = []
        failed = []
        for url in month_urls:
            print(f"Downloading {url} for month {month}")
            content = fetch(url)
            if content is None:
                print(f"Failed to download {url}")
                failed.append(url)
                continue
            try:
                frames.extend(_read_filtered_frames(content, china_code, italy_code))
            except (zipfile.BadZipFile, ValueError) as exc:
                # A corrupt or malformed archive is reported and skipped
                # rather than discarding the rest of the run.
                print(f"Could not read {url}: {exc}")
                failed.append(url)

        # Save one file per month by combining all corresponding dataframes
        if frames:
            df_combined = pd.concat(frames, ignore_index=True)
            df_combined.to_csv(output_file, index=False, sep='\t')
            print(f"Saved {output_file} with {len(df_combined)} rows")
            if failed:
                print(
                    f"Warning: {len(failed)} file(s) for {month} could not be processed; "
                    f"{output_file} is incomplete (delete it to retry)."
                )


In [None]:
# Process all URLs.
# download_and_filter() already iterates over every (month, url) pair in
# `urls`, so it is called exactly once; calling it inside a loop over `urls`
# would re-run the entire pipeline once per URL.
download_and_filter()

print("Processing complete.")