In [None]:
import re
import pandas as pd
import itertools
from io import StringIO
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import zipfile
import time


# Root directory (relative to the notebook) under which data files live.
DATA_ROOT = Path("../data")

# Section headings, kept in this fixed order.
SECTION_TITLES = [
    # Enforcement actions
    "Accounts and Servers: Warnings & Temporary Interventions",
    "Accounts Disabled",
    "Servers Removed",
    # User-facing processes
    "Appeals",
    "Reports",
    "NCMEC",
    # Government / legal requests
    "US Gov Info Requests",
    "International Government Information Requests",
    "Preservation Requests",
    "Emergency Requests",
]

# Lower-case three-letter month abbreviation -> month number (1..12).
MONTHS = {
    abbreviation: number
    for number, abbreviation in enumerate(
        "jan feb mar apr may jun jul aug sep oct nov dec".split(),
        start=1,
    )
}

In [None]:
# Download listing of the DSA Transparency Portal; results are filtered by
# platform and served page by page.
base_url = "https://transparency.dsa.ec.europa.eu/explore-data/download"
platform_id = 59  # Discord
page = 1  # listing page to request first

# Make sure the directories for raw archives and extracted files exist.
for _data_dir in ('../data/dsa_zip_files', '../data/dsa_extracted'):
    os.makedirs(_data_dir, exist_ok=True)

def unzip_recursive(folder_path):
    """
    Recursively unzip all zip files in a folder and its subfolders.

    Every ``<name>.zip`` found under ``folder_path`` is extracted into a
    sibling directory ``<name>``. An archive whose target directory already
    exists is skipped, so the function can be re-run without redoing work.
    Freshly extracted directories are scanned again so that zips nested
    inside zips are unpacked too.

    Args:
        folder_path: Directory to scan for ``.zip`` files.
    """
    suffix = ".zip"
    for root, _, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(suffix):
                continue
            zip_path = os.path.join(root, file)
            # Strip only the trailing extension. str.replace would delete
            # every ".zip" occurrence in the name (e.g. "day.zipped.zip"
            # -> "dayped") and could make distinct archives collide.
            extract_folder = os.path.join(root, file[:-len(suffix)])
            if os.path.exists(extract_folder):
                continue
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_folder)
            print(f"Extracted {zip_path} -> {extract_folder}")
            # os.walk listed this directory before the new folder existed,
            # so it will not descend into it on its own: recurse explicitly.
            unzip_recursive(extract_folder)

def concatenate_csvs(folder_path, output_filename):
    """
    Concatenate all CSVs in a folder (and its subfolders) into a single CSV.

    Input files are read in sorted path order so the row order of the result
    is reproducible regardless of the order in which the filesystem lists
    them. If ``output_filename`` itself lies inside ``folder_path`` it is
    excluded from the inputs, so re-running never folds an earlier result
    back into the new one.

    Args:
        folder_path: Directory searched recursively for ``.csv`` files.
        output_filename: Path of the combined CSV to write. Nothing is
            written when no input CSV is found.
    """
    output_abspath = os.path.abspath(output_filename)
    all_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(folder_path)
        for file in files
        if file.endswith(".csv")
        and os.path.abspath(os.path.join(root, file)) != output_abspath
    )
    if not all_files:
        print(f"No CSV files found in {folder_path}")
        return

    combined_df = pd.concat(
        [pd.read_csv(f) for f in all_files], ignore_index=True
    )
    combined_df.to_csv(output_filename, index=False)
    print(f"Concatenated {len(all_files)} CSVs into {output_filename}")

# Crawl the paginated download listing until a page contains no "full" ZIP
# links. For each archive: download it, extract it, unpack nested zips, and
# write one combined CSV per archive.

# (connect, read) timeouts in seconds, so a stalled connection raises
# instead of blocking the notebook forever.
request_timeout = (10, 300)

while True:
    print(f"Processing page {page}...")
    params = {"platform_id": platform_id, "page": page}
    response = requests.get(base_url, params=params, timeout=request_timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all ZIP links in the "full" column
    links = soup.find_all('a', href=True)
    zip_links = [link['href'] for link in links if 'full' in link['href'] and link['href'].endswith('.zip')]

    if not zip_links:
        print("No more ZIP files found. Finished downloading all pages.")
        break

    for zip_url in zip_links:
        # Derive every per-archive path from the same base name. Only the
        # trailing ".zip" is stripped (the links are filtered on that suffix).
        archive_name = zip_url.split('/')[-1]
        archive_stem = archive_name[:-len('.zip')]
        zip_filename = os.path.join('../data/dsa_zip_files', archive_name)
        extract_folder = os.path.join('../data/dsa_extracted', archive_stem)

        # Skip archives that were already fully downloaded
        if not os.path.exists(zip_filename):
            print(f"Downloading {zip_filename}...")
            # Stream to a temporary ".part" file and rename only once the
            # download is complete. An interrupted run then never leaves a
            # truncated archive that the existence check above would accept.
            partial_filename = zip_filename + '.part'
            with requests.get(zip_url, stream=True, timeout=request_timeout) as zip_response:
                zip_response.raise_for_status()
                with open(partial_filename, 'wb') as f:
                    for chunk in zip_response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_filename, zip_filename)
            print(f"Downloaded {zip_filename}")
        else:
            print(f"{zip_filename} already exists. Skipping download.")

        if not os.path.exists(extract_folder):
            with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
                zip_ref.extractall(extract_folder)
            print(f"Extracted to {extract_folder}")
        else:
            print(f"{extract_folder} already exists. Skipping extraction.")

        # unzip nested zips
        unzip_recursive(extract_folder)

        # concatenate data for each day (folder) in a new csv
        output_csv = os.path.join('../data/dsa_extracted', archive_stem + '_combined.csv')
        if not os.path.exists(output_csv):
            concatenate_csvs(extract_folder, output_csv)

    page += 1
    time.sleep(1)  # pause between listing pages

# concatenate all the days into one big csv
os.makedirs('../data/dsa_processed', exist_ok=True)

print("Concatenating all combined CSVs into one big dataset...")

# Assigned before the branch below so that code referencing final_output
# later does not hit a NameError when no daily files were found.
final_output = '../data/dsa_processed/all_discord_data_combined.csv'

# Collect all *_combined.csv files created earlier. Sorted so the row order
# of the final dataset does not depend on directory listing order.
daily_combined_files = sorted(
    os.path.join('../data/dsa_extracted', f)
    for f in os.listdir('../data/dsa_extracted')
    if f.endswith('_combined.csv')
)

if daily_combined_files:
    df_list = [pd.read_csv(f, low_memory=False) for f in daily_combined_files]
    all_data = pd.concat(df_list, ignore_index=True)

    # Save to a single file (no date suffix)
    all_data.to_csv(final_output, index=False)
    print(f"All {len(daily_combined_files)} daily CSVs concatenated into {final_output}")
    print(f"Final shape: {all_data.shape}")
else:
    print("No per-day combined CSV files found in ../data/dsa_extracted/")


In [None]:
# Load the combined dataset, keep a fixed subset of columns, parse the date
# columns, convert selected columns to categoricals, and save as parquet.
path = final_output

# Columns to keep when reading the combined CSV.
COLUMNS = [
    # decision
    "decision_account",
    "decision_ground",
    "decision_ground_reference_url",
    # illegal / incompatible content grounds
    "illegal_content_legal_ground",
    "illegal_content_explanation",
    "incompatible_content_ground",
    "incompatible_content_explanation",
    "incompatible_content_illegal",
    # category
    "category",
    "category_addition",
    "category_specification",
    "category_specification_other",
    # content
    "content_type",
    "content_type_other",
    "content_date",
    "application_date",
    # source and automation
    "source_type",
    "automated_detection",
    "automated_decision",
    "created_at",
]

# Everything is read as str first; date columns are converted afterwards.
df = pd.read_csv(
    path,
    usecols=lambda name: name in COLUMNS,
    dtype=str,
    low_memory=False,
)

print(f"Loaded dataframe with shape {df.shape}")

# Parse date columns as UTC timestamps; unparseable values become NaT.
for col in ("content_date", "application_date", "created_at"):
    if col not in df.columns:
        continue
    df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

# optimize categorical cols
# NOTE(review): "platform_name" is not in COLUMNS, so it is never loaded and
# the guard below always skips it -- confirm whether it should be added to
# COLUMNS or dropped from this list.
categorical_cols = [
    "decision_account",
    "decision_ground",
    "incompatible_content_ground",
    "category",
    "automated_detection",
    "automated_decision",
    "platform_name",
]
for col in categorical_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].astype("category")

print("Basic cleaning complete.")

output_path = "../data/dsa_processed/discord_cleaned_subset.parquet"
df.to_parquet(output_path, index=False)
print(f"Cleaned + optimized dataset saved to {output_path}")
