<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/%D0%9A%D0%BE%D0%BF%D1%96%D1%8F_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%BD%D0%B8%D0%BA%D0%B0_%22corpus_downloader_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import json
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin, urlparse

def sanitize_filename(title):
    """Turn an article title into a short ASCII slug usable as a file name.

    The title is lower-cased, runs of whitespace become single hyphens, the
    result is cut to 50 characters, and every character other than ASCII
    letters, digits and hyphens is then dropped.

    A title with no ASCII letters or digits (for example one written entirely
    in Cyrillic) would reduce to an empty string or to hyphens only, so all
    such articles would be written to the same file. Those titles instead get
    an "untitled-<hash>" name derived from the title text, which keeps them
    distinct and stable between runs.

    Args:
        title: Raw article title.

    Returns:
        The slug; never empty.
    """
    hyphenated = '-'.join(title.lower().strip().split())[:50]
    slug = re.sub(r'[^a-zA-Z0-9-]', '', hyphenated)
    if slug.strip('-'):
        return slug

    # Local import: hashlib is only needed on this fallback path.
    import hashlib
    digest = hashlib.sha1(title.encode('utf-8')).hexdigest()[:8]
    return f'untitled-{digest}'

def scrape_euronews(url, folder_path, headers, max_articles=5, timeout=30):
    """Scrape articles linked from a Euronews listing page and save them.

    Site-relative links found inside <article> elements of the listing page
    are de-duplicated (keeping page order) and the first `max_articles` are
    fetched. Each article with a non-empty body is written to `folder_path`
    as a .txt file and a .json file named after the sanitized title.

    Args:
        url: Euronews listing page to start from.
        folder_path: Existing directory that receives the .txt/.json files.
        headers: HTTP headers sent with every request.
        max_articles: Maximum number of article links to follow.
        timeout: Per-request timeout in seconds; without one a stalled
            server would block the whole run.

    Returns:
        A list of [title, author, date, url, content] rows, one per saved
        article. Empty when the listing page cannot be loaded.
    """
    print(f"[INFO] Scraping Euronews: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # A network failure is reported like a bad status instead of
        # propagating and aborting the remaining sources.
        print(f"[WARNING] Failed to load Euronews page: {url} ({e})")
        return []
    if response.status_code != 200:
        print(f"[WARNING] Failed to load Euronews page: {url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.select("article a[href^='/']")
    # dict.fromkeys removes duplicates while preserving first-seen order.
    links = list(dict.fromkeys(
        "https://www.euronews.com" + a['href'] for a in articles
    ))[:max_articles]

    results = []

    for i, link in enumerate(links):
        print(f"[INFO] Fetching Euronews article {i+1}: {link}")
        try:
            art = requests.get(link, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f"[WARNING] Failed to fetch Euronews article {i+1} ({e})")
            continue
        if art.status_code != 200:
            print(f"[WARNING] Failed to fetch Euronews article {i+1}")
            continue

        art_soup = BeautifulSoup(art.text, "html.parser")

        title_tag = art_soup.find("h1")
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        author_tag = art_soup.find("span", class_="c-article-author__name")
        author = author_tag.get_text(strip=True) if author_tag else "Unknown Author"

        date_tag = art_soup.find("time")
        date_text = (date_tag.get("datetime") if date_tag else "") or ""
        # datetime.fromisoformat rejects a trailing "Z" before Python 3.11.
        if date_text.endswith("Z"):
            date_text = date_text[:-1] + "+00:00"
        try:
            pub_date = datetime.fromisoformat(date_text).strftime("%d %B %Y")
        except (TypeError, ValueError):
            pub_date = "Unknown Date"

        paragraphs = art_soup.select("div.c-article-content p")
        content = "\n".join(p.get_text(strip=True) for p in paragraphs)
        if not content:
            print("[WARNING] No article body found in Euronews article")
            continue

        filename_key = sanitize_filename(title)

        # Save TXT
        txt_file = os.path.join(folder_path, f'{filename_key}.txt')
        with open(txt_file, 'w', encoding='utf-8') as f:
            f.write(f"TITLE: {title}\n")
            f.write(f"AUTHOR: {author}\n")
            f.write(f"DATE: {pub_date}\n")
            f.write(f"URL: {link}\n\n")
            f.write(content)
        print(f"[INFO] Saved TXT: {txt_file}")

        # Save JSON
        json_file = os.path.join(folder_path, f'{filename_key}.json')
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump({
                "title": title,
                "author": author,
                "date": pub_date,
                "url": link,
                "content": content
            }, f, ensure_ascii=False, indent=2)
        print(f"[INFO] Saved JSON: {json_file}")

        results.append([title, author, pub_date, link, content])

    return results

def generic_scraper(base_url, folder_path, headers, max_articles=5, timeout=30):
    """Heuristically scrape articles linked from an arbitrary news page.

    Same-domain links on `base_url` are collected, de-duplicated in page
    order and limited to `max_articles`. For every page that yields text,
    the title, date, author and paragraph text are extracted and written to
    `folder_path` as a .txt file and a .json file.

    Args:
        base_url: Page whose links are followed.
        folder_path: Existing directory that receives the .txt/.json files.
        headers: HTTP headers sent with every request.
        max_articles: Maximum number of links to follow.
        timeout: Per-request timeout in seconds; without one a stalled
            server would block the whole run.

    Returns:
        A list of [title, author, date, url, content] rows, one per saved
        article. If an unexpected error interrupts the run, the rows
        collected up to that point are returned.
    """
    print(f"[INFO] Scraping generic site: {base_url}")
    results = []
    try:
        response = requests.get(base_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"[WARNING] Failed to load page: {base_url}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all <a> tags with href, filter for likely article links (heuristic)
        anchors = soup.find_all("a", href=True)
        article_links = []
        base_domain = urlparse(base_url).netloc

        for a in anchors:
            href = a['href']
            if href.startswith("http"):
                candidate = href
            elif href.startswith("/"):
                candidate = urljoin(base_url, href)
            else:
                continue
            # Checking the resolved URL also rejects protocol-relative
            # links ("//other-host/...") that point to another domain.
            if urlparse(candidate).netloc == base_domain:
                article_links.append(candidate)

        # Remove duplicates and limit to max_articles
        article_links = list(dict.fromkeys(article_links))[:max_articles]

        for i, link in enumerate(article_links):
            print(f"[INFO] Fetching article {i+1}: {link}")
            try:
                art = requests.get(link, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # One unreachable article must not discard the others.
                print(f"[WARNING] Failed to fetch article {i+1} at {link}: {e}")
                continue
            if art.status_code != 200:
                print(f"[WARNING] Failed to fetch article {i+1} at {link}")
                continue

            art_soup = BeautifulSoup(art.text, "html.parser")

            # Try to extract title
            title_tag = art_soup.find("h1") or art_soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else "No Title"

            # Try to extract date from the first <time> tag's datetime attribute
            date_tag = art_soup.find("time")
            if date_tag and date_tag.has_attr("datetime"):
                date_text = date_tag["datetime"]
            else:
                date_text = ""
            # datetime.fromisoformat rejects a trailing "Z" before Python 3.11.
            if date_text.endswith("Z"):
                date_text = date_text[:-1] + "+00:00"
            try:
                pub_date = datetime.fromisoformat(date_text).strftime("%d %B %Y") if date_text else "Unknown Date"
            except (TypeError, ValueError):
                pub_date = "Unknown Date"

            # Try to extract author (common meta tag)
            author = "Unknown Author"
            author_tag = art_soup.find(attrs={"name": "author"})
            if author_tag and author_tag.has_attr("content"):
                author = author_tag["content"]

            # Extract content heuristically: paragraphs inside <article> or all <p>
            paragraphs = []
            article_tag = art_soup.find("article")
            if article_tag:
                paragraphs = article_tag.find_all("p")
            if not paragraphs:
                paragraphs = art_soup.find_all("p")

            content = "\n".join(p.get_text(strip=True) for p in paragraphs)
            if not content.strip():
                print(f"[WARNING] No content found in article {link}")
                continue

            filename_key = sanitize_filename(title)

            # Save TXT
            txt_file = os.path.join(folder_path, f'{filename_key}.txt')
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(f"TITLE: {title}\n")
                f.write(f"AUTHOR: {author}\n")
                f.write(f"DATE: {pub_date}\n")
                f.write(f"URL: {link}\n\n")
                f.write(content)
            print(f"[INFO] Saved TXT: {txt_file}")

            # Save JSON
            json_file = os.path.join(folder_path, f'{filename_key}.json')
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "title": title,
                    "author": author,
                    "date": pub_date,
                    "url": link,
                    "content": content
                }, f, ensure_ascii=False, indent=2)
            print(f"[INFO] Saved JSON: {json_file}")

            results.append([title, author, pub_date, link, content])

        return results
    except Exception as e:
        # Best-effort boundary: report the failure but keep the articles
        # already saved so they still reach the caller's CSV.
        print(f"[ERROR] Generic scraper error on {base_url}: {e}")
        return results

def scrape_sources(sources, output_root='/content'):
    """Scrape every source URL and write one folder of results per source.

    For each URL a folder named "<YYYY-MM-DD>-<domain>" is created under
    `output_root`. Euronews URLs go to `scrape_euronews`, everything else to
    `generic_scraper`. When a source yields articles, a combined CSV is
    written into the same folder.

    Args:
        sources: Iterable of page URLs to scrape.
        output_root: Directory under which the per-source folders are
            created. Defaults to '/content', the path this notebook used
            before the parameter existed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9"
    }

    for url in sources:
        domain = urlparse(url).netloc
        folder_path = os.path.join(output_root, f'{today}-{domain}')
        os.makedirs(folder_path, exist_ok=True)
        print(f"\n[INFO] Starting scraping for source: {domain}")

        try:
            if "euronews.com" in domain:
                results = scrape_euronews(url, folder_path, headers)
            else:
                results = generic_scraper(url, folder_path, headers)
        except Exception as e:
            # Batch boundary: one broken source must not stop the others.
            print(f"[ERROR] Scraping failed for {domain}: {e}")
            results = []

        # Save combined CSV per source
        if results:
            csv_file = os.path.join(folder_path, f'{today}_{domain}_articles.csv')
            with open(csv_file, 'w', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(["Title", "Author", "Date", "URL", "Content"])
                writer.writerows(results)
            print(f"[INFO] Saved combined CSV: {csv_file}")
        else:
            print(f"[WARNING] No articles scraped from {domain}")

    print("\n✅ All done!")

if __name__ == "__main__":
    # Seed pages to scrape, assembled group by group; the order below is
    # the order in which the sources are processed.
    sources = []

    # Euronews
    sources += ["https://www.euronews.com/tag/javier-milei"]

    # Irish sources
    sources += [
        "https://www.irishtimes.com/",
        "https://www.independent.ie/",
    ]

    # Scottish sources
    sources += ["https://www.scotsman.com/news"]

    # Australian sources
    sources += [
        "https://www.smh.com.au/",
        "https://www.abc.net.au/news/",
    ]

    # Other international
    sources += [
        "https://www.bbc.com/news",
        "https://www.reuters.com/world/",
    ]

    scrape_sources(sources)



[INFO] Starting scraping for source: www.euronews.com
[INFO] Scraping Euronews: https://www.euronews.com/tag/javier-milei
[INFO] Fetching Euronews article 1: https://www.euronews.com/my-europe/2025/05/23/leader-of-neo-nazi-death-cult-extradited-to-us-for-plot-to-poison-jewish-children
[INFO] Saved TXT: /content/2025-05-23-www.euronews.com/leader-of-neo-nazi-death-cult-extradited-to-us-for.txt
[INFO] Saved JSON: /content/2025-05-23-www.euronews.com/leader-of-neo-nazi-death-cult-extradited-to-us-for.json
[INFO] Fetching Euronews article 2: https://www.euronews.com/my-europe/2025/05/23/iran-us-nuclear-talks-made-some-but-not-conclusive-progress-mediator-oman-says
[INFO] Saved TXT: /content/2025-05-23-www.euronews.com/iran-us-nuclear-talks-made-some-but-not-conclusiv.txt
[INFO] Saved JSON: /content/2025-05-23-www.euronews.com/iran-us-nuclear-talks-made-some-but-not-conclusiv.json
[INFO] Fetching Euronews article 3: https://www.euronews.com/my-europe/2025/05/23/several-injured-in-knife-att

In [None]:
from google.colab import files

# Example: send one of the saved article text files to the browser.
single_article_path = '/content/2025-05-23-www.euronews.com/leader-of-neo-nazi-death-cult-extradited-to-us-for.txt'
files.download(single_article_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import shutil

from google.colab import files

# Example: download one specific file
files.download('/content/2025-05-23-www.euronews.com/leader-of-neo-nazi-death-cult-extradited-to-us-for.txt')

# Or, to download every file in a folder, zip the folder first.
folder_path = '/content/2025-05-23-www.euronews.com'
zip_path = '/content/euronews_articles.zip'

# make_archive appends the ".zip" extension itself, so it is given the
# archive path without it.
archive_base = zip_path.replace('.zip', '')
shutil.make_archive(archive_base, 'zip', folder_path)
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>