<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/scraper_for_mini_corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import re
import json
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def sanitize_filename(title):
    """Turn an article title into a short, filesystem-safe file name stem.

    The title is lower-cased, whitespace runs are collapsed into single
    dashes, the result is cut to 50 characters, and every character other
    than ASCII letters, digits and dashes is then removed (so the final stem
    can be shorter than 50 characters).

    Args:
        title: Article title as scraped from the page.

    Returns:
        The sanitized stem, or ``"untitled"`` when no ASCII letter or digit
        survives (empty title, or a title written entirely in a non-Latin
        script), so callers never build names such as ``".txt"``.
    """
    # str.split() with no arguments already ignores leading/trailing
    # whitespace, so no separate strip() is needed.
    slug = '-'.join(title.lower().split())[:50]
    cleaned = re.sub(r'[^a-zA-Z0-9-]', '', slug)
    # A stem made only of dashes carries no information; use the fallback.
    return cleaned if cleaned.strip('-') else 'untitled'

def scrape_milei_articles():
    """Scrape Euronews articles tagged "javier-milei" and export them.

    Fetches the tag listing page, takes the first five distinct article
    links found inside ``<article>`` elements, downloads each article and
    extracts title, author, publication date and body text. Every article
    with a non-empty body is written as a TXT and a JSON file, and all of
    them are collected into one CSV file in a dated folder under
    ``/content``.

    Articles that cannot be fetched (network error or non-200 status) or
    that have no body text are skipped with a warning; the remaining
    articles are still exported.

    Returns:
        The output folder path on success, or ``None`` when the listing page
        could not be loaded, no article links were found, or another error
        was caught (the error is printed).
    """
    today = datetime.now().strftime('%Y-%m-%d')
    folder_path = f'/content/{today}-milei-articles'
    os.makedirs(folder_path, exist_ok=True)

    search_url = "https://www.euronews.com/tag/javier-milei"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9"
    }
    # requests waits indefinitely by default; bound every request so a
    # stalled connection cannot hang the notebook cell.
    timeout = 30

    try:
        print(f"[INFO] Fetching articles from: {search_url}")
        response = requests.get(search_url, headers=headers, timeout=timeout)
        print(f"[INFO] Status code: {response.status_code}")

        if response.status_code != 200:
            raise Exception(f"Failed to load page (status {response.status_code})")

        soup = BeautifulSoup(response.text, "html.parser")

        # Select article links - adjust selector if necessary
        anchors = soup.select("article a[href^='/']")

        if not anchors:
            raise Exception("No articles found. Website structure may have changed.")

        # Several anchors may point at the same URL; dict.fromkeys drops the
        # repeats while keeping page order, so the limit of five counts
        # distinct articles rather than anchors.
        links = list(dict.fromkeys(
            "https://www.euronews.com" + a['href'] for a in anchors
        ))[:5]
        all_rows = []  # For CSV export
        used_keys = set()  # File name stems already written in this run

        for i, link in enumerate(links, start=1):
            print(f"\n[INFO] Fetching article {i}: {link}")
            try:
                art = requests.get(link, headers=headers, timeout=timeout)
            except requests.RequestException as exc:
                # One unreachable article must not abort the whole run and
                # lose the CSV export of the articles already collected.
                print(f"[WARNING] Failed to fetch article {i}: {exc}")
                continue
            print(f"[INFO] Article status: {art.status_code}")

            if art.status_code != 200:
                print(f"[WARNING] Failed to fetch article {i}")
                continue

            art_soup = BeautifulSoup(art.text, "html.parser")

            title_tag = art_soup.find("h1")
            title = title_tag.get_text(strip=True) if title_tag else "No Title"

            author_tag = art_soup.find("span", class_="c-article-author__name")
            author = author_tag.get_text(strip=True) if author_tag else "Unknown Author"

            date_tag = art_soup.find("time")
            date_text = date_tag.get("datetime") if date_tag else ""
            try:
                pub_date = datetime.fromisoformat(date_text).strftime("%d %B %Y")
            except (TypeError, ValueError):
                # TypeError: <time> has no datetime attribute (None);
                # ValueError: the attribute is empty or not ISO formatted.
                pub_date = "Unknown Date"

            paragraphs = art_soup.select("div.c-article-content p")
            content = "\n".join(p.get_text(strip=True) for p in paragraphs)
            if not content:
                print("[WARNING] No article body found")
                continue

            filename_key = sanitize_filename(title)
            if filename_key in used_keys:
                # Two titles can share a stem (same first 50 characters, or
                # both "No Title"); suffix the article number so the later
                # article does not overwrite the earlier one's files.
                filename_key = f"{filename_key}-{i}"
            used_keys.add(filename_key)

            # TXT export
            txt_file = os.path.join(folder_path, f'{filename_key}.txt')
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(f"TITLE: {title}\n")
                f.write(f"AUTHOR: {author}\n")
                f.write(f"DATE: {pub_date}\n")
                f.write(f"URL: {link}\n\n")
                f.write(content)
            print(f"[INFO] Saved TXT: {txt_file}")

            # JSON export
            json_file = os.path.join(folder_path, f'{filename_key}.json')
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "title": title,
                    "author": author,
                    "date": pub_date,
                    "url": link,
                    "content": content
                }, f, ensure_ascii=False, indent=2)
            print(f"[INFO] Saved JSON: {json_file}")

            # Append for CSV
            all_rows.append([title, author, pub_date, link, content])

        # CSV export
        csv_file = os.path.join(folder_path, f'{today}_milei_articles.csv')
        with open(csv_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Title", "Author", "Date", "URL", "Content"])
            writer.writerows(all_rows)
        print(f"\n✅ Done. Articles saved to: {folder_path}")

        return folder_path  # Return folder path for convenience

    except Exception as e:
        # Top-level boundary of the notebook cell: report the problem and
        # signal failure to the caller instead of raising.
        print(f"[ERROR] {str(e)}")
        return None

# Run the scraper. `folder` receives the output directory path returned by
# scrape_milei_articles() and is read again by the download cell below; it is
# None when the scraper caught an error instead of finishing.
folder = scrape_milei_articles()



[INFO] Fetching articles from: https://www.euronews.com/tag/javier-milei
[INFO] Status code: 200

[INFO] Fetching article 1: https://www.euronews.com/my-europe/2025/05/23/leader-of-neo-nazi-death-cult-extradited-to-us-for-plot-to-poison-jewish-children
[INFO] Article status: 200
[INFO] Saved TXT: /content/2025-05-23-milei-articles/leader-of-neo-nazi-death-cult-extradited-to-us-for.txt
[INFO] Saved JSON: /content/2025-05-23-milei-articles/leader-of-neo-nazi-death-cult-extradited-to-us-for.json

[INFO] Fetching article 2: https://www.euronews.com/my-europe/2025/05/23/iran-us-nuclear-talks-made-some-but-not-conclusive-progress-mediator-oman-says
[INFO] Article status: 200
[INFO] Saved TXT: /content/2025-05-23-milei-articles/iran-us-nuclear-talks-made-some-but-not-conclusiv.txt
[INFO] Saved JSON: /content/2025-05-23-milei-articles/iran-us-nuclear-talks-made-some-but-not-conclusiv.json

[INFO] Fetching article 3: https://www.euronews.com/my-europe/2025/05/23/several-injured-in-knife-attack-

In [4]:
from google.colab import files
import os

# Send every exported file to the browser. `folder` is None when the scraper
# failed, and os.listdir(None) lists the current working directory, so the
# loop is guarded to avoid downloading unrelated files.
if folder and os.path.isdir(folder):
    # Sorted so the downloads arrive in a stable, predictable order.
    for file in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file)
        # Only regular files are handed to files.download; any
        # sub-directory is skipped.
        if os.path.isfile(file_path):
            files.download(file_path)
else:
    print(f"[WARNING] Nothing to download: {folder!r} is not a directory")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>