<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Install required libraries (Colab only)
!pip install -q requests beautifulsoup4 python-docx fpdf

# ✅ Import libraries
import os
import requests
from bs4 import BeautifulSoup
from docx import Document
from fpdf import FPDF
from datetime import datetime

# ✅ Scraping function
def scrape_blog(url):
    """Download a blog page and extract its title and paragraph text.

    The title comes from the first <h1>, falling back to <title>, and finally
    to the literal "Untitled". The body text is collected from the <p> tags
    inside the first of: <article>, <div class="post">, <main>, <body>.
    Paragraphs of 20 characters or fewer are dropped.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, full_text)`` tuple on success, where ``full_text`` starts
        with a "URL: ... / TITLE: ..." header followed by the paragraphs
        separated by blank lines. On any failure, ``(None, message)`` where
        ``message`` describes what went wrong. This function never raises.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code != 200:
            return None, f"Failed with status code: {res.status_code}"

        soup = BeautifulSoup(res.text, 'html.parser')
        title_tag = soup.find("h1") or soup.title
        # A heading tag can exist but be empty; fall back to "Untitled" so
        # callers that test the title for truthiness do not mistake a
        # successful scrape for a failure.
        title = (title_tag.get_text(strip=True) if title_tag else "") or "Untitled"

        content_area = soup.find("article") or soup.find("div", class_="post") or soup.find("main") or soup.body
        if not content_area:
            return None, "No readable content found."

        # Extract each paragraph's text once, then keep only the ones long
        # enough to be real content.
        paragraph_texts = (p.get_text(strip=True) for p in content_area.find_all("p"))
        text_blocks = [block for block in paragraph_texts if len(block) > 20]
        if not text_blocks:
            # Without this check a long URL/title header alone could pass the
            # length test below and a document with no body would be saved.
            return None, "No readable content found."

        full_text = f"URL: {url}\n\nTITLE: {title}\n\n" + "\n\n".join(text_blocks)

        if len(full_text.strip()) < 100:
            return None, "Content too short to save."

        return title, full_text

    except Exception as e:
        # Deliberately broad: a single bad URL (network error, parse error)
        # is reported to the caller as a message instead of aborting the run.
        return None, f"Error: {str(e)}"

# ✅ Saving functions
def _safe_filename(title, max_len=50):
    """Return a filesystem-safe base name (without extension) for *title*.

    The title is truncated to *max_len* characters. Whitespace, control
    characters and characters that are path separators or reserved on common
    filesystems (\\ / : * ? " < > |) are each replaced by an underscore.
    An empty result falls back to "untitled".
    """
    reserved = set('\\/:*?"<>|')
    cleaned = "".join(
        "_" if (ch in reserved or ch.isspace() or ord(ch) < 32) else ch
        for ch in title[:max_len]
    )
    return cleaned or "untitled"

def save_to_txt(title, text, folder):
    """Write *text* as UTF-8 to ``<folder>/<safe title>.txt``."""
    path = os.path.join(folder, f"{_safe_filename(title)}.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

def save_to_docx(title, text, folder):
    """Save *text* to ``<folder>/<safe title>.docx`` under a level-1 heading."""
    doc = Document()
    doc.add_heading(title, level=1)
    doc.add_paragraph(text)
    doc.save(os.path.join(folder, f"{_safe_filename(title)}.docx"))

def save_to_pdf(title, text, folder):
    """Render *text* line by line into ``<folder>/<safe title>.pdf``.

    The built-in "Arial" font only covers Latin-1. Characters outside that
    range are replaced with "?" so that writing the PDF cannot fail with a
    UnicodeEncodeError; the TXT and DOCX outputs keep the full text.
    To render such characters, a Unicode TTF font would have to be
    registered with ``add_font(..., uni=True)`` instead.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        # Round-trip through Latin-1 so unencodable characters become "?"
        # instead of raising when the document is serialized.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, txt=printable)
    pdf.output(os.path.join(folder, f"{_safe_filename(title)}.pdf"))

# ✅ Define your blog URLs here
# Blog pages to scrape; edit this list to add or remove sources.
urls = [
    "https://poetryfromtheheart123.blogspot.com/",
    "https://ukrainianstudentvoices.blogspot.com/",
]

# Each run writes into its own timestamped folder (scraped_YYYYMMDD_HHMMSS).
folder = f"scraped_{datetime.now():%Y%m%d_%H%M%S}"
os.makedirs(folder, exist_ok=True)

# Scrape every URL; on success save the text as TXT, DOCX and PDF.
for url in urls:
    title, text = scrape_blog(url)
    if not (title and text):
        # On failure scrape_blog returns (None, message), so `text` holds
        # the reason.
        print(f"⚠️ Failed: {url}\n{text}")
        continue
    print(f"✅ Scraped: {title}")
    for save in (save_to_txt, save_to_docx, save_to_pdf):
        save(title, text, folder)


  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 244.3/244.3 kB 7.7 MB/s eta 0:00:00
  Building wheel for fpdf (setup.py) ... done
