<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/Blog_Scraper_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📄 Blog Scraper for Google Colab
This notebook scrapes text content from blog-style websites and saves it as `.txt`, `.docx`, and `.pdf` files.

In [None]:
# ✅ Install necessary packages
!pip install -q requests beautifulsoup4 python-docx fpdf

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fpdf (setup.py) ... [?25l[?25hdone


In [None]:
# ✅ Import libraries
import requests
from bs4 import BeautifulSoup
from docx import Document
from fpdf import FPDF
from datetime import datetime
import os

In [None]:
# ✅ Define scraping and saving functions
def scrape_blog(url, min_paragraph_len=30):
    """Download a blog page and extract its title and readable text.

    Args:
        url: Address of the page to fetch.
        min_paragraph_len: ``<p>`` elements whose stripped text is not longer
            than this many characters are dropped (filters out bylines,
            captions and similar short fragments).

    Returns:
        A ``(title, text)`` tuple. On success ``title`` is a non-empty string
        (``"Untitled"`` when neither ``<h1>`` nor ``<title>`` has text) and
        ``text`` is the extracted content prefixed with ``URL:`` and
        ``TITLE:`` lines. On failure ``title`` is ``None`` and ``text`` holds
        an error message. Exceptions are caught and reported through the
        return value rather than raised.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code != 200:
            return None, "Failed to load URL"

        # When the Content-Type header carries no charset, requests decodes
        # text responses as ISO-8859-1, which garbles UTF-8 pages (Cyrillic,
        # accented Latin, ...). Use the encoding detected from the body then.
        if "charset" not in res.headers.get("Content-Type", "").lower():
            res.encoding = res.apparent_encoding

        soup = BeautifulSoup(res.text, 'html.parser')

        # Prefer the first <h1>, then <title>; skip candidates that are
        # present but empty so a successful scrape never has a blank title.
        title = "Untitled"
        for tag in (soup.find("h1"), soup.title):
            candidate = tag.get_text(strip=True) if tag is not None else ""
            if candidate:
                title = candidate
                break

        content_area = soup.find("article") or soup.find("div", class_="post") or soup.body
        if content_area is None:
            # html.parser does not synthesise a <body>, so fragments or empty
            # documents can leave nothing to search.
            return None, "Error: no content found on page"

        text_blocks = []

        # NOTE: find_all() searches recursively, so a list nested inside
        # another list is visited both through its parent and on its own;
        # its items can therefore appear more than once in the output.
        for el in content_area.find_all(["p", "h1", "h2", "ul", "ol"]):
            if el.name.startswith("h"):
                # Headings are upper-cased and padded with blank lines.
                text_blocks.append("\n" + el.get_text(strip=True).upper() + "\n")
            elif el.name in ["ul", "ol"]:
                for li in el.find_all("li"):
                    text_blocks.append("• " + li.get_text(strip=True))
            else:
                para = el.get_text(strip=True)
                if len(para) > min_paragraph_len:
                    text_blocks.append(para)

        full_text = f"URL: {url}\nTITLE: {title}\n\n" + "\n\n".join(text_blocks)
        return title, full_text

    except Exception as e:
        # Deliberate catch-all: callers rely on the (None, message) contract
        # so one bad URL does not stop a batch run.
        return None, f"Error: {str(e)}"

def _safe_filename(title, max_len=50):
    """Turn a page title into a base file name (without extension).

    Path separators, characters reserved on common file systems and control
    characters are replaced by ``_``; the result is cut to ``max_len``
    characters and stripped of surrounding whitespace. An empty result
    becomes ``"untitled"`` so the file never ends up named just ``.txt``.
    """
    forbidden = '<>:"/\\|?*'
    cleaned = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in title
    )
    cleaned = cleaned[:max_len].strip()
    return cleaned or "untitled"


def save_to_txt(title, text, folder):
    """Write ``text`` as UTF-8 to ``<folder>/<title>.txt``.

    Args:
        title: Page title; sanitised and truncated to form the file name.
        text: Content to write.
        folder: Existing directory to write into.

    Returns:
        The path of the file that was written.
    """
    path = os.path.join(folder, f"{_safe_filename(title)}.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    return path


def save_to_docx(title, text, folder):
    """Write a Word document with ``title`` as heading and ``text`` as body.

    Args:
        title: Page title; used verbatim as the level-1 heading and, in
            sanitised form, as the file name.
        text: Body text, added as a single paragraph.
        folder: Existing directory to write into.

    Returns:
        The path of the file that was written.
    """
    path = os.path.join(folder, f"{_safe_filename(title)}.docx")
    doc = Document()
    doc.add_heading(title, level=1)
    doc.add_paragraph(text)
    doc.save(path)
    return path


def save_to_pdf(title, text, folder, font_path=None):
    """Write ``text`` to ``<folder>/<title>.pdf`` with automatic line wrapping.

    Args:
        title: Page title; sanitised and truncated to form the file name.
        text: Content to render. Long lines are wrapped instead of being cut.
        folder: Existing directory to write into.
        font_path: Optional path to a TrueType font with the glyphs the text
            needs (e.g. a DejaVu or Liberation ``.ttf``). When omitted, the
            built-in Arial font is used; it is limited to Latin-1, so any
            other character (Cyrillic, ``•`` bullets, ...) is rendered as
            ``?`` rather than making the export fail.

    Returns:
        The path of the file that was written.
    """
    path = os.path.join(folder, f"{_safe_filename(title)}.pdf")
    pdf = FPDF()
    pdf.add_page()
    if font_path:
        # Register the TTF as a Unicode font so non-Latin text is preserved.
        pdf.add_font("Custom", "", font_path, uni=True)
        pdf.set_font("Custom", size=12)
        body = text
    else:
        pdf.set_font("Arial", size=12)
        # Core fonts are encoded as Latin-1 on output; replace anything
        # outside that range up front to avoid a UnicodeEncodeError.
        body = text.encode("latin-1", "replace").decode("latin-1")
    # Width 0 means "up to the right margin"; multi_cell wraps long lines
    # and honours the "\n" breaks already present in the text.
    pdf.multi_cell(0, 10, body)
    pdf.output(path)
    return path

In [None]:
# ✅ Scrape from a list of blog URLs
urls = [
    "https://poetryfromtheheart123.blogspot.com/",
    "https://ukrainianstudentvoices.blogspot.com/",
]

# One timestamped output folder per run, so earlier results are not overwritten.
folder = f"scraped_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(folder, exist_ok=True)

for url in urls:
    title, text = scrape_blog(url)
    if not (title and text):
        # On failure scrape_blog returns (None, <error message>).
        print(f"⚠️ Failed: {url}\n{text}")
        continue

    print(f"✅ Scraped: {title}")
    # Run each exporter independently: a problem in one format (e.g. a
    # character the PDF font cannot encode) must not lose the other formats
    # or stop the remaining URLs from being processed.
    for save in (save_to_txt, save_to_docx, save_to_pdf):
        try:
            save(title, text, folder)
        except Exception as exc:
            print(f"⚠️ {save.__name__} failed for {url}: {exc}")