In [11]:
import json
import re
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import pdfplumber
import requests
from bs4 import BeautifulSoup

In [2]:
# Directory that receives the downloaded PDFs and the generated JSON files.
BASE_DIR = Path("./data")
# parents=True keeps this cell working if BASE_DIR is later changed to a
# nested path whose intermediate directories do not exist yet.
BASE_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Program key -> page to scrape. The key is reused for the output file names.
urls = dict(
    ai="https://abit.itmo.ru/program/master/ai",
    ai_product="https://abit.itmo.ru/program/master/ai_product",
)

In [12]:
def fetch_program_info(name, url, timeout=30):
    """Scrape a program page for its description text and linked PDF files.

    Args:
        name: Identifier stored under the ``"name"`` key of the result.
        url: Address of the program page to download.
        timeout: Seconds to wait for the server, so that a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A dict with three keys:
        ``"name"`` - the ``name`` argument;
        ``"description"`` - the space-joined text of the
        ``div.program-description`` element, or of the whole page when no
        such element is found;
        ``"pdf_links"`` - absolute URLs of every ``<a href>`` whose path
        ends in ``.pdf``, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Fall back to the full document when the dedicated block is missing.
    desc_block = soup.find("div", {"class": "program-description"}) or soup
    description = " ".join(desc_block.stripped_strings)

    pdf_links = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        # Test only the path component, case-insensitively, so links such as
        # "plan.PDF" or "plan.pdf?v=2" are not missed.
        if urlsplit(href).path.lower().endswith(".pdf"):
            # Resolve relative links against the page they were found on.
            pdf_links.append(urljoin(url, href))

    return {
        "name": name,
        "description": description,
        "pdf_links": pdf_links
    }

# Scrape every configured page; the result is keyed by the same names as `urls`.
programs = dict(
    (key, fetch_program_info(key, page_url))
    for key, page_url in urls.items()
)

In [13]:
def download_pdf(url, filename, base_url="https://abit.itmo.ru/", timeout=60):
    """Download a file and save it to disk.

    Args:
        url: Absolute URL, or a link relative to ``base_url``.
        filename: Destination path; an existing file is overwritten.
        base_url: Site root used to resolve relative links.
        timeout: Seconds to wait for the server, so that a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``filename`` argument, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is written to ``filename`` in that case.
    """
    # urljoin leaves absolute URLs untouched and correctly resolves
    # root-relative ("/a.pdf"), path-relative ("a.pdf") and protocol-relative
    # ("//host/a.pdf") links, which plain string concatenation does not.
    url = urljoin(base_url, url)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    with open(filename, "wb") as f:
        f.write(r.content)
    return filename

In [14]:
def parse_curriculum_tables(pdf_path):
    """Collect course rows from every table on every page of a PDF.

    Each table row is read positionally: first cell as the semester, second
    cell as the course name, second-to-last cell as the credits and last
    cell as the hours.
    NOTE(review): this column layout is assumed, confirm against real PDFs.

    A row is skipped when it:
      * is empty or has fewer than three cells;
      * has a non-text (e.g. ``None`` / empty) value in any of the four
        cells that are read;
      * has a blank name, or a name containing "дисципл" or "модуль"
        (treated as header rows).

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list of dicts with the string keys ``"semester"``, ``"name"``,
        ``"credits"`` and ``"hours"``. Values are strings with surrounding
        whitespace removed and internal whitespace runs (including the line
        breaks of wrapped cells) collapsed to single spaces.
    """
    courses = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    if not row or len(row) < 3:
                        continue

                    cells = (row[0], row[1], row[-2], row[-1])
                    # Explicit check instead of a blanket try/except: rows
                    # with a missing cell are skipped on purpose, while
                    # unrelated errors are no longer hidden.
                    if not all(isinstance(cell, str) for cell in cells):
                        continue

                    # split()/join trims the text and turns the "\n" of
                    # wrapped cells into a single space.
                    semester, name, credits, hours = (
                        " ".join(cell.split()) for cell in cells
                    )

                    # drop header rows
                    lowered = name.lower()
                    if not name or "дисципл" in lowered or "модуль" in lowered:
                        continue

                    courses.append({
                        "semester": semester,
                        "name": name,
                        "credits": credits,
                        "hours": hours
                    })
    return courses

In [15]:
def program_to_json(info, pdf_file, out_file):
    """Combine scraped program info with parsed courses and save it as JSON.

    Args:
        info: Mapping that provides the ``"name"`` and ``"description"`` keys.
        pdf_file: PDF passed to ``parse_curriculum_tables``.
        out_file: Path of the JSON file to write (UTF-8, 2-space indent,
            non-ASCII characters kept as-is).

    Returns:
        The dict that was written, with the keys ``"program_name"``,
        ``"description"`` and ``"courses"``.
    """
    parsed_courses = parse_curriculum_tables(pdf_file)
    payload = dict(
        program_name=info["name"],
        description=info["description"],
        courses=parsed_courses,
    )
    with open(out_file, mode="w", encoding="utf-8") as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=2)
    return payload

In [16]:
# Build one result record per program, keyed by program name.
all_results = {}

for name, info in programs.items():
    if not info["pdf_links"]:
        # No PDF was found on the page: keep the description, with no courses.
        all_results[name] = dict(
            program_name=name,
            description=info["description"],
            courses=[],
        )
        continue

    # Only the first PDF link found on the page is downloaded and parsed.
    pdf_file = BASE_DIR / f"{name}.pdf"
    download_pdf(info["pdf_links"][0], pdf_file)
    out_file = BASE_DIR / f"{name}.json"
    result = program_to_json(info, pdf_file, out_file)
    all_results[name] = result

In [17]:
# Save the combined results of all programs into a single UTF-8 JSON file.
with (BASE_DIR / "all_programs.json").open("w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)