In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from openpyxl import load_workbook
import re


# ===============================
# 1) Schlagworte + DDC-Mapping einlesen
# ===============================
idn_df = pd.read_excel("dbsm.xlsx", dtype=str)

# Lookup table with the search terms plus their weight and systematics entry.
mapping_df = pd.read_excel("schlagworte_ddc_gewichtung.xlsx", dtype=str)

# Only rows typed "Schlagwort" take part in the text search; empty terms are dropped.
keywords = mapping_df.loc[mapping_df["Typ"] == "Schlagwort", "Begriff"].dropna().tolist()

# Per-term lookups keyed by the "Begriff" column (all rows, not just keywords).
mapping_by_term = mapping_df.set_index("Begriff")
weights = mapping_by_term["Gewichtung"].astype(int).to_dict()
systematics = mapping_by_term["Systematik"].to_dict()


# ===============================
# 2) Volltext durchsuchen
# ===============================
def find_keywords(idn):
    """Scan the online text of one record for the configured keywords.

    The page at ``https://d-nb.info/<idn>/04/text`` is downloaded, reduced to
    lower-cased plain text and searched for every entry of ``keywords``. A
    keyword matches any word that contains it as a substring.

    Returns a 5-tuple ``(text_url, pdf_url, hits, total_weight, systematik)``:

    * hits found and the threshold is met (at least three distinct keywords,
      or a summed weight of at least 3): ``hits`` is a "; "-joined description
      of every matching keyword, ``total_weight`` the summed weights and
      ``systematik`` the entry of the keyword with the highest weight (ties
      broken by match count).
    * request error: ``hits`` is ``"Fehler"``, weight 0, systematik ``None``.
    * anything else (non-200 status, no hits, threshold missed):
      ``hits`` is ``None``, weight 0, systematik ``None``.
    """
    url_text = f"https://d-nb.info/{idn}/04/text"
    url_pdf = f"https://d-nb.info/{idn}/04/pdf"
    no_result = (url_text, url_pdf, None, 0, None)

    try:
        response = requests.get(url_text, timeout=10)
        if response.status_code != 200:
            return no_result

        page_text = BeautifulSoup(response.text, "html.parser").get_text().lower()

        # keyword -> (match count, weight, systematics entry)
        hits = {}
        for keyword in keywords:
            matcher = re.compile(rf"\b\w*{re.escape(keyword.lower())}\w*\b", re.IGNORECASE)
            n_matches = len(matcher.findall(page_text))
            if n_matches > 0:
                hits[keyword] = (n_matches, weights.get(keyword, 1), systematics.get(keyword, ""))

        if not hits:
            return no_result

        total_weight = sum(weight for _, weight, _ in hits.values())
        if len(hits) < 3 and total_weight < 3:
            return no_result

        # Highest weight wins; the match count breaks ties.
        winner = max(hits, key=lambda kw: (hits[kw][1], hits[kw][0]))
        summary_parts = []
        for keyword, (count, weight, _) in hits.items():
            summary_parts.append(f"{keyword} (Gewicht={weight}, Treffer={count})")
        return url_text, url_pdf, "; ".join(summary_parts), total_weight, hits[winner][2]

    except requests.RequestException:
        return url_text, url_pdf, "Fehler", 0, None


# Run the keyword search for the identifier in the first column of every row.
# No rows are removed; rows without hits simply get empty result columns.
keyword_result_columns = [
    "URL",
    "PDF-URL",
    "Gefundene Schlagwörter",
    "Gesamtgewichtung",
    "Gewinner-Systematik",
]
idn_df[keyword_result_columns] = idn_df.iloc[:, 0].apply(find_keywords).apply(pd.Series)

# The first column holds the identifiers; give it a fixed name for later steps.
idn_df = idn_df.rename(columns={idn_df.columns[0]: "IDN"})


# ===============================
# 3) SRU-Abfrage inkl. DDC
# ===============================
def query_dnb(idn):
    """Look up one record via the DNB SRU service and extract selected MARC fields.

    The record is requested in the ``MARC21plus-xml`` schema with the query
    ``idn=<idn>``.

    Args:
        idn: Record identifier; interpolated verbatim into the query URL.

    Returns:
        A 7-tuple ``(title, subtitle, author, publisher, year, dbsm_flag, ddc)``.
        Every entry is a string or ``None``. ``dbsm_flag`` is ``"x"`` when a
        holdings record carries an 852 $b containing "dbsm" (case-insensitive).
        All seven entries are ``None`` when the request fails or times out,
        the HTTP status is not 200, or the response is not well-formed XML.
    """
    no_result = (None,) * 7
    url = f"https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=idn={idn}&recordSchema=MARC21plus-xml"

    try:
        # Without a timeout a single stalled connection would block the whole run.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return no_result
    if response.status_code != 200:
        return no_result

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError:
        # Treat an unparsable response like a failed lookup instead of aborting.
        return no_result

    ns = {'marc': 'http://www.loc.gov/MARC21/slim'}

    def subfield_text(datafield, code):
        """Return the text of the first subfield with *code*, or None if absent."""
        node = datafield.find(f"marc:subfield[@code='{code}']", ns)
        return node.text if node is not None else None

    title = subtitle = author = publisher = year = ddc = None

    # When a field occurs more than once, the last occurrence wins.
    for record in root.findall(".//marc:record", ns):
        for datafield in record.findall("marc:datafield", ns):
            tag = datafield.get("tag")

            if tag == "245":  # title and subtitle
                title = subfield_text(datafield, "a")
                subtitle = subfield_text(datafield, "b")
            elif tag == "100":  # author
                author = subfield_text(datafield, "a")
            elif tag in ("260", "264"):  # publisher and year
                publisher = subfield_text(datafield, "b")
                year = subfield_text(datafield, "c")
            elif tag in ("082", "083"):  # DDC
                # NOTE(review): only one DDC code is kept although several
                # 082/083 fields may exist -- confirm this is intended.
                sub_a = datafield.find("marc:subfield[@code='a']", ns)
                if sub_a is not None:
                    ddc = sub_a.text

    # Check the holdings records for a DBSM location.
    dbsm_flag = None
    for holding in root.findall(".//marc:record[@type='Holdings']", ns):
        for datafield in holding.findall("marc:datafield[@tag='852']", ns):
            location = subfield_text(datafield, "b")
            # An empty <subfield/> has text None; guard before lower-casing.
            if location and "dbsm" in location.lower():
                dbsm_flag = "x"
                break
        if dbsm_flag:
            break

    return title, subtitle, author, publisher, year, dbsm_flag, ddc


# Add the SRU lookup results to every row; missing values become empty strings.
# The column order mirrors the order of the tuple returned by query_dnb.
sru_columns = ("Titel", "Zusatztitel", "Autor", "Verlag", "Jahr", "DBSM-Bestand", "DDC")
for idx, idn in idn_df["IDN"].items():
    for column, value in zip(sru_columns, query_dnb(idn)):
        idn_df.at[idx, column] = value or ""


# ===============================
# 4) Schlagwort- und DDC-Mapping
# ===============================
def apply_mapping(row):
    """Refine weight and systematics of one row using its title and DDC.

    Starting from the row's "Gesamtgewichtung" and "Gewinner-Systematik":

    * Every keyword found as a whole word in "Titel" or "Zusatztitel" adds its
      weight (default 1) and is recorded as an extra hit. If the row has no
      systematics entry yet, the keyword's entry is used.
    * Every ";"-separated code in "DDC" that is itself a key of
      ``systematics``, or for which a key starting with ``"<code> +"`` exists,
      overrides the systematics entry and is recorded as an extra hit.

    Args:
        row: One DataFrame row (accessed by column label).

    Returns:
        ``pd.Series([total_weight, systematik, extra_hits])`` where
        ``extra_hits`` is the "; "-joined list of additional matches.
    """

    def is_missing(value):
        # NaN is truthy, so a plain ``not value`` would treat it as present.
        return pd.isna(value) or not value

    total_weight = row["Gesamtgewichtung"] if pd.notna(row["Gesamtgewichtung"]) else 0
    systematik = row["Gewinner-Systematik"]
    found_extra = []

    # Search title and subtitle (when present) for whole-word keyword matches.
    for text in [row.get("Titel"), row.get("Zusatztitel")]:
        if pd.notna(text):
            t = text.lower()
            for w in keywords:
                if re.search(rf"\b{re.escape(w.lower())}\b", t):
                    total_weight += weights.get(w, 1)
                    found_extra.append(w)
                    if is_missing(systematik):
                        systematik = systematics.get(w)

    # Check the DDC codes against the mapping table.
    if pd.notna(row.get("DDC")):
        for ddc_code in str(row["DDC"]).split(";"):
            ddc_code = ddc_code.strip()
            if not ddc_code:
                continue
            if ddc_code in systematics:
                systematik = systematics[ddc_code]
                found_extra.append(f"DDC {ddc_code}")
            prefix = f"{ddc_code} +"
            for key, value in systematics.items():
                # Keys come from a spreadsheet column and may be NaN (a float).
                if isinstance(key, str) and key.startswith(prefix):
                    systematik = value
                    found_extra.append(key)

    return pd.Series([total_weight, systematik, "; ".join(found_extra)])


# Overwrite the preliminary weight and systematics columns with the refined
# values from apply_mapping and add the column listing the extra hits.
mapping_result = idn_df.apply(apply_mapping, axis=1)
idn_df[["Gesamtgewichtung", "Gewinner-Systematik", "Zusätzliche Treffer"]] = mapping_result


# ===============================
# 5) KEIN Filtern mehr!
# ===============================
# Alle IDNs bleiben erhalten, auch ohne Treffer


# ===============================
# 6) Excel speichern + Hyperlinks
# ===============================
output_file = "dbsm-treffer.xlsx"
idn_df.to_excel(output_file, index=False)

# Re-open the written workbook to turn the two URL columns into clickable links.
wb = load_workbook(output_file)
ws = wb.active

# Locate the 1-based column positions of the link columns in the header row.
col_url = [pos for pos, cell in enumerate(ws[1], start=1) if cell.value == "URL"][0]
col_pdf = [pos for pos, cell in enumerate(ws[1], start=1) if cell.value == "PDF-URL"][0]

for row in range(2, ws.max_row + 1):
    for link_col in (col_url, col_pdf):
        cell = ws.cell(row=row, column=link_col)
        if cell.value:
            # The cell text doubles as the link target.
            cell.hyperlink = cell.value
            cell.style = "Hyperlink"

wb.save(output_file)
print("Fertig! Ergebnisse dbsm-treffer.xlsx")


Fertig! Ergebnisse dbsm-treffer.xlsx
