# In [None]:  (Jupyter notebook cell marker left over from the export)
import pandas as pd
import requests
import xml.etree.ElementTree as ET

def query_dnb(idn):
    """Look up one record in the DNB SRU catalogue by its IDN.

    Sends a ``searchRetrieve`` request with the query ``idn=<idn>`` asking for
    MARC21-xml, then reads the following subfields from the first record in
    the response:

    * title     -- field 245, subfield ``a``
    * author    -- field 100, subfield ``a``
    * publisher -- field 260 or 264, subfield ``b``
    * year      -- field 260 or 264, subfield ``c``

    Args:
        idn: Record identifier; it is inserted into the SRU query string.

    Returns:
        A ``(title, author, publisher, year)`` tuple. Each element is a string
        or ``None`` when the subfield is missing. All four are ``None`` when
        the request fails, the server does not answer with status 200, the
        body is not well-formed XML, or the response contains no record.
    """
    not_found = (None, None, None, None)

    try:
        # Passing the parameters separately lets requests URL-encode them, and
        # the timeout keeps one stalled connection from hanging a whole batch.
        response = requests.get(
            "https://services.dnb.de/sru/dnb",
            params={
                "version": "1.1",
                "operation": "searchRetrieve",
                "query": f"idn={idn}",
                "recordSchema": "MARC21-xml",
            },
            timeout=30,
        )
    except requests.RequestException:
        # Treated like a non-200 answer so the caller can carry on.
        return not_found

    if response.status_code != 200:
        return not_found

    try:
        # Parse the raw bytes so the XML parser applies the encoding declared
        # in the document instead of relying on requests' charset guess.
        root = ET.fromstring(response.content)
    except ET.ParseError:
        return not_found

    ns = {'marc': 'http://www.loc.gov/MARC21/slim'}

    # Only the first hit is used.
    record = root.find(".//marc:record", ns)
    if record is None:
        return not_found

    def subfield_text(datafield, code):
        """Return the text of the first subfield with *code*, or None."""
        node = datafield.find(f"marc:subfield[@code='{code}']", ns)
        return node.text if node is not None else None

    title = author = publisher = year = None
    for datafield in record.findall("marc:datafield", ns):
        tag = datafield.get("tag")

        # Keep the first value found for each item: a record may carry
        # several 260/264 fields, and a later one that lacks $b or $c must not
        # erase what an earlier field already supplied.
        if tag == "245":  # title
            if title is None:
                title = subfield_text(datafield, "a")
        elif tag == "100":  # author
            if author is None:
                author = subfield_text(datafield, "a")
        elif tag in ("260", "264"):  # publisher & year
            if publisher is None:
                publisher = subfield_text(datafield, "b")
            if year is None:
                year = subfield_text(datafield, "c")

    return title, author, publisher, year

def _normalize_idn(value):
    """Return the cell *value* of the 'idn' column as a clean query string."""
    # A numeric column that contains empty cells is loaded as float, so an
    # identifier such as 123456789 arrives as 123456789.0; drop the ".0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def process_excel(file_path):
    """Enrich an Excel sheet of IDNs with catalogue data and save the result.

    Reads ``file_path``, looks up every non-empty value of the ``idn`` column
    with ``query_dnb`` and stores the answers in the new columns ``Titel``,
    ``Autor``, ``Verlag`` and ``Jahr`` (empty string where nothing was found).
    The table is written to ``output.xlsx`` and then passed to
    ``remove_publishers`` together with ``Verlage.xlsx``.

    Args:
        file_path: Path of the Excel file to read; it must contain a column
            named ``idn`` (surrounding whitespace in column names is ignored).

    Returns:
        None. If the ``idn`` column is missing, an error message is printed
        and nothing is written.
    """
    df = pd.read_excel(file_path)
    df.columns = df.columns.str.strip()  # remove stray whitespace in column names

    if "idn" not in df.columns:
        print("Fehler: Die Spalte 'idn' wurde nicht gefunden. Verfügbare Spalten:", df.columns)
        return

    result_columns = ("Titel", "Autor", "Verlag", "Jahr")
    for column in result_columns:
        df[column] = ""

    for index, raw_idn in df["idn"].items():
        if pd.isna(raw_idn):
            continue
        idn = _normalize_idn(raw_idn)
        if not idn:
            # Cell held only whitespace; there is nothing to look up.
            continue
        # query_dnb returns its values in the same order as result_columns.
        for column, value in zip(result_columns, query_dnb(idn)):
            df.at[index, column] = value if value else ""

    output_file = "output.xlsx"
    df.to_excel(output_file, index=False)
    print(f"Ergebnisse wurden in {output_file} gespeichert.")

    # Drop the rows whose publisher is listed in Verlage.xlsx.
    remove_publishers(output_file, "Verlage.xlsx")

def remove_publishers(output_file, verlage_file):
    """Drop every row of ``output_file`` whose publisher is listed in ``verlage_file``.

    Both Excel files are loaded and their column names are stripped of
    surrounding whitespace. Rows of the first file whose ``Verlag`` value
    appears among the non-empty ``Verlag`` values of the second file are
    removed, and the remaining rows are written back to ``output_file``.

    Args:
        output_file: Excel file to filter; it is overwritten with the result.
        verlage_file: Excel file whose ``Verlag`` column lists the publishers
            to exclude.

    Returns:
        None. If either file lacks a ``Verlag`` column, an error message is
        printed and ``output_file`` is left as it was.
    """
    tables = []
    for path in (output_file, verlage_file):
        table = pd.read_excel(path)
        table.columns = table.columns.str.strip()
        tables.append(table)
    results, excluded = tables

    print("Spalten in output.xlsx:", results.columns)
    print("Spalten in Verlage.xlsx:", excluded.columns)

    both_have_column = "Verlag" in results.columns and "Verlag" in excluded.columns
    if not both_have_column:
        print("Fehler: Die Spalte 'Verlag' wurde nicht in einer der Dateien gefunden.")
        return

    # Distinct, non-missing publisher names that must be filtered out.
    unwanted = excluded["Verlag"].dropna().unique().tolist()
    is_unwanted = results["Verlag"].isin(unwanted)
    remaining = results[~is_unwanted]

    remaining.to_excel(output_file, index=False)
    print("Gefilterte Datei wurde gespeichert.")

if __name__ == "__main__":
    # Script entry point: process the workbook that holds the IDNs to look up.
    input_workbook = "idn_mit_schlagwort.xlsx"
    process_excel(input_workbook)