In [3]:
import fitz, re, json, hashlib
from pathlib import Path
from rdflib import Graph, Namespace, Literal, RDF, URIRef
from rdflib.namespace import XSD

# File names of the PDFs to scan. They are opened as given, i.e. relative to
# the current working directory.
PDFS = [
    "Wettbewerb Saint-Louis-Park_Montauk_PLAN 2_A3.pdf",
    "Wettbewerb Saint-Louis-Park_Montauk_PLAN 1_A3.pdf",
    "Wettbewerb Saint-Louis-Park_Montauk_Nachweis Mengenangaben_Schema.pdf",
    "Wettbewerb Saint-Louis-Park_Montauk_Nachweis Flächenangaben Naturschutz_Schema.pdf",
    "Situation Schwamendingenstrasse Haltestelle Dorflinde.pdf",
    "Situation Querspange Polysteig bis Künstlergasse.pdf",
]

# Keyword table: category name -> German search terms that are looked up in
# the extracted PDF text.
CATS = {
    "Hardscape": [
        "Asphalt", "Pflästerung", "Chaussierung", "Platten", "Beton",
        "Randstein", "Bundstein", "Bordstein", "Belag", "Stellplatte",
        "Treppen", "Mauer", "Stampflehm", "Naturstein",
    ],
    "Vegetation": [
        "Blumenrasen", "Blumenwiese", "Magerwiese", "Krautsaum", "Wildhecke",
        "Laubbäume", "Nadelbäume", "Baum", "Rasen", "Ruderal", "Trockenrasen",
    ],
    "TechnicalInfrastructure": [
        "Beleuchtung", "Entwässerung", "Leitung", "Fernwärme", "Hydrant",
        "Kanal",
    ],
    "UrbanFurniture": [
        "Sitzbank", "Bank", "Infotafel", "VKT", "Velopfosten", "Billett",
        "Trinkbrunnen", "Abfall", "Geländer",
    ],
    "Structure": [
        "Steg", "Pavillon", "Turm", "Sitzarena", "Aussichtsturm", "Plattform",
        "Brücke", "Zaun",
    ],
    "Water": [
        "Retentionsbecken", "Wassertrog", "temporäre Wasserflächen",
        "Gewässer", "Dachwasser",
    ],
    "Recycling": [
        "Recycling", "Inventar", "Granitblöcke", "Kalksteinblöcke",
        "Schienen", "Prellbock",
    ],
}

# One (compiled regex, keyword, category) triple per keyword, in table order.
# Each regex matches the keyword as a whole word, ignoring case.
patterns = [
    (re.compile(rf"\b{re.escape(term)}\b", re.I), term, category)
    for category, terms in CATS.items()
    for term in terms
]

def blocks(pdf):
    """Yield every non-empty text block of a PDF, page by page.

    Args:
        pdf: Path of the PDF file; passed unchanged to ``fitz.open``.

    Yields:
        dict with the keys ``page`` (1-based page number), ``bbox``
        (``[x0, y0, x1, y1]`` as delivered by ``get_text("blocks")``) and
        ``text`` (the block text with surrounding whitespace removed).
        Blocks whose text is empty or whitespace-only are skipped.
    """
    doc = fitz.open(pdf)
    try:
        for page_no, page in enumerate(doc, start=1):
            # Only the first five tuple fields are used; the rest is ignored.
            for x0, y0, x1, y1, text, *_ in page.get_text("blocks"):
                if text and text.strip():
                    yield dict(page=page_no, bbox=[x0, y0, x1, y1], text=text.strip())
    finally:
        # Close the document on normal exhaustion, on an extraction error,
        # and when the caller abandons the generator before the end.
        doc.close()

# Unit detectors, tried in order; the first one that matches wins.
# The lookbehind rejects a preceding letter or underscore but allows a digit,
# so "250 m2" and "250m2" are both recognised while "Raum2" or "Grundstück"
# are not. The lookahead requires the unit token to end there.
_UNIT_PATTERNS = (
    (re.compile(r"(?<![^\W\d])m(?:2|²)(?!\w)", re.I), "M2"),
    (re.compile(r"(?<![^\W\d])(?:Stk|Stück)(?!\w)", re.I), "C62"),
    (re.compile(r"(?<![^\W\d])lfm(?!\w)", re.I), "MTR"),
)


def unit(text):
    """Return a unit code for the first kind of unit token found in *text*.

    Args:
        text: Text of one extracted block.

    Returns:
        ``"M2"`` for ``m2``/``m²``, ``"C62"`` for ``Stk``/``Stk.``/``Stück``,
        ``"MTR"`` for ``lfm``, or ``None`` when no unit token is present.
        Matching ignores case; when several kinds occur, the order above
        decides.
    """
    for pattern, code in _UNIT_PATTERNS:
        if pattern.search(text):
            return code
    return None

# A number is either digit groups joined by a thousands separator
# ("1’234", "1'234", "1.234") or a plain run of digits, optionally followed by
# a decimal part introduced by "." or ",". The grouped form must not start
# with 0 and must not run on into more digits, so "0.125" and "1.2345" are
# read as decimals rather than as grouped integers.
_QTY_NUMBER = (
    r"(?P<whole>[1-9]\d{0,2}(?:[.’']\d{3})+(?!\d)|\d+)"
    r"(?:[.,](?P<frac>\d+))?"
)
_QTY_WITH_UNIT_RE = re.compile(
    _QTY_NUMBER + r"\s*(?:m2|m²|Stk|Stück|lfm)(?!\w)", re.I
)
_QTY_ANY_RE = re.compile(_QTY_NUMBER)
_QTY_SEPARATORS = str.maketrans("", "", ".’'")


def qty(text):
    """Extract a quantity from *text* as a float.

    A number directly followed by a unit token (``m2``, ``m²``, ``Stk``,
    ``Stück``, ``lfm``) is preferred; if there is none, the first number in
    the text is used.

    Args:
        text: Text of one extracted block.

    Returns:
        The parsed number, or ``None`` when *text* contains no number.
    """
    match = _QTY_WITH_UNIT_RE.search(text) or _QTY_ANY_RE.search(text)
    if match is None:
        return None
    # Only the integer part can contain thousands separators; strip them
    # there and keep the decimal part intact.
    whole = match.group("whole").translate(_QTY_SEPARATORS)
    frac = match.group("frac")
    try:
        return float(f"{whole}.{frac}" if frac else whole)
    except ValueError:
        return None

# Collect: one record per (text block, matching keyword) pair.
recs = []
for pdf_name in PDFS:
    pdf_path = Path(pdf_name)
    if not pdf_path.exists():
        # Input files that are not present are skipped.
        continue
    for blk in blocks(str(pdf_path)):
        text = blk["text"]
        hits = [(word, cat) for pat, word, cat in patterns if pat.search(text)]
        for word, cat in hits:
            recs.append(dict(
                pdf=pdf_path.name,
                page=blk["page"],
                bbox=[round(v, 2) for v in blk["bbox"]],
                name=word,
                category=cat,
                # The stored context is capped at 250 characters.
                context=text[:250],
                quantity=qty(text),
                unit=unit(text),
            ))

# De-duplicate: keep the first record for each (pdf, page, name, context)
# combination, preserving the original order.
seen = set()
recs2 = []
for r in recs:
    k = (r["pdf"], r["page"], r["name"], r["context"])
    if k not in seen:
        seen.add(k)
        recs2.append(r)

# Export the de-duplicated records as CSV and JSON into ./out/.
out = Path("out")
out.mkdir(exist_ok=True)

import pandas as pd

csv_path = out / "materials_extracted.csv"
pd.DataFrame(recs2).to_csv(csv_path, index=False)

json_path = out / "materials_extracted.json"
with json_path.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps umlauts readable in the output file.
    json.dump(recs2, f, ensure_ascii=False, indent=2)

# RDF/Turtle: namespaces used in the graph, and the graph itself with the
# matching prefixes bound.
ZOE = Namespace("http://example.org/zoe#")
SCHEMA = Namespace("http://schema.org/")
UNIT = Namespace("http://qudt.org/vocab/unit/")
PROV = Namespace("http://www.w3.org/ns/prov#")

g = Graph()
for _pfx, _ns in (("zoe", ZOE), ("schema", SCHEMA), ("unit", UNIT), ("prov", PROV)):
    g.bind(_pfx, _ns)

def urn(prefix, s):
    """Return a ``urn:<prefix>:<digest>`` URIRef for the string *s*.

    The digest is the first 12 hex characters of the SHA-1 of *s* (encoded
    with the default ``str.encode()`` encoding), so equal inputs always give
    the same URI.
    """
    digest = hashlib.sha1(s.encode()).hexdigest()
    short = digest[:12]
    return URIRef(f"urn:{prefix}:{short}")
# Build the graph: one node per source document, one per category and one
# per material record, then write it out as Turtle.
docs = {}
cats = {}
# Unit codes that have a matching resource in the UNIT namespace; any other
# code is stored as a plain literal.
unit_nodes = {"M2": UNIT.M2, "MTR": UNIT.M}

for r in recs2:
    # Document node, created the first time a PDF is seen.
    du = docs.get(r["pdf"])
    if du is None:
        du = docs[r["pdf"]] = urn("doc", r["pdf"])
        g.add((du, RDF.type, ZOE.DesignPlan))
        g.add((du, SCHEMA.name, Literal(r["pdf"])))

    # Category node, created the first time a category is seen.
    cu = cats.get(r["category"])
    if cu is None:
        cu = cats[r["category"]] = URIRef(f"http://example.org/zoe#{r['category']}")
        g.add((cu, RDF.type, ZOE.Kategorie))
        g.add((cu, SCHEMA.name, Literal(r["category"])))

    # Material node. The URN is derived from the same fields as the
    # de-duplication key (including the full stored context), so two distinct
    # records can never collapse into one node.
    mu = urn("mat", f"{r['pdf']}|{r['page']}|{r['name']}|{r['context']}")
    g.add((mu, RDF.type, ZOE.Material))
    g.add((mu, SCHEMA.name, Literal(r["name"])))
    g.add((mu, ZOE.hatKategorie, cu))
    g.add((mu, ZOE.aufSeite, Literal(int(r["page"]), datatype=XSD.integer)))
    # Link to the document node itself instead of a free-text string; the
    # page number is already recorded via zoe:aufSeite.
    g.add((mu, PROV.wasDerivedFrom, du))
    if r["quantity"] is not None:
        g.add((mu, ZOE.hatMenge, Literal(r["quantity"], datatype=XSD.decimal)))
    if r["unit"]:
        g.add((mu, ZOE.hatEinheit, unit_nodes.get(r["unit"], Literal(r["unit"]))))

g.serialize(out / "zoe_graph.ttl", "turtle")

print("Fertig. Dateien unter ./out/")


ModuleNotFoundError: No module named 'rdflib'