# Poprawianie danych książkowych przy użyciu lokalnego RDF (Wikidata)

W tym notebooku:
1. Pobierzemy przykładowe dane książkowe z Wikidata jako RDF.
2. Zamienimy je na plik Turtle (.ttl).
3. Załadujemy własne dane po OCR (np. naprawione przez T5/BART).
4. Poprawimy dane, korzystając z RDF i fuzzy-matching.


## Pobieranie przykładowych danych z Wikidata (SPARQL)

In [3]:
import requests
import pandas as pd
from rdflib import Graph
from collections import defaultdict
from rapidfuzz import process, fuzz
import matplotlib.pyplot as plt
import seaborn as sns
from difflib import SequenceMatcher

In [4]:
LANGS = ["pl", "en", "de", "fr", "es", "it", "cs", "ru", "hu"]

label_select = " ".join([f'?title_{lang}' for lang in LANGS])
label_opt = "\n".join([f'OPTIONAL {{ ?book rdfs:label ?title_{lang} . FILTER(LANG(?title_{lang})="{lang}") }}' for lang in LANGS])
author_opt = "\n".join([f'OPTIONAL {{ ?author rdfs:label ?authorLabel_{lang} . FILTER(LANG(?authorLabel_{lang})="{lang}") }}' for lang in LANGS])
publisher_opt = "\n".join([f'OPTIONAL {{ ?publisher rdfs:label ?publisherLabel_{lang} . FILTER(LANG(?publisherLabel_{lang})="{lang}") }}' for lang in LANGS])
genre_opt = "\n".join([f'OPTIONAL {{ ?genre rdfs:label ?genreLabel_{lang} . FILTER(LANG(?genreLabel_{lang})="{lang}") }}' for lang in LANGS])

SPARQL = f"""
SELECT DISTINCT ?book {label_select} ?author {" ".join([f"?authorLabel_{lang}" for lang in LANGS])} ?publisher {" ".join([f"?publisherLabel_{lang}" for lang in LANGS])} ?genre {" ".join([f"?genreLabel_{lang}" for lang in LANGS])} WHERE {{
  ?book wdt:P31 wd:Q571 .
  {label_opt}
  OPTIONAL {{ ?book wdt:P50 ?author . }}
  {author_opt}
  OPTIONAL {{ ?book wdt:P123 ?publisher . }}
  {publisher_opt}
  OPTIONAL {{ ?book wdt:P136 ?genre . }}
  {genre_opt}
}}
LIMIT 300
"""

url = "https://query.wikidata.org/sparql"
r = requests.get(url, params={"query": SPARQL, "format": "csv"})
with open("wikidata_books_multilang.csv", "w", encoding="utf-8") as f:
    f.write(r.text)
print("Zapisano wikidata_books_multilang.csv")

Zapisano wikidata_books_multilang.csv


## CSV → RDF

In [5]:
df = pd.read_csv("wikidata_books_multilang.csv")

def safe_literal(val):
    return str(val).replace('"', '\\"').replace('\\', '\\\\') if pd.notna(val) else None

with open("books_wikidata_multilang.ttl", "w", encoding="utf-8") as f:
    f.write('@prefix schema: <http://schema.org/> .\n')
    for idx, row in df.iterrows():
        uri = f"<{row['book']}>"
        f.write(f"{uri} a schema:Book ;\n")
        for lang in LANGS:
            if not pd.isna(row.get(f"title_{lang}", None)):
                f.write(f'  schema:name "{safe_literal(row[f"title_{lang}"])}"@{lang} ;\n')
        for lang in LANGS:
            if not pd.isna(row.get(f"authorLabel_{lang}", None)):
                f.write(f'  schema:author "{safe_literal(row[f"authorLabel_{lang}"])}"@{lang} ;\n')
        for lang in LANGS:
            if not pd.isna(row.get(f"publisherLabel_{lang}", None)):
                f.write(f'  schema:publisher "{safe_literal(row[f"publisherLabel_{lang}"])}"@{lang} ;\n')
        for lang in LANGS:
            if not pd.isna(row.get(f"genreLabel_{lang}", None)):
                f.write(f'  schema:genre "{safe_literal(row[f"genreLabel_{lang}"])}"@{lang} ;\n')
        f.write('.\n\n')
print("Zapisano books_wikidata_multilang.ttl")

Zapisano books_wikidata_multilang.ttl


## Ładowanie danych po T5/BART/OCR

In [6]:
path = "../test_results/books_corrected_bart.csv"
data = pd.read_csv(path)
display(data.head(3))

Unnamed: 0,title,authors,category,publisher
0,Goat Brothers,"By Colton, Larry","History , General",Doubleday
1,The Missing Person,"By Garmbach, Doris","Fiction , General",
2,,,,


## Parsowanie RDF

In [7]:
rdf_path = "books_wikidata_multilang.ttl"
g = Graph()
g.parse(rdf_path, format="ttl")

# Zbuduj: lang → lista (tytuł, autor, wydawca, gatunek)
kb = defaultdict(list)
for s in g.subjects(predicate=None, object=None):
    book_info = {"uri": s}
    for lang in LANGS:
        titles = list(g.objects(s, g.namespace_manager.expand("schema:name")))
        titles = [str(t) for t in titles if str(t).endswith(f'@{lang}')]
        if titles:
            book_info["title"] = titles[0][:-len(f'@{lang}')]

            authors = [str(a) for a in g.objects(s, g.namespace_manager.expand("schema:author")) if str(a).endswith(f'@{lang}')]
            publishers = [str(a) for a in g.objects(s, g.namespace_manager.expand("schema:publisher")) if str(a).endswith(f'@{lang}')]
            genres = [str(a) for a in g.objects(s, g.namespace_manager.expand("schema:genre")) if str(a).endswith(f'@{lang}')]
            book_info["author"] = authors[0][:-len(f'@{lang}')] if authors else None
            book_info["publisher"] = publishers[0][:-len(f'@{lang}')] if publishers else None
            book_info["genre"] = genres[0][:-len(f'@{lang}')] if genres else None
            kb[lang].append(book_info.copy())

print("Rekordy w bazie wiedzy (przykład PL):", len(kb["pl"]))
print(kb["pl"][0])


Rekordy w bazie wiedzy (przykład PL): 0


IndexError: list index out of range

## Dopasowanie fuzzy

In [8]:
def correct_with_kb_multilang(row, kb, langs=LANGS, score_cutoff=90):
    title = row["title"]
    results = []
    for lang in langs:
        kb_titles = [b["title"] for b in kb[lang] if b["title"]]
        if not kb_titles: continue
        match, score, idx = process.extractOne(title, kb_titles, scorer=fuzz.token_sort_ratio)
        if score >= score_cutoff:
            rec = kb[lang][idx]
            rec = rec.copy()
            rec["kb_lang"] = lang
            rec["kb_score"] = score
            results.append(rec)
    if results:
        # wybierz najlepsze dopasowanie
        best = max(results, key=lambda x: x["kb_score"])
        row["title_corrected"] = best["title"]
        row["authors_corrected"] = best.get("author", row.get("authors"))
        row["publisher_corrected"] = best.get("publisher", row.get("publisher"))
        row["category_corrected"] = best.get("genre", row.get("category"))
        row["kb_lang"] = best["kb_lang"]
        row["kb_score"] = best["kb_score"]
    else:
        row["title_corrected"] = row["title"]
        row["authors_corrected"] = row.get("authors")
        row["publisher_corrected"] = row.get("publisher")
        row["category_corrected"] = row.get("category")
        row["kb_lang"] = None
        row["kb_score"] = 0
    return row

data_fixed = data.apply(lambda row: correct_with_kb_multilang(row, kb), axis=1)
display(data_fixed.head(5))

KeyboardInterrupt: 

## Eksport

In [None]:
data_fixed.to_csv("final_corrected_books_multilang.csv", index=False)
print("✔️ Zapisano final_corrected_books_multilang.csv")

## Walidacja i analiza wyników

In [None]:
reference = None
try:
    reference = pd.read_csv("../test_results/books_reference.csv"
    data_fixed["title_true"] = reference["title"]
    data_fixed["authors_true"] = reference["authors"]
    data_fixed["publisher_true"] = reference["publisher"]
    data_fixed["category_true"] = reference["category"]
except Exception as e:
    print("Brak pliku original.csv lub niezgodność danych. Walidacja tylko na podstawie tytułów przed/po.")
def sim(a, b):
    if pd.isna(a) or pd.isna(b):
        return 0
    return SequenceMatcher(None, str(a), str(b)).ratio()

data_fixed["title_sim_before"] = data_fixed.apply(lambda r: sim(r["title"], r.get("title_true", "")), axis=1)
data_fixed["title_sim_after"]  = data_fixed.apply(lambda r: sim(r["title_corrected"], r.get("title_true", "")), axis=1)

plt.figure(figsize=(7,3))
sns.histplot(data_fixed["title_sim_before"], label="Przed RDF", color="orange", kde=True, stat='density')
sns.histplot(data_fixed["title_sim_after"], label="Po RDF", color="green", kde=True, stat='density')
plt.title("Podobieństwo tytułu do referencyjnego (przed/po)")
plt.xlabel("Levenshtein similarity [0–1]")
plt.legend(); plt.show()

# Ile rekordów się poprawiło?
improved = (data_fixed["title_sim_after"] > data_fixed["title_sim_before"]).sum()
no_change = (data_fixed["title_sim_after"] == data_fixed["title_sim_before"]).sum()
worse = (data_fixed["title_sim_after"] < data_fixed["title_sim_before"]).sum()

plt.figure(figsize=(4,2.5))
plt.bar(["Poprawione", "Bez zmiany", "Pogorszone"], [improved, no_change, worse], color=["green", "gray", "red"])
plt.ylabel("Liczba rekordów")
plt.title("Efekt korekty przez RDF")
plt.show()

# Podsumowanie po języku, w którym znaleziono dopasowanie
plt.figure(figsize=(8,3))
sns.countplot(data=data_fixed, x="kb_lang", order=LANGS)
plt.ylabel("Liczba rekordów")
plt.xlabel("Język rekordu w KB (RDF)")
plt.title("Dopasowania po językach")
plt.show()

# Wyświetl przykłady, które się poprawiły i pogorszyły
print("\nPRZYKŁADY: Poprawione tytuły (przed → po):\n")
improved_rows = data_fixed[data_fixed["title_sim_after"] > data_fixed["title_sim_before"]].head(3)
for _, r in improved_rows.iterrows():
    print("OCR/T5:", r["title"])
    print("Po RDF:", r["title_corrected"])
    print("Oryginał:", r.get("title_true", "???"))
    print("---")

print("\nPRZYKŁADY: Pogorszone tytuły (przed → po):\n")
worse_rows = data_fixed[data_fixed["title_sim_after"] < data_fixed["title_sim_before"]].head(3)
for _, r in worse_rows.iterrows():
    print("OCR/T5:", r["title"])
    print("Po RDF:", r["title_corrected"])
    print("Oryginał:", r.get("title_true", "???"))
    print("---")