## Importy, konfiguracja, wybór plików

In [13]:
import pandas as pd
import requests
from tqdm import tqdm
from rapidfuzz import fuzz
from difflib import SequenceMatcher
import matplotlib.pyplot as plt
import seaborn as sns

# Which corrector's output to post-process: "bart" or "t5".
MODEL = "bart"

# All input/output files share one results directory.
RESULTS_DIR = "../test_results"
INPUT_PATH = f"{RESULTS_DIR}/books_corrected_{MODEL}.csv"
REFERENCE_PATH = f"{RESULTS_DIR}/books_reference.csv"
OUTPUT_PATH = f"{RESULTS_DIR}/books_semantic_corrected_{MODEL}.csv"


## Wczytaj wyniki modelu i dane referencyjne

In [14]:
# Load the corrector's output and the reference table it is compared against.
df, df_ref = (pd.read_csv(path) for path in (INPUT_PATH, REFERENCE_PATH))
# Quick sanity check: first two rows and the total row count.
print(df.head(2))
print("Liczba rekordów:", len(df))


                title             authors            category  publisher
0       Goat Brothers    By Colton, Larry   History , General  Doubleday
1  The Missing Person  By Garmbach, Doris   Fiction , General        NaN
Liczba rekordów: 103063


## Funkcja pobierająca dane z API semantycznego internetu – z fuzzy matching
Tu dla przykładu Google Books

In [22]:


def _as_text(value):
    """Return ``value`` as a stripped string, mapping ``None`` and NaN to ``""``."""
    if value is None:
        return ""
    # NaN is the only float that differs from itself. pandas uses it for
    # missing cells and it is truthy, so a plain ``if value:`` lets it through.
    if isinstance(value, float) and value != value:
        return ""
    return str(value).strip()


def query_google_books_fuzzy(title, author=None, max_results=10, debug=False):
    """Look a book up in Google Books and return the best fuzzy match.

    The Google Books volumes endpoint is queried with ``intitle:`` (and
    ``inauthor:`` when an author is given). Every returned volume is scored
    with ``fuzz.ratio`` on the title, averaged with the author ratio when both
    sides have an author, plus 10 points for each of authors / publisher /
    categories that the volume provides. The highest-scoring volume wins.

    Args:
        title: Title to search for. ``None``/NaN/blank means "no query".
        author: Optional author; ``None``/NaN/blank is treated as absent.
        max_results: Value sent as the API's ``maxResults`` parameter.
        debug: When true, print the score of every candidate (and the HTTP
            status of a non-200 response).

    Returns:
        dict with keys ``title_sem``, ``authors_sem``, ``publisher_sem`` and
        ``category_sem``. All values are empty strings when there is nothing
        to search for, the response is not HTTP 200, no volume is returned,
        or the request fails (the error is printed, not raised).
    """
    empty = {"title_sem": "", "authors_sem": "", "publisher_sem": "", "category_sem": ""}

    title = _as_text(title)
    author = _as_text(author)
    if not title:
        # Nothing to search for (missing/NaN title): skip the HTTP round trip.
        return empty

    url = "https://www.googleapis.com/books/v1/volumes"
    # requests URL-encodes the parameter value, so the terms must be separated
    # by a real space; a literal "+" would be sent as %2B and become part of
    # the search text instead of acting as a separator.
    query = f"intitle:{title}"
    if author:
        query += f" inauthor:{author}"
    params = {"q": query, "maxResults": max_results}

    try:
        r = requests.get(url, params=params, timeout=10)
        if r.status_code != 200:
            if debug:
                print(f"HTTP {r.status_code}")
            return empty

        items = r.json().get("items", [])
        best_result = None
        best_score = -1
        for item in items:
            info = item.get("volumeInfo", {})
            candidate_authors = info.get("authors") or []
            # Fuzzy match on the title (a null/missing title counts as "").
            score = fuzz.ratio(title.lower(), (info.get("title") or "").lower())
            if author and candidate_authors:
                author_score = fuzz.ratio(author.lower(), ", ".join(candidate_authors).lower())
                score = 0.5 * score + 0.5 * author_score
            # Bonus for every additional field the candidate can fill in.
            score += 10 * sum([
                bool(info.get("authors")),
                bool(info.get("publisher")),
                bool(info.get("categories"))
            ])
            if debug:
                print(f"Score: {score} | Tytuł: {info.get('title', '')} | Autor: {info.get('authors', '')}")
            if score > best_score:
                best_score = score
                best_result = info

        if best_result:
            return {
                "title_sem": best_result.get("title", ""),
                "authors_sem": ", ".join(best_result.get("authors", [])),
                "publisher_sem": best_result.get("publisher", ""),
                "category_sem": ", ".join(best_result.get("categories", [])),
            }
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not abort a long
        # batch run, so the error is reported and an empty result returned.
        print("Błąd zapytania:", e)
    return empty



# Smoke test of the lookup with per-candidate scores printed (debug=True).
demo_result = query_google_books_fuzzy("Wiedźmn", "Andrej Sapkowski", debug=True)
print(demo_result)



{'title_sem': '', 'authors_sem': '', 'publisher_sem': '', 'category_sem': ''}


## Przetwarzanie całego pliku z postępem

In [11]:
# Enrich every input row with the Google Books lookup and save the result.
# One HTTP request is made per row, so a full run over the input is slow.
results = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    # The lookup defined in this notebook is query_google_books_fuzzy; the
    # previously referenced query_openlibrary_fuzzy does not exist here and
    # raised NameError on a fresh kernel.
    res = query_google_books_fuzzy(row["title"], row.get("authors", None))
    # Keep the original columns and append the *_sem columns next to them.
    combined = row.to_dict()
    combined.update(res)
    results.append(combined)
df_sem = pd.DataFrame(results)
df_sem.to_csv(OUTPUT_PATH, index=False)
print(f"✔ Wyniki zapisane do: {OUTPUT_PATH}")

  0%|          | 6/103063 [00:04<21:51:21,  1.31it/s]


KeyboardInterrupt: 

## Porównanie z plikiem referencyjnym + wizualizacja

In [None]:
def sim(a, b):
    """Return the difflib similarity ratio (0.0-1.0) of the string forms of ``a`` and ``b``."""
    left, right = str(a), str(b)
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()


COMPARED_COLUMNS = ("title", "authors", "publisher", "category")

# Pass 1: per-row similarity between each *_sem column and the reference column.
# Rows are paired by position via zip.
for col in COMPARED_COLUMNS:
    col_sem = col + "_sem"
    if col_sem not in df_sem.columns or col not in df_ref.columns:
        continue
    value_pairs = zip(df_sem[col_sem], df_ref[col])
    df_sem[f"{col}_similarity"] = [
        sim(sem_value, ref_value) for sem_value, ref_value in value_pairs
    ]

# Pass 2: report the mean and plot the distribution for every computed column.
for col in COMPARED_COLUMNS:
    similarity = df_sem.get(f"{col}_similarity")
    if similarity is None:
        continue
    print(f"\n{col} - mean similarity: {similarity.mean():.3f}")
    fig, ax = plt.subplots(figsize=(7, 2.5))
    sns.histplot(similarity, kde=True, stat='density', ax=ax)
    ax.set_title(f"Similarity for: {col}")
    ax.set_xlabel("Similarity (0–1)")
    plt.show()