# In [None]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# =========================
# CONFIG
# =========================
# Path of the JSON dataset that run_flexible_recommender passes to load_data.
DATA_PATH = 'data/BD.json'
# Default number of rows paginate_results prints per page.
PAGE_SIZE = 10
# Weights of the "Similar Plot & Style" final score:
#   final_score = SIM_WEIGHT * cosine similarity + RATING_WEIGHT * Ratings_Norm
SIM_WEIGHT = 0.7
RATING_WEIGHT = 0.3

# =========================
# 1. LOAD AND PREP DATA
# =========================
def load_data(file_path):
    """Load the book dataset from a JSON file and derive the search columns.

    The parsed JSON is handed to ``pd.DataFrame``. Column names are stripped
    of surrounding whitespace, the text columns are cleaned, duplicate
    ``(Title, Author)`` rows are dropped (first occurrence wins) and the
    following columns are set:

    * ``Ratings``      -- numeric rating; missing or unparseable values
      become 0.
    * ``Ratings_Norm`` -- ``Ratings`` min-max scaled to ``[0, 1]``; all zeros
      when every rating is identical or the frame is empty.
    * ``Genre_List``   -- lower-cased genre names split on commas, blank
      fragments removed.
    * ``Title_LC`` / ``Author_LC`` -- lower-cased copies used for matching.

    A text column (``Title``, ``Author``, ``Genre``, ``Description``) that is
    absent from the file is created as empty strings, and an absent
    ``Ratings`` column is created as zeros, so the derived columns can
    always be built.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The prepared ``pandas.DataFrame``, or ``None`` when ``file_path``
        does not exist.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None

    df = pd.DataFrame(data)

    # A plain list comprehension also works when the frame has no columns
    # (empty dataset) or non-string labels, where the .str accessor fails.
    df.columns = [str(col).strip() for col in df.columns]

    for col in ('Title', 'Author', 'Genre', 'Description'):
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str).str.strip()
        else:
            df[col] = ''

    df = df.drop_duplicates(subset=['Title', 'Author']).reset_index(drop=True)

    if 'Ratings' in df.columns:
        df['Ratings'] = pd.to_numeric(df['Ratings'], errors='coerce').fillna(0)
    else:
        df['Ratings'] = 0.0

    # Min-max scaling. A zero spread (constant ratings) or an empty frame
    # maps to 0.0 instead of dividing by zero.
    ratings = df['Ratings']
    spread = ratings.max() - ratings.min() if len(ratings) else 0
    if spread > 0:
        df['Ratings_Norm'] = (ratings - ratings.min()) / spread
    else:
        df['Ratings_Norm'] = 0.0

    # Skip blank fragments ("a, , b") so '' never counts as a shared genre.
    df['Genre_List'] = df['Genre'].apply(
        lambda text: [g.strip().lower() for g in text.split(',') if g.strip()]
    )

    df['Title_LC'] = df['Title'].str.lower()
    df['Author_LC'] = df['Author'].str.lower()

    return df

# =========================
# PAGINATION
# =========================
def paginate_results(df, display_cols, page_size=PAGE_SIZE):
    """Print ``df`` one page at a time, driven by commands read from stdin.

    After each page the user is prompted for ``n`` (next page), ``p``
    (previous page) or ``q`` (quit). Commands are case-insensitive and
    surrounding whitespace is ignored. End of input (closed stdin) is
    treated like ``q``.

    Args:
        df: DataFrame holding the rows to show, already in display order.
        display_cols: Column labels to print for each row.
        page_size: Number of rows per page; must be at least 1.

    Returns:
        None. All output goes to stdout.

    Raises:
        ValueError: If ``df`` is non-empty and ``page_size`` is below 1.
    """
    total = len(df)
    if total == 0:
        print("No results found.")
        return
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")

    page = 0
    max_page = (total - 1) // page_size

    while True:
        start = page * page_size
        end = min(start + page_size, total)

        print(f"\nShowing results {start + 1}-{end} of {total}")
        print(df.iloc[start:end][display_cols].to_string(index=False))

        try:
            cmd = input("\n[n] next | [p] previous | [q] quit: ").strip().lower()
        except EOFError:
            # No more input available: leave the pager instead of crashing.
            break

        if cmd == 'q':
            break
        if cmd == 'n':
            if page < max_page:
                page += 1
            else:
                # A valid command that cannot move is not an "invalid option".
                print("Already on the last page.")
        elif cmd == 'p':
            if page > 0:
                page -= 1
            else:
                print("Already on the first page.")
        else:
            print("Invalid option.")

# =========================
# 2. SEARCH & RECOMMENDER
# =========================
def run_flexible_recommender():
    """Run the interactive book search and recommendation session.

    Loads the dataset from ``DATA_PATH``, builds a TF-IDF matrix over a text
    "soup" of description (repeated twice), genre and author, then loops:

    1. Read a query and list every title, author and genre containing it
       (plain substring match, case-insensitive).
    2. Let the user pick one match by number.
    3. Genre / author picks list the matching books by rating. A book pick
       offers three recommendation modes: shared genres, same author, or
       TF-IDF cosine similarity blended with the normalized rating.

    Results are shown through ``paginate_results``. Typing ``exit`` at the
    search prompt ends the session.

    Returns:
        None. All interaction happens through stdin/stdout.
    """
    df = load_data(DATA_PATH)
    if df is None:
        print("Dataset not found.")
        return
    if df.empty:
        print("Dataset is empty.")
        return

    print("--- Universal Book Search & Recommender ---")

    # TF-IDF PRECOMPUTE
    # The description is repeated so it outweighs genre and author terms.
    df['soup'] = (
        (df['Description'] + " ") * 2 +
        df['Genre'] + " " +
        df['Author']
    ).str.lower()

    tfidf = TfidfVectorizer(stop_words='english', max_df=0.85)
    try:
        tfidf_matrix = tfidf.fit_transform(df['soup'])
    except ValueError:
        # Raised by the vectorizer when no usable vocabulary remains;
        # searching still works, only the plot-similarity mode is disabled.
        tfidf_matrix = None

    # Built once (it never changes during the session) and sorted so the
    # match menu has a stable order between runs.
    all_genres = sorted({g for genres in df['Genre_List'] for g in genres})

    while True:
        query = input("\nSearch Title / Author / Genre (or 'exit'): ").strip().lower()
        if query == 'exit':
            break
        if not query:
            continue

        # regex=False: the query is literal text, so input such as "c++" or
        # "(" must not be compiled as a regular expression.
        title_hits = df['Title_LC'].str.contains(query, regex=False)
        author_hits = df['Author_LC'].str.contains(query, regex=False)
        title_matches = df.loc[title_hits, 'Title'].unique()
        author_matches = df.loc[author_hits, 'Author'].unique()
        genre_matches = [g for g in all_genres if query in g]

        options = (
            [('Book', t) for t in title_matches]
            + [('Author', a) for a in author_matches]
            + [('Genre', g) for g in genre_matches]
        )

        if not options:
            print("No matches found.")
            continue

        print("\nMatches:")
        for number, (kind, value) in enumerate(options, start=1):
            print(f"{number}. [{kind}] {value}")

        sel = input("Select number or 0 to retry: ").strip()
        # isdecimal (not isdigit) so characters like '²' that int() rejects
        # are treated as a retry instead of raising.
        if not sel.isdecimal():
            continue
        choice = int(sel)
        if not 1 <= choice <= len(options):
            continue

        kind, selection = options[choice - 1]

        # =========================
        # GENRE SEARCH
        # =========================
        if kind == 'Genre':
            selection = selection.lower()
            results = (
                df[df['Genre_List'].apply(lambda genres: selection in genres)]
                .sort_values('Ratings', ascending=False)
            )

            print(f"\nBooks in genre '{selection}':")
            paginate_results(results, ['Title', 'Author', 'Ratings'])

        # =========================
        # AUTHOR SEARCH
        # =========================
        elif kind == 'Author':
            results = (
                df[df['Author_LC'] == selection.lower()]
                .sort_values('Ratings', ascending=False)
            )

            print(f"\nBooks by {selection}:")
            paginate_results(results, ['Title', 'Genre', 'Ratings'])

        # =========================
        # BOOK-BASED RECOMMENDATION
        # =========================
        elif kind == 'Book':
            # When several rows share this title, the first one is used.
            book_idx = df[df['Title'] == selection].index[0]
            book_genres = set(df.loc[book_idx, 'Genre_List'])
            not_selected = df.index != book_idx

            # Informational only: the scoring weights are not changed.
            if df.loc[book_idx, 'Description'].strip() == '':
                print("Note: Limited description. Using genre & ratings more heavily.")

            print(f"\nSelected: {selection}")
            print("1. Similar Genre (multi-genre aware)")
            print("2. Same Author")
            print("3. Similar Plot & Style")

            tier = input("Choose (1-3): ").strip()

            # ---- GENRE SIMILAR ----
            if tier == '1':
                # Rank by number of shared genres, then by rating. The count
                # lives on a copy so the shared frame is not modified.
                overlap = df['Genre_List'].apply(
                    lambda genres: len(book_genres.intersection(genres))
                )
                ranked = df.assign(genre_overlap=overlap)
                results = (
                    ranked[(overlap > 0) & not_selected]
                    .sort_values(['genre_overlap', 'Ratings'], ascending=False)
                )

                paginate_results(results, ['Title', 'Author', 'Ratings'])

            # ---- SAME AUTHOR ----
            elif tier == '2':
                same_author = df['Author_LC'] == df.loc[book_idx, 'Author_LC']
                results = (
                    df[same_author & not_selected]
                    .sort_values('Ratings', ascending=False)
                )

                paginate_results(results, ['Title', 'Genre', 'Ratings'])

            # ---- PLOT & STYLE ----
            elif tier == '3':
                if tfidf_matrix is None:
                    print("Plot & style similarity is unavailable for this dataset.")
                    continue

                # book_idx doubles as the matrix row because the frame index
                # is still the default 0..n-1 range from load_data.
                similarity = cosine_similarity(
                    tfidf_matrix[book_idx], tfidf_matrix
                ).flatten()
                scored = df.assign(
                    similarity=similarity,
                    final_score=(
                        SIM_WEIGHT * similarity +
                        RATING_WEIGHT * df['Ratings_Norm']
                    ),
                )

                # Keep only the top 10% most similar rows, then rank them by
                # the blended score.
                threshold = scored['similarity'].quantile(0.90)
                results = (
                    scored[not_selected & (scored['similarity'] >= threshold)]
                    .sort_values('final_score', ascending=False)
                )

                paginate_results(results, ['Title', 'Author', 'Ratings'])

            else:
                print("Invalid choice.")

# =========================
# RUN
# =========================
# Start the interactive session only when executed as a script or notebook
# cell, so importing this module (e.g. to reuse load_data) has no side effects.
if __name__ == "__main__":
    run_flexible_recommender()