In [1]:
# import
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Dash framework and components
import dash
from dash import html, dcc, Input, Output, State, dash_table
import dash_bootstrap_components as dbc
from dash.dash_table import DataTable
import base64
import re

# fuzzy matching of book titles
from rapidfuzz import fuzz, process


# App test preparation

## Load the data 

In [2]:
# Load the cleaned book-ratings data set from a comma-separated, UTF-8 encoded CSV file.
# NOTE(review): the original note calls this "dataset_lowercase", so titles are presumably already lower-cased - confirm.
df_raw = pd.read_csv('Cleaned_Book_ETL.csv', encoding='UTF-8', sep=',')


In [3]:
# Convert data types

# Target dtype for every column of the loaded frame.
column_types = {
    'User-ID': 'int64',
    'ISBN': 'string',
    'Book-Rating': 'int64',
    'Book-Title': 'string',
    'Book-Author': 'string',
    'Year-Of-Publication': 'int64',  # or 'string' if the values turn out to be mixed
    'Publisher': 'string',
    'Image-URL-S': 'string',
    'Image-URL-M': 'string',
    'Image-URL-L': 'string'
}

# Cast all listed columns in one call (a missing column raises KeyError, as before).
df_raw = df_raw.astype(column_types)

In [4]:
# Drop the columns?

# columns_to_drop = ['Image-URL-S', 'Image-URL-M']
# redundant columns
# df_raw = df_raw.drop(columns=columns_to_drop)
# df_raw

In [5]:
# Notes:
# Filter *: users who rated at least 2 books - unlike the correlation approach, this is not important for the ML approach
# Default Filter 0: users who rated at least 10 books = 255000 records
# Frontend Filter 1: determinative readers

## Filtering before pivot

In [6]:
# FILTER the data before pivoting - the original data set is too big to pivot (40 s).

# 1) Default Filter 0: keep only users with MORE than `filter_0_number` ratings (strict >).
# Tune the threshold here.
filter_0_number = 5

filter_df = df_raw.copy()

# Number of ratings per user (a Series indexed by User-ID) ...
ratings_per_user = filter_df.groupby('User-ID')['Book-Rating'].count()
# ... reduced to the users above the threshold; its index is what we filter with.
book_rating_counts = ratings_per_user[ratings_per_user > filter_0_number]
kept_user_ids = book_rating_counts.index

# Base frame with Default Filter 0 applied: every rating row of the kept users.
df = filter_df.loc[filter_df['User-ID'].isin(kept_user_ids)]

print(len(kept_user_ids), f'autorů, kteří jsou na indexu hodnocení > {filter_0_number}')

10784 autorů, kteří jsou na indexu hodnocení > 5


In [7]:
# 2) Frontend Filter 1: determinative readers
# What can be SET by the USER - main logic of book_rec.py

# Raw search text; the surrounding whitespace is normalised below.
chosen_book = "  Sorcerer's  "

# Titles used for debugging (top ten)
# --------------------------------------
# To Kill a Mockingbird
# The Great Gatsby
# 1984
# The Catcher in the Rye
# Pride and Prejudice
# The Alchemist
# Harry Potter and the Sorcerer's Stone
# The Hobbit
# The Book Thief # no data in dataset
# Where the Crawdads Sing # no data in dataset

# Collapse every whitespace run into one space and trim both ends.
chosen_book = re.sub(r'\s+', ' ', chosen_book).strip()

# Literal (non-regex), case-insensitive substring search; missing titles never match.
title_contains_query = df['Book-Title'].str.contains(chosen_book, case=False, na=False, regex=False)

# unique() is needed, otherwise the same title is returned once per rating row.
complete_name_of_chosen_books = df.loc[title_contains_query, 'Book-Title'].unique()
list_of_complete_name_of_chosen_books = list(complete_name_of_chosen_books)

print(f'number of occurrences of the relative titles in the list: {len(list_of_complete_name_of_chosen_books)}')

number of occurrences of the relative titles in the list: 22


In [8]:
# Pick the single best fuzzy match; the result is a (match, score, index) tuple.
test_best_match = process.extractOne(chosen_book, list_of_complete_name_of_chosen_books)
best_title, best_score = test_best_match[:2]
print(best_title, type(test_best_match), round(best_score, 2), '%')

name_of_the_chosen_book = best_title

harry potter and the sorcerer's stone (harry potter (paperback)) <class 'tuple'> 81.0 %


In [9]:
# User-IDs of the readers (among users kept by Default Filter 0) who rated the chosen book.
is_chosen_title = df['Book-Title'] == name_of_the_chosen_book
readers_of_the_chosen_book = df.loc[is_chosen_title, 'User-ID']

# Every rating row that belongs to one of those readers.
rated_by_those_readers = df['User-ID'].isin(readers_of_the_chosen_book)
titles_of_the_readers_of_the_chosen_book = df.loc[rated_by_those_readers]

print(len(titles_of_the_readers_of_the_chosen_book), f'knih od všech autorů, kteří jsou na indexu hodnocení > {filter_0_number} a četli knihu')

24278 knih od všech autorů, kteří jsou na indexu hodnocení > 5 a četli knihu


## Pivot table

In [10]:
# Pivot table: one row per title, one column per user, cells hold the rating.
# pivot_table can aggregate duplicate (title, user) pairs and copes with NaN.
test_book_pivot = (
    titles_of_the_readers_of_the_chosen_book
    .pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
    .fillna(0)  # a title the user did not rate becomes 0
)
test_book_pivot

User-ID,1733,2110,3418,5207,6563,7072,7809,9747,9908,10560,...,267830,268262,269308,269566,270326,270820,271705,272786,274380,278843
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"earth prayers from around the world: 365 prayers, poems, and invocations for honoring the earth",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
final fantasy anthology: official strategy guide (brady games),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
garfield bigger and better (garfield (numbered paperback)),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!%@ (a nutshell handbook),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'a hell of a place to lose a cow': an american hitchhiking odyssey,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zukunftsmarkt business on demand. unternehmenserfolg durch stгѓ?г‚в¤ndige verfгѓ?г‚вјgbarkeit.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zwгѓ?г‚в¶lf.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
гѓ?г‚?ber das fernsehen.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Compressed-sparse-row version of the zero-filled pivot table; the neighbour model is fitted on it.
book_sparse = csr_matrix(test_book_pivot)

## NearestNeighbors

In [12]:
# Unsupervised learner for implementing neighbor searches.
# algorithm is one of {'auto', 'ball_tree', 'kd_tree', 'brute'}; the default 'auto' attempts to decide the most
# appropriate algorithm based on the values passed to fit. 'brute' is requested explicitly here.
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [21]:
# kneighbors returns two arrays:
# 1) distances: distances between the chosen book and its nearest neighbours
# 2) suggestions: row positions (in test_book_pivot) of those nearest neighbours

# Rating vector of the chosen book, kept 2-D by selecting with a list.
chosen_book_vector = test_book_pivot.loc[[name_of_the_chosen_book]].values

test_distances, test_suggestions = model.kneighbors(chosen_book_vector, n_neighbors=10)  # library default is n_neighbors=5
test_suggestions

array([[ 6082,  6073,  6079,  6075,  6076,  2210, 14086, 14114, 13609,
          630]], dtype=int64)

In [14]:
# print all the suggested books (one row of neighbour positions per queried book)
for neighbour_positions in test_suggestions:
  print(test_book_pivot.index[neighbour_positions].values)

<StringArray>
["harry potter and the sorcerer's stone (harry potter (paperback))",
                 'harry potter and the chamber of secrets (book 2)',
                'harry potter and the prisoner of azkaban (book 3)',
                     'harry potter and the goblet of fire (book 4)',
               'harry potter and the order of the phoenix (book 5)',
                                            "bridget jones's diary",
       'the fellowship of the ring (the lord of the rings, part 1)',
                                                         'the firm',
                                                       'the client',
                                                   'a time to kill']
Length: 10, dtype: string


### Test table of recommendations

In [15]:
# Turn the neighbour positions into titles: one DataTable record per suggested book.
test_list_of_suggested_books = [
    {"Suggested Books": book}
    for neighbour_positions in test_suggestions
    for book in test_book_pivot.index[neighbour_positions].tolist()  # plain list of strings
]

# Build the table
test_table = dash_table.DataTable(
    data=test_list_of_suggested_books,
    columns=[{"name": "Suggested Books", "id": "Suggested Books"}],
    style_table={"width": "100%"},
    style_cell={"textAlign": "left"},
)

test_table


DataTable(data=[{'Suggested Books': "harry potter and the sorcerer's stone (harry potter (paperback))"}, {'Suggested Books': 'harry potter and the chamber of secrets (book 2)'}, {'Suggested Books': 'harry potter and the prisoner of azkaban (book 3)'}, {'Suggested Books': 'harry potter and the goblet of fire (book 4)'}, {'Suggested Books': 'harry potter and the order of the phoenix (book 5)'}, {'Suggested Books': "bridget jones's diary"}, {'Suggested Books': 'the fellowship of the ring (the lord of the rings, part 1)'}, {'Suggested Books': 'the firm'}, {'Suggested Books': 'the client'}, {'Suggested Books': 'a time to kill'}], columns=[{'name': 'Suggested Books', 'id': 'Suggested Books'}], style_table={'width': '100%'}, style_cell={'textAlign': 'left'})

# Dash Application

In [16]:
# Loading a dash picture
def load_image(image_path):
    """Read an image file and return it as a base64 ``data:`` URI.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file.

    Returns
    -------
    str
        ``data:<mime>;base64,<payload>``. The MIME type is guessed from the
        file extension; an unknown or non-image extension falls back to
        ``image/jpeg``, the value that used to be hard-coded.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    import mimetypes  # local import: only needed here

    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, "rb") as f:
        encoded_image = base64.b64encode(f.read()).decode()
    return f"data:{mime_type};base64,{encoded_image}"

# Picture file to embed in the app (relative path).
image_path = 'StuttgartSelect.jpg'

# Encode it once up front as a base64 data URI.
encoded_image = load_image(image_path)

In [17]:
# Create the Dash application, styled with the Bootstrap theme (this cell does not start the server yet)

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Expose the underlying Flask server so that Gunicorn can serve the app
server = app.server

In [18]:
# APP layout
# Dash style dictionaries take camelCase CSS property names
# (justifyContent, alignItems, flexWrap), not the hyphenated CSS spelling.

# Shared flexbox centring used by most rows and containers below.
CENTERED_FLEX = {
    "display": "flex",
    "justifyContent": "center",
    "alignItems": "center",
}

app.layout = dbc.Container(
    children=[

        # 1 - title and header picture
        dbc.Row(
            dbc.Col([
                html.H1(
                    "Book recommendation",
                    style={**CENTERED_FLEX, "color": "grey"},  # text colour
                ),
                html.Img(
                    src=encoded_image,
                    className="img-fluid",
                    style={"objectFit": "cover"},
                ),
            ]),
            style={**CENTERED_FLEX, "height": "100px"},  # fixed height
        ),

        # 2 - search box for the chosen book + "same author" check box
        dbc.Row(
            dbc.Col(
                [
                    dcc.Input(
                        id="search_box",
                        type="text",
                        placeholder="Enter your favorite book to get recommendations for others you might like",
                        debounce=True,  # the callback fires only after Enter / losing focus
                        style={"width": "50%"},
                    ),
                    dcc.Checklist(
                        id="author_filter",
                        options=[{"label": "Filter by same author", "value": "filter_author"}],
                        value=[],
                        style={},
                    ),
                ],
                style={**CENTERED_FLEX, "height": "100px"},  # fixed height
            )
        ),

        # 3 - (reserved) further check boxes: author, year

        # 4 - container for the table of TOP recommended books and for error messages
        dbc.Row(
            dbc.Col([
                html.Div(id="output_table"),  # list of TOP recommendations
                html.Div(
                    id="error_output",
                    style={
                        "color": "red",  # red for error messages
                        "textAlign": "center",
                        "marginTop": "10px",
                    },
                ),
            ]),
            style={**CENTERED_FLEX, "flexWrap": "wrap"},
        ),

        # 5 - container for the cover images of the recommended books
        dbc.Row(
            dbc.Col(
                html.Div(id="image_container", style=dict(CENTERED_FLEX))
            ),
            style={**CENTERED_FLEX, "height": "100px"},  # fixed height
        ),
    ]
)

In [19]:
# Number of neighbours requested from the nearest-neighbour model.
# The searched book is filtered out of the suggestions afterwards (when it is among them),
# so e.g. requesting 6 neighbours yields 5 result titles.
number_of_neighbors = 10

# -------------------------- Table for book or error content --------------------------
def create_table(table_data, color_style, columns_style):
    """Build a horizontally centred DataTable for suggestions or for a message.

    Parameters
    ----------
    table_data : list of dict
        Records passed to the DataTable as ``data``.
    color_style : dict
        Extra style entries (e.g. the text colour) merged into the wrapper's
        style; they override the centring defaults on key clashes.
    columns_style : list of dict
        Column definitions passed to the DataTable as ``columns``.

    Returns
    -------
    html.Div
        A flex container holding the table.
    """
    # Centre the table explicitly; otherwise it is laid out on the left.
    wrapper_style = {
        "display": "flex",
        "justify-content": "center",
        "align-items": "center",
    }
    wrapper_style.update(color_style)

    table = dash_table.DataTable(
        data=table_data,
        columns=columns_style,
        style_table={"width": "50%"},
        style_cell={"textAlign": "left"},
        style_header={"textAlign": "center"},
    )
    return html.Div(table, style=wrapper_style)

# -------------------------- Book gallery content --------------------------

# Funkce pro vytvoření kontejneru na obrázky knih
def create_image_container(image_urls):
    """Build the gallery content for the recommended books.

    Parameters
    ----------
    image_urls : iterable of str or None
        Cover image URLs. ``None`` and any empty iterable (list, tuple,
        array, ...) are treated as "no images".

    Returns
    -------
    html.Div or list of html.Img
        A red "No images available." message when there is nothing to show,
        otherwise one ``html.Img`` per URL, in the order given.
    """
    urls = [] if image_urls is None else list(image_urls)

    # Nothing to show: return a Div with a message instead of an empty gallery.
    if not urls:
        return html.Div("No images available.", style={"textAlign": "center", "color": "red", "marginTop": "10px"})

    # Dash style dictionaries use camelCase CSS property names.
    image_style = {
        "width": "100px",
        "height": "150px",
        "margin": "10px",
        "display": "inline-flex",
        "justifyContent": "center",
        "alignItems": "center",
    }

    # One <img> element per URL; each gets its own copy of the style dict.
    return [html.Img(src=url, style=dict(image_style)) for url in urls]


# -------------------------- Callback (recomendation) --------------------------
def _message_table(message):
    """Wrap a single status/error message in the red one-row suggestions table."""
    return create_table(
        [{"Suggested Books:": message}],
        {"color": "red"},
        [{"name": "Suggested Books:", "id": "Suggested Books:"}],
    )


def _cover_urls(titles):
    """Return one medium-size cover URL per title, in the order of ``titles``.

    A title has one row per rating, so its rows are collapsed to a single URL
    with max(). Titles without a URL are skipped.
    """
    url_per_title = (
        df_raw.loc[df_raw['Book-Title'].isin(titles), ['Book-Title', 'Image-URL-M']]
        .groupby('Book-Title')['Image-URL-M']
        .max()
    )
    # groupby sorts alphabetically; reindex restores the order of the table.
    return url_per_title.reindex(titles).dropna().tolist()


# -------------------------- Callback (recommendation) --------------------------
@app.callback(
    [Output("output_table", "children"),  # output table
     Output("image_container", "children")],  # gallery
    [Input("search_box", "value"),  # book name typed by the user
     State("author_filter", "value")]  # check box; read on search, does not trigger the callback itself
)
def update_recommendations(chosen_book, filter_author):
    """Recommend books similar to the one typed into the search box.

    Parameters
    ----------
    chosen_book : str or None
        Free-text (partial) title from the search box.
    filter_author : list
        Value of the "Filter by same author" checklist; non-empty means the
        candidates are restricted to books by the author of the matched title.

    Returns
    -------
    list
        ``[table, gallery]`` for the ``output_table`` and ``image_container``
        children. ``[None, None]`` when nothing was entered; a red message
        table (and no gallery) when the title is unknown or there is not
        enough data.
    """
    # 1. Nothing entered (or whitespace only): initial state, no error message.
    if not chosen_book or not chosen_book.strip():
        return [None, None]

    # Normalise whitespace the same way as in the notebook test cells.
    chosen_book = re.sub(r'\s+', ' ', chosen_book).strip()

    # Candidate titles containing THIS query (literal, case-insensitive match).
    title_contains_query = df_raw['Book-Title'].str.contains(chosen_book, case=False, na=False, regex=False)
    candidate_titles = list(df_raw.loc[title_contains_query, 'Book-Title'].unique())

    # 2. Best fuzzy match among the candidates; extractOne yields None when nothing matches.
    best_match = process.extractOne(chosen_book, candidate_titles) if candidate_titles else None
    if best_match is None:
        return [_message_table("Book name is not in database."), None]  # 2. error message
    name_of_the_book = best_match[0]  # (match, score, index) -> match

    # 3. Select the ratings the model is built from.
    if filter_author:
        # a) same author only: author of the matched title (exact title match)
        author_of_the_chosen_book = df_raw.loc[df_raw['Book-Title'] == name_of_the_book, 'Book-Author'].iloc[0]
        books_of_other_readers = df_raw.loc[df_raw['Book-Author'] == author_of_the_chosen_book]
    else:
        # b) everything rated by the readers of the matched title
        readers_of_the_book = df_raw.loc[df_raw['Book-Title'] == name_of_the_book, 'User-ID']
        books_of_other_readers = df_raw.loc[df_raw['User-ID'].isin(readers_of_the_book)]

    # Pivot table: titles x users, unrated cells filled with 0.
    book_pivot = books_of_other_readers.pivot_table(
        columns='User-ID', index='Book-Title', values='Book-Rating'
    ).fillna(0)

    # kneighbors cannot return more neighbours than there are titles.
    if book_pivot.empty or len(book_pivot) < number_of_neighbors:
        return [_message_table("Not enough data for model to suggest a book"), None]  # 3. error message

    # Nearest-neighbour search on the sparse rating matrix.
    model = NearestNeighbors(algorithm='brute')
    model.fit(csr_matrix(book_pivot))
    _distances, suggestions = model.kneighbors(
        book_pivot.loc[[name_of_the_book]].values, n_neighbors=number_of_neighbors
    )

    # Row positions -> titles, without the searched book itself.
    list_of_titles = [
        title
        for title in book_pivot.index[suggestions.ravel()].tolist()
        if title != name_of_the_book
    ]

    # 4. Only the searched book itself came back.
    if not list_of_titles:
        return [
            _message_table(f"Only available result is the {name_of_the_book}"),
            create_image_container(_cover_urls([name_of_the_book])),
        ]  # 4. error message

    # Regular result: table of suggestions plus their covers in the same order.
    table = create_table(
        [{"Suggested Books:": title} for title in list_of_titles],
        {"color": "black"},
        [{"name": "Suggested Books:", "id": "Suggested Books:"}],
    )
    return [table, create_image_container(_cover_urls(list_of_titles))]

# ----------------- Dash server ----------------- 

# Start the Dash server on localhost:8060 when this code runs as the main module.
# NOTE(review): debug=True is presumably meant for local development only - confirm before deploying.
# NOTE(review): newer Dash releases prefer app.run(...) over app.run_server(...) - check against the installed version.
if __name__ == "__main__":
    app.run_server(host="127.0.0.1", port=8060, debug=True)