In [1]:
# Imports
import base64
import mimetypes
import re

import numpy as np
import pandas as pd
from colorama import Fore, Style
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Dash framework and components
import dash
import dash_bootstrap_components as dbc
from dash import html, dcc, Input, Output, State, dash_table
from dash.dash_table import DataTable


In [2]:
# Load the cleaned ratings dataset from CSV into `df`.
# NOTE(review): the original note calls this "dataset_lowercase" — presumably text
# columns were lower-cased by the upstream ETL step; confirm against that step.
df = pd.read_csv('Cleaned_Book_ETL.csv', encoding='UTF-8', sep=',')


In [3]:
# Convert data types

# Columns grouped by their target dtype.
integer_columns = ['User-ID', 'Book-Rating', 'Year-Of-Publication']  # Year could be 'string' if values are mixed
text_columns = [
    'ISBN',
    'Book-Title',
    'Book-Author',
    'Publisher',
    'Image-URL-S',
    'Image-URL-M',
    'Image-URL-L',
]

# Column name -> target dtype mapping
column_types = {
    **dict.fromkeys(integer_columns, 'int64'),
    **dict.fromkeys(text_columns, 'string'),
}

# Cast all listed columns in a single call
df = df.astype(column_types)

In [4]:
# VYNECHAT

# columns_to_drop = ['Image-URL-S', 'Image-URL-M']

# redundant columns
# df = df.drop(columns=columns_to_drop)
# df

In [5]:
# POZNÁMKY:
# Filter *: uživatelé, kteří hodnotili alespoň 2 knihy - není potřeba pro ML model důležitá


# Default Filter 0: uživatelé, kteří hodnotili alespoň 10 knih = 255000 záznamů
# Frontend Filter 1: determinative readers - uživatelé, kteří hodnotili knihu "The Fellowship of the Ring (The Lord of the Rings, Part 1)" od autora Tolkien = 144 záznamů příliš restriktivní
# 

In [6]:
# FILTER: shrink the data before pivoting — the original data set is too large to pivot (~40 s).

# Default filter 0: keep only users with enough ratings.
# NOTE(review): the original note says "at least 10 books", but the condition below
# keeps users with strictly MORE than 10 ratings — confirm which one is intended.
filter_df = df.copy()
book_counts = (
    filter_df.groupby('User-ID')['Book-Rating']
    .count()
    .loc[lambda ratings_per_user: ratings_per_user > 10]
)

is_active_user = filter_df['User-ID'].isin(book_counts.index)
df = filter_df.loc[is_active_user]


In [7]:
# VYNECHAT
# Check for duplicates - one more time
# print(df.duplicated().sum())  # Počet duplicitních řádků

In [8]:
# SET by USER - core logic of the original .py file
# Collapse every run of whitespace in the query into a single space.
chosen_book = re.sub(r'\s+', ' ', 'the fellowship  ')

# First title that contains the query as a literal, case-insensitive substring.
title_matches = df['Book-Title'].str.contains(chosen_book, case=False, na=False, regex=False)
complete_name_of_chosen_book = df.loc[title_matches, 'Book-Title'].values[0]

# Author of that title, then every row (whole frame) for books by the same author.
is_chosen_title = df['Book-Title'] == complete_name_of_chosen_book
chosen_book_author = df.loc[is_chosen_title, 'Book-Author'].values[0]
books_of_chosen_readers = df.loc[df['Book-Author'] == chosen_book_author]

In [9]:
# Pivot table: one row per title, one column per user, cell = rating.
# pivot_table aggregates duplicate (title, user) pairs and tolerates NaN;
# cells with no rating are then filled with 0.
book_pivot = (
    books_of_chosen_readers
    .pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
    .fillna(0)
)
book_pivot

User-ID,254,643,882,1674,7864,11676,11944,12272,14079,14422,...,254206,254859,259901,260419,262391,265313,268030,274061,276487,276847
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a tolkien miscellany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arbol y hoja - tapa dura -,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
basil bunting and j.r.r. tolkien (author speaks),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"beowulf and the critics (medieval &amp; renaissance texts &amp; studies, vol. 248)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beowulf: the monster and the critics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
the two towers (lord of the rings (hardcover)),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the two towers (lord of the rings (paperback)),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the two towers (lord of the rings part 2),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"the two towers (the lord of the rings, part 2)",0.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,...,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Convert the title x user pivot table into a SciPy CSR sparse matrix for the model.
book_sparse = csr_matrix(book_pivot)

In [11]:
# Unsupervised learner for implementing neighbor searches.
# `algorithm` accepts 'auto', 'ball_tree', 'kd_tree' or 'brute'; the default 'auto' tries to
# pick the most appropriate algorithm from the data passed to fit(). Brute force is requested here.
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [12]:
# The model returns two values:
# 1) distances: distances between the chosen book and its nearest neighbours
# 2) suggestions: row indices (into book_pivot) of the books nearest to the chosen book

# kneighbors expects a 2-D array. Selecting with a list, `.loc[[title]]`, keeps the result
# as a one-row DataFrame, so `.values` already has shape (1, n_users).
distances, suggestions = model.kneighbors(book_pivot.loc[[complete_name_of_chosen_book]].values, n_neighbors=10) # the default would be n_neighbors=5
suggestions

array([[64, 87, 95, 73, 27,  3, 32, 33,  4, 34]], dtype=int64)

In [13]:
# Print the suggested titles, one list per query row
for neighbor_indices in suggestions:
    print(list(book_pivot.index[neighbor_indices]))

['the fellowship of the ring (the lord of the rings, part 1)', 'the return of the king (the lord of the rings, part 3)', 'the two towers (the lord of the rings, part 2)', 'the hobbit: or, there and back again', 'farmer giles of ham: aegidii ahenobarbi julii agricole de hammo domini de domito aule draconarie comitis regni minimi regis et basilei mira facinora', 'beowulf and the critics (medieval &amp; renaissance texts &amp; studies, vol. 248)', 'hobbit lord of the rings (coles notes)', 'j.r.r. tolkien of the darkening of valinor and of the flight of the noldor from the silmarillion read by christopher tolkien (cdl 51579)', 'beowulf: the monster and the critics', "j.r.r. tolkien's the fellowship of the ring"]


In [14]:
# Flatten the neighbour indices into DataTable rows: one dict per suggested title
list_of_suggested_books = [
    {"Suggested Books": title}
    for neighbor_indices in suggestions
    for title in book_pivot.index[neighbor_indices].tolist()
]

# Build the single-column table
table = dash_table.DataTable(
    columns=[{"name": "Suggested Books", "id": "Suggested Books"}],
    data=list_of_suggested_books,
    style_cell={"textAlign": "left"},
    style_table={"width": "100%"},
)

table


DataTable(data=[{'Suggested Books': 'the fellowship of the ring (the lord of the rings, part 1)'}, {'Suggested Books': 'the return of the king (the lord of the rings, part 3)'}, {'Suggested Books': 'the two towers (the lord of the rings, part 2)'}, {'Suggested Books': 'the hobbit: or, there and back again'}, {'Suggested Books': 'farmer giles of ham: aegidii ahenobarbi julii agricole de hammo domini de domito aule draconarie comitis regni minimi regis et basilei mira facinora'}, {'Suggested Books': 'beowulf and the critics (medieval &amp; renaissance texts &amp; studies, vol. 248)'}, {'Suggested Books': 'hobbit lord of the rings (coles notes)'}, {'Suggested Books': 'j.r.r. tolkien of the darkening of valinor and of the flight of the noldor from the silmarillion read by christopher tolkien (cdl 51579)'}, {'Suggested Books': 'beowulf: the monster and the critics'}, {'Suggested Books': "j.r.r. tolkien's the fellowship of the ring"}], columns=[{'name': 'Suggested Books', 'id': 'Suggested Bo

In [15]:
# Loading an image for the Dash page
def load_image(image_path):
    """Read an image file and return it as a base64 ``data:`` URI.

    Parameters:
    - image_path (str): Path to the image file on disk.

    Returns:
    - str: ``data:<mime>;base64,<payload>``. The MIME type is guessed from the file
      extension; when the extension is unknown or not an image type, ``image/jpeg``
      is used, which is what this function always emitted before.

    Raises:
    - OSError: If the file cannot be opened or read.
    """
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"  # backward-compatible fallback

    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode()
    return f"data:{mime_type};base64,{encoded_image}"

# Banner image file; the path is relative to the working directory.
image_path = 'StuttgartSelect.jpg'

# Base64 data URI of the banner, used as the `src` of an html.Img in the layout.
encoded_image = load_image(image_path)

In [16]:
# Create the Dash application with the Bootstrap theme stylesheet
# (the server itself is started in the last cell).

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Expose the underlying Flask server so Gunicorn can serve the app
server = app.server

In [17]:
# APP layout: title + banner, search controls, results table, cover gallery.
# Style dictionaries use camelCased CSS property names, as Dash components expect.
app.layout = dbc.Container(
    children=[
        # 1 - title and banner image
        dbc.Row(
            dbc.Col([
                html.H1(
                    "Book recommendation",
                    style={
                        "display": "flex",
                        "justifyContent": "center",
                        "alignItems": "center",
                        "color": "grey",  # text colour
                    },
                ),
                html.Img(
                    src=encoded_image,
                    className="img-fluid",
                    style={"objectFit": "cover"},
                ),
            ]),
            style={
                "display": "flex",
                "justifyContent": "center",
                "alignItems": "center",
                "height": "100px",  # fixed row height
            },
        ),

        # 2 - search box for the chosen book title + "same author" checkbox
        dbc.Row(
            dbc.Col(
                [
                    dcc.Input(
                        id="search_box",
                        type="text",
                        placeholder="Enter your favorite book to get recommendations for others you might like",
                        debounce=True,  # callback fires only after Enter / losing focus
                        style={"width": "50%"},
                    ),
                    dcc.Checklist(
                        id="author_filter",
                        options=[{"label": "Filter by same author", "value": "filter_author"}],
                        value=[],
                        style={},
                    ),
                ],
                # Note: in this row the style is attached to the column, not the row.
                style={
                    "display": "flex",
                    "justifyContent": "center",
                    "alignItems": "center",
                    "height": "100px",  # fixed height
                },
            )
        ),

        # 3 - (reserved) additional check boxes - author, year

        # 4 - container for the table of top recommendations and for error messages
        dbc.Row(
            dbc.Col([
                html.Div(
                    id="output_table",  # filled by the recommendation callback
                ),
                html.Div(
                    id="error_output",
                    style={
                        "color": "red",  # red for error messages
                        "textAlign": "center",
                        "marginTop": "10px",
                    },
                ),
            ]),
            style={
                "display": "flex",
                "flexWrap": "wrap",
                "justifyContent": "center",
                "alignItems": "center",
            },
        ),

        # 5 - container for the cover images of the recommended books
        dbc.Row(
            dbc.Col(
                html.Div(
                    id="image_container",
                    style={
                        "display": "flex",
                        "justifyContent": "center",
                        "alignItems": "center",
                    },
                )
            ),
            style={
                "display": "flex",
                "justifyContent": "center",
                "alignItems": "center",
                "height": "100px",  # fixed row height
            },
        ),
    ]
)

In [None]:
# Number of neighbours requested from the nearest-neighbour model.
# The callback removes the queried book itself from the result, so requesting 6
# yields 5 suggested titles.
number_of_neighbors = 6 # 6 requested = 5 result titles

# Table for book suggestions or for an error/status message
def create_table(table_data, color_style, columns_style):
    """Build a Dash ``DataTable`` wrapped in a centring flex ``html.Div``.

    Parameters:
    - table_data (list of dict): Rows, passed to ``DataTable(data=...)``.
    - color_style (dict): Extra CSS properties merged into the wrapper ``Div`` style
      (callers pass e.g. ``{"color": "red"}``); they override the defaults below.
    - columns_style (list of dict): Column definitions, passed to ``DataTable(columns=...)``.

    Returns:
    - html.Div: The wrapper containing the table.
    """
    table = dash_table.DataTable(
        data=table_data,
        columns=columns_style,
        style_table={"width": "50%"},
        style_cell={"textAlign": "left"},
        style_header={"textAlign": "center"},
    )

    # Centre explicitly; without this the table ends up pushed to the left.
    # Dash style dictionaries use camelCased CSS property names.
    wrapper_style = {
        "display": "flex",
        "justifyContent": "center",
        "alignItems": "center",
        **color_style,
    }
    return html.Div(table, style=wrapper_style)

# Book gallery content

# Builds the children of the cover-image container
def create_image_container(image_urls):
    """
    Create the HTML elements that display book cover images.

    Parameters:
    - image_urls (list of str): Image URLs. Entries that are not non-empty strings
      (e.g. missing values) are skipped; ``None`` is treated as an empty list.

    Returns:
    - list of html.Img: One image element per usable URL, in input order; or
    - html.Div: A red "No images available." message when there is no usable URL.
    """
    # Keep only usable URLs so a missing value never ends up as an <img src>.
    candidates = image_urls if image_urls is not None else []
    usable_urls = [url for url in candidates if isinstance(url, str) and url]

    if not usable_urls:
        return html.Div(
            "No images available.",
            style={"textAlign": "center", "color": "red", "marginTop": "10px"},
        )

    # Dash style dictionaries use camelCased CSS property names.
    cover_style = {
        "width": "100px",
        "height": "150px",
        "margin": "10px",
        "display": "inline-flex",
        "justifyContent": "center",
        "alignItems": "center",
    }
    return [html.Img(src=url, style=dict(cover_style)) for url in usable_urls]


# ----------------- Callback 1: recommendation -----------------
SUGGESTION_COLUMN = "Suggested Books:"  # column id/name shared by result and message tables


def _message_table(message):
    """Wrap one red status/error message in the same table layout used for results."""
    return create_table(
        [{SUGGESTION_COLUMN: message}],
        {"color": "red"},
        [{"name": SUGGESTION_COLUMN, "id": SUGGESTION_COLUMN}],
    )


def _cover_urls(titles):
    """Return one 'Image-URL-M' value per title, in the order of ``titles``; titles without a URL are skipped."""
    urls_by_title = (
        df.loc[df['Book-Title'].isin(titles), ['Book-Title', 'Image-URL-M']]
        .groupby('Book-Title')['Image-URL-M']
        .max()
    )
    # groupby sorts alphabetically by title; reindex restores the caller's order
    # so the gallery lines up with the table.
    return urls_by_title.reindex(titles).dropna().tolist()


@app.callback(
    [Output("output_table", "children"),      # results / message table
     Output("image_container", "children")],  # cover gallery
    [Input("search_box", "value")],           # book title typed by the user
    [State("author_filter", "value")],        # "same author" check box (does not trigger the callback)
)
def update_recommendations(chosen_book, filter_author):
    """Recommend books similar to the one typed into the search box.

    Parameters:
    - chosen_book (str or None): Text from the search box; matched as a literal,
      case-insensitive substring of 'Book-Title'. The first matching title is used.
    - filter_author (list): Check-box value; when non-empty, the neighbour search is
      restricted to books by the matched book's author, otherwise it uses all books
      rated by readers of the matched book.

    Returns:
    - list: ``[table, gallery]`` for the 'output_table' and 'image_container'
      children. Both are ``None`` for empty input; the gallery is ``None`` when
      only an error message is shown.
    """
    if not chosen_book:
        return [None, None]

    # Collapse runs of whitespace and trim the ends; whitespace-only input counts as empty.
    chosen_book = re.sub(r'\s+', ' ', chosen_book).strip()
    if not chosen_book:
        return [None, None]

    matching_titles = df.loc[
        df['Book-Title'].str.contains(chosen_book, case=False, na=False, regex=False),
        'Book-Title',
    ]
    if matching_titles.empty:
        return [_message_table("Book name is not in database."), None]

    complete_name_of_the_book = matching_titles.values[0]
    is_chosen_title = df['Book-Title'] == complete_name_of_the_book

    if filter_author:
        # Same-author mode: every rating of any book by the chosen book's author.
        chosen_book_author = df.loc[is_chosen_title, 'Book-Author'].values[0]
        books_of_chosen_readers = df.loc[df['Book-Author'] == chosen_book_author]
    else:
        # Reader mode: every rating made by users who rated the chosen book.
        readers_of_the_book = df.loc[is_chosen_title, 'User-ID']
        books_of_chosen_readers = df.loc[df['User-ID'].isin(readers_of_the_book)]

    # Title x user rating matrix; unrated cells become 0.
    book_pivot = books_of_chosen_readers.pivot_table(
        columns='User-ID', index='Book-Title', values='Book-Rating'
    ).fillna(0)

    # kneighbors needs at least `number_of_neighbors` rows to search in.
    if book_pivot.empty or len(book_pivot) < number_of_neighbors:
        return [_message_table("Not enough data for model to suggest a book"), None]

    model = NearestNeighbors(algorithm='brute')
    model.fit(csr_matrix(book_pivot))

    # Only the neighbour indices are needed; the distances are discarded.
    _, suggestions = model.kneighbors(
        book_pivot.loc[[complete_name_of_the_book]].values,
        n_neighbors=number_of_neighbors,
    )

    # Drop the queried book and cap the list so exactly the intended number of
    # titles is shown even if the queried book is not among its own neighbours.
    list_of_titles = [
        title
        for title in book_pivot.index[suggestions.ravel()].tolist()
        if title != complete_name_of_the_book
    ][:number_of_neighbors - 1]

    # Nothing left besides the queried book: show it as the only result.
    if not list_of_titles:
        return [
            _message_table(f"Only available result is the {complete_name_of_the_book}"),
            create_image_container(_cover_urls([complete_name_of_the_book])),
        ]

    return [
        create_table(
            [{SUGGESTION_COLUMN: title} for title in list_of_titles],
            {"color": "black"},
            [{"name": SUGGESTION_COLUMN, "id": SUGGESTION_COLUMN}],
        ),
        create_image_container(_cover_urls(list_of_titles)),
    ]

# ----------------- Callback 2 gallery ----------------- 


# Start the Dash development server on localhost:8060 with debug mode enabled.
# NOTE(review): newer Dash releases prefer `app.run(...)` over `app.run_server(...)` —
# confirm against the installed Dash version.
if __name__ == "__main__":
    app.run_server(host="127.0.0.1", port=8060, debug=True)