In [1]:
# import
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# ML

### Loading clean and merged data

In [2]:
# load cleaned data = dataset_lowercase
dataset_lowercase = pd.read_csv('Cleaned_Book_ETL.csv', encoding='UTF-8', sep=',')


In [3]:
# Convert data types

column_types = {
    'User-ID': 'int64',
    'ISBN': 'string',
    'Book-Rating': 'int64',
    'Book-Title': 'string',
    'Book-Author': 'string',
    'Year-Of-Publication': 'int64',  # nebo 'string', pokud jsou hodnoty smíšené
    'Publisher': 'string',
    'Image-URL-S': 'string',
    'Image-URL-M': 'string',
    'Image-URL-L': 'string'
}

# Data types conversion
for column, dtype in column_types.items():
    dataset_lowercase[column] = dataset_lowercase[column].astype(dtype)

In [4]:
columns_to_drop = ['Image-URL-S', 'Image-URL-M']

# redundant columns
dataset_lowercase = dataset_lowercase.drop(columns=columns_to_drop)
dataset_lowercase

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,276726,0155061224,5,rites of passage,judith rae,2001,heinle,http://images.amazon.com/images/P/0155061224.0...
1,276729,052165615x,3,help!: level 1,philip prowse,1999,cambridge university press,http://images.amazon.com/images/P/052165615X.0...
2,276729,0521795028,6,the amsterdam connection : level 4 (cambridge ...,sue leather,2001,cambridge university press,http://images.amazon.com/images/P/0521795028.0...
3,276744,038550120x,7,a painted house,john grisham,2001,doubleday,http://images.amazon.com/images/P/038550120X.0...
4,276747,0060517794,9,little altars everywhere,rebecca wells,2003,harpertorch,http://images.amazon.com/images/P/0060517794.0...
...,...,...,...,...,...,...,...,...
383815,276704,0743211383,7,dreamcatcher,stephen king,2001,scribner,http://images.amazon.com/images/P/0743211383.0...
383816,276704,0806917695,5,perplexing lateral thinking puzzles: scholasti...,paul sloane,1997,sterling publishing,http://images.amazon.com/images/P/0806917695.0...
383817,276704,1563526298,9,get clark smart : the ultimate guide for the s...,clark howard,2000,longstreet press,http://images.amazon.com/images/P/1563526298.0...
383818,276709,0515107662,10,the sherbrooke bride (bride trilogy (paperback)),catherine coulter,1996,jove books,http://images.amazon.com/images/P/0515107662.0...


In [5]:
# Check for duplicates - one more time
print(dataset_lowercase.duplicated().sum())  # Počet duplicitních řádků

0


## Filter before ML ?

In [6]:
# FILTER data before pivot - Unnecessary - filter in APP

# Filter *: uživatelé, kteří hodnotili alespoň 2 knihy - není potřeba pro ML model důležitá

# Default Filter 0: uživatelé, kteří hodnotili alespoň 10 knih = 255000 záznamů
# Frontend Filter 1: determinative readers - uživatelé, kteří hodnotili knihu "The Fellowship of the Ring (The Lord of the Rings, Part 1)" od autora Tolkien = 144 záznamů příliš restriktivní

filter_df = dataset_lowercase.copy()

book_counts = filter_df.groupby('User-ID')['Book-Rating'].count()
# dataset_lowercase['User-ID'].loc[dataset_lowercase['User-ID'].isin(book_counts)]
book_counts = book_counts[book_counts > 10]

dataset_lowercase = filter_df.loc[filter_df['User-ID'].isin(book_counts.index)]


In [7]:
# SET test
chosen_book = 'the fellowship of the ring (the lord of the rings, part 1)'
chosen_book_readers = dataset_lowercase.loc[dataset_lowercase['Book-Title'] == chosen_book, 'User-ID']

books_of_chosen_readers = dataset_lowercase.loc[dataset_lowercase['User-ID'].isin(chosen_book_readers)]
books_of_chosen_readers.head()


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
2252,254,60502320,7,"i've got you, babe",karen kendall,2002,avon,http://images.amazon.com/images/P/0060502320.0...
2253,254,60934700,9,smoke and mirrors: short fictions and illusions,neil gaiman,2001,perennial,http://images.amazon.com/images/P/0060934700.0...
2254,254,60976977,7,amazing grace : lives of children and the cons...,jonathan kozol,1996,perennial,http://images.amazon.com/images/P/0060976977.0...
2255,254,64471047,7,"the lion, the witch, and the wardrobe (the chr...",c. s. lewis,1994,harpercollins,http://images.amazon.com/images/P/0064471047.0...
2256,254,66238501,5,complete chronicles of narnia,c. s. lewis,2001,harpercollins juvenile books,http://images.amazon.com/images/P/0066238501.0...


In [8]:
# pivot table
book_pivot = books_of_chosen_readers.pivot_table(columns='User-ID', index='Book-Title', values='Book-Rating') # pivot_table umí agregovat a umí pracovat s NaN
book_pivot.fillna(0, inplace=True) # Memory-efficient
book_pivot

User-ID,254,1674,11676,11944,16601,16795,22818,23571,23699,23872,...,254206,258614,259057,259901,260419,265313,271176,274393,276050,276313
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"earth prayers from around the world: 365 prayers, poems, and invocations for honoring the earth",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nonbook materials: the organization of integrated collections,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!yo!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'a hell of a place to lose a cow': an american hitchhiking odyssey,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'eine gute frau hat keinen kopf'. europгѓ?г‚в¤ische sprichwгѓ?г‚в¶rter гѓ?г‚вјber frauen.,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zwгѓ?г‚в¶lf.,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
гѓ?ngeles fugaces (falling angels),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
гѓ?г‚?ber das fernsehen.,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
г‚вїeres tu mi mamгѓвў?/are you my mother?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# sparse matrix
book_sparse = csr_matrix(book_pivot)

In [10]:
# Unsupervised learner for implementing neighbor searches
# algorithm{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, default=’auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [11]:
# model vrací dvě hodnoty:
# 1) distances: Vzdálenosti mezi danou knihou (chosen_book) a jejími nejbližšími sousedy
# 2) suggestions: Indexy knih, které jsou nejbližší sousedé k dané knize (chosen_book)

# .values.reshape(1, -1): Převede řádek na numpy pole a přetvoří jej do tvaru (1, n), kde n je počet sloupců. To je nutné, protože kneighbors očekává 2D pole.
distances, suggestions = model.kneighbors(book_pivot.loc[[chosen_book]].values, n_neighbors=10) # n_neighbors=5, default
suggestions

array([[13378, 15544, 14924,  5740, 13746,  5748,  5744,  5745,  5750,
         5752]], dtype=int64)

In [12]:
# print all the suggested books
for i in range(len(suggestions)):
  print(book_pivot.index[suggestions[i]])

Index(['the fellowship of the ring (the lord of the rings, part 1)',
       'the two towers (the lord of the rings, part 2)',
       'the return of the king (the lord of the rings, part 3)',
       'harry potter and the chamber of secrets (book 2)',
       'the hobbit : the enchanting prelude to the lord of the rings',
       'harry potter and the prisoner of azkaban (book 3)',
       'harry potter and the goblet of fire (book 4)',
       'harry potter and the order of the phoenix (book 5)',
       'harry potter and the sorcerer's stone (book 1)',
       'harry potter and the sorcerer's stone (harry potter (paperback))'],
      dtype='string', name='Book-Title')
