<a href="https://colab.research.google.com/github/akselea/Book-Recommendation-System-ML/blob/main/Book-Recommendation-System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Submission Dicoding - Sistem Rekomendasi
##### Nama: Aksel Estevannanda Arianto
##### Dataset diambil dari Kaggle 
###### **Book Recomendation Dataset** - https://www.kaggle.com/datasets/dylanjcastillo/7k-books-with-metadata


### Memasukkan *Library* yang Digunakan:

In [1]:
# Memasukan library yang dibutuhkan dalam membangun model
import pathlib
import os
from google.colab import files

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

### Mengunduh *Dataset* dari Situs Kaggle:

In [2]:
# Install terlebih dahulu Library untuk mengakses Kaggle
! pip install -q kaggle

In [3]:
# Memasukkan API Token Kaggle
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"akselestevannandaa","key":"72167085ecc204832f209a6ca813368d"}'}

In [4]:
# Membuat direktori untuk menyimpan dataset dari Kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [5]:
# Memberi instruksi agar file yang diunduh dapat dimodifikasi hanya oleh yang memiliki API Token saja
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Mengunduh dataset dari Kaggle sesuai dengan URL yang telah dimasukkan
! kaggle datasets download dylanjcastillo/7k-books-with-metadata -p /content/dataset/ --unzip  

Downloading 7k-books-with-metadata.zip to /content/dataset
 68% 1.00M/1.47M [00:00<00:00, 1.99MB/s]
100% 1.47M/1.47M [00:00<00:00, 2.70MB/s]


In [7]:
# Memetakan dataset yang telah diunduh
books = pd.read_csv('/content/dataset/books.csv')

print("Banyak Buku dalam Dataset:", len(books['isbn13'].unique()))

Banyak Buku dalam Dataset: 6810


### *Data Preprocessing*:

In [8]:
# Mencari info terkait dataset "books"
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [9]:
# Menghapus beberapa kolom yang tidak memiliki kaitan dengan sistem rekomendasi buku
books = books.drop(['subtitle', 'description', 'isbn10', 'num_pages', 'ratings_count', 'thumbnail'], axis=1)
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   title           6810 non-null   object 
 2   authors         6738 non-null   object 
 3   categories      6711 non-null   object 
 4   published_year  6804 non-null   float64
 5   average_rating  6767 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 319.3+ KB


In [10]:
# Mengecek jumlah data yang kosong pada setiap kolom pada dataset
books.isnull().sum()

isbn13             0
title              0
authors           72
categories        99
published_year     6
average_rating    43
dtype: int64

In [11]:
# Menghapus data yang kosong pada dataset, dan mengganti jenis data pada kolom "published_year" menjadi int
books = books.dropna()
books['published_year'] = books['published_year'].astype(int)
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6599 entries, 0 to 6809
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6599 non-null   int64  
 1   title           6599 non-null   object 
 2   authors         6599 non-null   object 
 3   categories      6599 non-null   object 
 4   published_year  6599 non-null   int64  
 5   average_rating  6599 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 360.9+ KB


### *Data Preparation*:

In [12]:
# Membuat dataset baru yang akan digunakan untuk model
df = books
df.head()

Unnamed: 0,isbn13,title,authors,categories,published_year,average_rating
0,9780002005883,Gilead,Marilynne Robinson,Fiction,2004,3.85
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,2000,3.83
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,1982,3.97
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,1993,3.93
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,2002,4.15


In [13]:
# Melihat penyebaran data untuk kolom "categories"
df['categories'].value_counts()

Fiction                      2561
Juvenile Fiction              524
Biography & Autobiography     398
History                       261
Literary Criticism            165
                             ... 
Bombings                        1
India                           1
Murder                          1
Cocaine industry                1
Mysticism                       1
Name: categories, Length: 563, dtype: int64

In [14]:
# Melihat banyaknya kategori buku yang sangat banyak, maka kategori dibawah 10 buku pada dataset ini akan dihapus
value_counts = df['categories'].value_counts()
to_keep = value_counts[value_counts >= 10].index
df = df.loc[df['categories'].isin(to_keep)]
df['categories'].value_counts()

Fiction                          2561
Juvenile Fiction                  524
Biography & Autobiography         398
History                           261
Literary Criticism                165
Philosophy                        160
Comics & Graphic Novels           144
Religion                          137
Drama                             126
Juvenile Nonfiction               111
Poetry                             77
Literary Collections               71
Science                            71
Business & Economics               66
Social Science                     59
Performing Arts                    50
Art                                46
Cooking                            45
Body, Mind & Spirit                44
Travel                             43
Computers                          42
Psychology                         41
Self-Help                          38
Political Science                  35
Family & Relationships             34
Humor                              32
Health & Fit

In [15]:
# Melihat penyebaran data pada kolom "title"
df['title'].value_counts()

Three Complete Novels        8
Star Wars                    7
The Lord of the Rings        6
The Canterbury Tales         4
Cuba                         4
                            ..
Traveling Mercies            1
Sailing the Wine-Dark Sea    1
The Code Book                1
The Hot Zone                 1
'I'm Telling You Stories'    1
Name: title, Length: 5505, dtype: int64

In [16]:
# Pada kolom data Judul Buku, tanda baca akan dihilangkan dan diganti dengan whitespace
# Buku yang memiliki jumlah lebih dari satu, akan dihapus duplikasinya dari dataset
df['title'] = df['title'].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df = df.drop_duplicates('title')
df['title'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(lambda x: re.sub("[\W_]+"," ",x).strip())


Gilead                              1
Killer 7 Official Strategy Guide    1
Political Philosophy                1
Sociology                           1
Conversations with Zizek            1
                                   ..
The Book of Ruth                    1
Children of the Alley               1
The Beginning and the End           1
Lost in the Funhouse                1
I m Telling You Stories             1
Name: title, Length: 5500, dtype: int64

In [17]:
# Mereset index pada dataframe "df" agar judul buku dapat terbaca oleh model
df.reset_index(inplace=True, drop=True)

In [18]:
# Membuat list data untuk judul, kategori, dan nilai dari buku untuk dibuat menjadi dataframe baru
books_title = df['title'].tolist()
books_category = df['categories'].tolist()
books_rating = df['average_rating'].tolist()

In [19]:
# Membuat Dictionary dari data judul, kategori, dan nilai buku
rating_df = pd.DataFrame({'title':books_title,
                          'category':books_category,
                          'rating':books_rating})
rating_df

Unnamed: 0,title,category,rating
0,Gilead,Fiction,3.85
1,Spider s Web,Detective and mystery stories,3.83
2,The One Tree,American fiction,3.97
3,Rage of angels,Fiction,3.93
4,Master of the Game,Adventure stories,4.11
...,...,...,...
5495,Journey to the East,Adventure stories,3.70
5496,The Monk Who Sold His Ferrari A Fable About Fu...,Health & Fitness,3.82
5497,I Am that,Philosophy,4.51
5498,The Berlin Phenomenology,History,0.00


In [20]:
# Dari dataframe yang telah dibuat, terdapat buku dengan nilai = 0,
# maka dari itu buku yang memiliki nilai = 0 akan dihapus dari dataset
rating_df = rating_df[rating_df['rating'] != 0]
rating_df = rating_df.reset_index()
rating_df

Unnamed: 0,index,title,category,rating
0,0,Gilead,Fiction,3.85
1,1,Spider s Web,Detective and mystery stories,3.83
2,2,The One Tree,American fiction,3.97
3,3,Rage of angels,Fiction,3.93
4,4,Master of the Game,Adventure stories,4.11
...,...,...,...,...
5487,5494,Aspects of the Novel,English fiction,3.83
5488,5495,Journey to the East,Adventure stories,3.70
5489,5496,The Monk Who Sold His Ferrari A Fable About Fu...,Health & Fitness,3.82
5490,5497,I Am that,Philosophy,4.51


### *Model Development* - *Content Based Filtering*:

In [21]:
# Membuat dataset baru dengan nama "data", serta mengambil 10 sampel random dari dataset "data"
data = rating_df
data.sample(10)

Unnamed: 0,index,title,category,rating
2956,2957,Doctor Who,Performing Arts,4.21
4293,4297,See how She Dies,Fiction,3.92
2213,2214,Midnight for Charlie Bone,Juvenile Fiction,3.81
4306,4310,CliffsNotes on Wiesel s Night,Literary Criticism,3.4
4914,4918,Season of Mists,Comics & Graphic Novels,4.54
3970,3973,The Reign of Istar,Fiction,3.71
178,178,The Lust Lizard of Melancholy Cove,Fiction,3.86
1881,1882,History of Philosophy,Philosophy,4.08
3244,3245,Running Dog,Fiction,3.42
4016,4019,The Complete Short Prose 1929 1989,Fiction,4.29


In [22]:
# Menggunakan library `TfidfVectorizer()` untuk membaca jenis dari kategori buku yang ada pada dataset
tf = TfidfVectorizer()
tf.fit(data['category'])
tf.get_feature_names()



['adventure',
 'american',
 'and',
 'architecture',
 'art',
 'arts',
 'authors',
 'autobiography',
 'biography',
 'body',
 'business',
 'children',
 'collections',
 'comics',
 'computers',
 'cooking',
 'crime',
 'criticism',
 'detective',
 'disciplines',
 'drama',
 'economics',
 'education',
 'english',
 'family',
 'fantasy',
 'fiction',
 'fitness',
 'foreign',
 'games',
 'graphic',
 'health',
 'help',
 'history',
 'humor',
 'juvenile',
 'language',
 'law',
 'literary',
 'medical',
 'mind',
 'music',
 'mystery',
 'nature',
 'nonfiction',
 'novels',
 'performing',
 'philosophy',
 'photography',
 'poetry',
 'political',
 'psychology',
 'recreation',
 'reference',
 'relationships',
 'religion',
 'science',
 'self',
 'social',
 'spirit',
 'sports',
 'stories',
 'study',
 'travel',
 'true']

In [23]:
# Melakukan proses `.fit()` untuk data kategori buku pada dataset
tfidf_matrix = tf.fit_transform(data['category'])
tfidf_matrix.shape

(5492, 65)

In [24]:
# Mengubah keluaran dari fungsi `TfidfVectorizer()` menjadi bentuk matriks
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.52415666, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.96966008, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [25]:
# Membuat dataframe baru yang berisi nilai matriks antara judul buku dengan kategori buku
pd.DataFrame(tfidf_matrix.todense(),
             columns = tf.get_feature_names(),
             index = data['title']).sample(22, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,self,american,criticism,medical,history,photography,cooking,true,education,psychology,...,fitness,performing,help,science,social,authors,stories,language,art,religion
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hawthorne s Short Stories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Heaven s Price,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M is for Mayflower,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Historical Romances,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Big Dead Place,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rebecca s Revival,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Amelia Bedelia Goes Camping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Last Train to Paradise,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pink Box,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
The 13th directorate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Menghitung nilai kesamaan derajat antara judul buku dengan kategori buku dengan fungsi `cosine_similarity()`
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.24445719, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.24445719, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [27]:
# Membuat dataframe dari hasil fungsi `cosine_similarity()` dengan baris dan kolom judul buku yang ada dalam dataset
cosine_sim_df = pd.DataFrame(cosine_sim, index=data['title'], columns=data['title'])
print("Shape: ", cosine_sim_df)

cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape:  title                                                 Gilead  Spider s Web  \
title                                                                        
Gilead                                              1.000000       0.00000   
Spider s Web                                        0.000000       1.00000   
The One Tree                                        0.244457       0.00000   
Rage of angels                                      1.000000       0.00000   
Master of the Game                                  0.000000       0.26914   
...                                                      ...           ...   
Aspects of the Novel                                0.233174       0.00000   
Journey to the East                                 0.000000       0.26914   
The Monk Who Sold His Ferrari A Fable About Ful...  0.000000       0.00000   
I Am that                                           0.000000       0.00000   
I m Telling You Stories                             0.00

title,If on a Winter s Night a Traveler,Pink Flamingos and Other Filth,Adiós Hemingway,Port Mungo,Alice s Adventures in Wonderland
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Eyes of the Dragon,1.0,0.0,1.0,1.0,0.450041
Childhood Shadows,0.0,0.0,0.0,0.0,0.0
Fargo Rock City,0.0,0.0,0.0,0.0,0.0
Hunger,1.0,0.0,1.0,1.0,0.450041
The Christmas Shoes,1.0,0.0,1.0,1.0,0.450041
Star Wars,1.0,0.0,1.0,1.0,0.450041
A fine balance,1.0,0.0,1.0,1.0,0.450041
The Story of Avis,0.0,0.0,0.0,0.0,0.0
History of the Peloponnesian War,0.0,0.0,0.0,0.0,0.0
Kate Vaiden,1.0,0.0,1.0,1.0,0.450041


In [28]:
# Mmebuat fungsi baru yang berfungsi untuk menampilkan rekomendasi buku
def books_recommendations(books_name, similarity_data=cosine_sim_df, items=data[['title', 'category']], k=5):
  index = similarity_data.loc[:, books_name].to_numpy().argpartition(range(-1, k, 1))
  closest = similarity_data.columns[index[-1:-(k+2):-1]]
  closest = closest.drop(books_name, errors='ignore')
  return pd.DataFrame(closest).merge(items).head(k)

In [29]:
# Memasukan judul buku yang menjadi acuan untuk rekomendasi buku
data[data.title.eq('The Big Four')]

Unnamed: 0,index,title,category,rating
22,22,The Big Four,Detective and mystery stories,3.59


In [30]:
# Menguji 10 rekomendasi buku berdasarkan buku "The Big Four"
books_recommendations('The Big Four')

Unnamed: 0,title,category
0,The Twenty seventh City,Detective and mystery stories
1,I Am that,Philosophy
2,The Monk Who Sold His Ferrari A Fable About Fu...,Health & Fitness
3,Journey to the East,Adventure stories
4,Aspects of the Novel,English fiction
