# **Sistem Rekomendasi Buku Berbasis Content-Based Filtering Menggunakan Cosine Similarity untuk Menyediakan Saran Bacaan Berdasarkan Kesamaan Konten**

## **Instalasi Pustaka yang Dibutuhkan**

In [None]:
pip install pandas scikit-learn nltk




In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path read later in this notebook lives under it
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK resources this notebook relies on:
# 'punkt_tab' (data for word_tokenize), 'stopwords' and 'wordnet' (for the lemmatizer)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## **Memuat dataset [7K Books](https://www.kaggle.com/datasets/dylanjcastillo/7k-books-with-metadata) yang sebelumnya telah diunduh dari Kaggle.**

In [None]:
# Load the 7k Books dataset from the mounted Google Drive folder
df = pd.read_csv('/content/drive/MyDrive/KELAS/SEMESTER 7/Pengantar Recommender System/UTS/archive (1)/books.csv')

# Print the first few rows to confirm the data was read
print(df.head())


          isbn13      isbn10           title subtitle  \
0  9780002005883  0002005883          Gilead      NaN   
1  9780002261982  0002261987    Spider's Web  A Novel   
2  9780006163831  0006163831    The One Tree      NaN   
3  9780006178736  0006178731  Rage of angels      NaN   
4  9780006280897  0006280897  The Four Loves      NaN   

                           authors                     categories  \
0               Marilynne Robinson                        Fiction   
1  Charles Osborne;Agatha Christie  Detective and mystery stories   
2             Stephen R. Donaldson               American fiction   
3                   Sidney Sheldon                        Fiction   
4              Clive Staples Lewis                 Christian life   

                                           thumbnail  \
0  http://books.google.com/books/content?id=KQZCP...   
1  http://books.google.com/books/content?id=gA5GP...   
2  http://books.google.com/books/content?id=OmQaw...   
3  http://books.go

## **Preprocessing Teks**

In [None]:
# Set up the stemmer, lemmatizer and English stop-word set used by preprocess_text
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Data cleaning - drop rows that have NaN in 'title', 'description' or 'categories'
df = df.dropna(subset=['title', 'description', 'categories'])

# Drop duplicate rows, comparing on 'title' ONLY (not description/categories);
# no `keep` argument is passed, so pandas' default of keeping the first occurrence applies
df = df.drop_duplicates(subset=['title'])

In [None]:
import pandas as pd

# Count how many books each category has
category_counts = df['categories'].value_counts()

# Keep only the categories that have more than 100 books
valid_categories = category_counts.index[category_counts.gt(100).to_numpy()]

# Restrict the dataset to rows whose category is in that set
df = df.loc[df['categories'].isin(valid_categories)]

# Kurangi jumlah buku dalam kategori yang lebih dari 500
def limit_books_in_large_categories(df, max_books=500):
    """Cap every category at ``max_books`` rows.

    Rows are grouped by the 'categories' column. A group with more than
    ``max_books`` rows is down-sampled with a fixed seed (random_state=42) so
    repeated runs pick the same rows; smaller groups pass through unchanged.

    Parameters:
        df (DataFrame): Book table containing a 'categories' column.
        max_books (int): Maximum number of rows kept per category.

    Returns:
        DataFrame: The per-category pieces concatenated in group order, with a
        fresh 0..n-1 index (ignore_index=True).
    """
    def _cap(group):
        # Small groups are returned as-is; only oversized ones are sampled
        if len(group) <= max_books:
            return group
        return group.sample(n=max_books, random_state=42)

    capped_groups = [_cap(group) for _, group in df.groupby('categories')]
    return pd.concat(capped_groups, ignore_index=True)

df = limit_books_in_large_categories(df, max_books=500)

# Recount the remaining categories and how many books each one has
print("Kategori yang digunakan setelah pembersihan:")
print(df['categories'].value_counts())


Kategori yang digunakan setelah pembersihan:
categories
Fiction                      500
Juvenile Fiction             500
Biography & Autobiography    377
History                      252
Literary Criticism           155
Philosophy                   146
Comics & Graphic Novels      140
Religion                     132
Drama                        112
Juvenile Nonfiction          108
Name: count, dtype: int64


In [None]:
# Fungsi preprocessing teks
def preprocess_text(text):
    """Turn raw text into a space-joined string of cleaned, stemmed tokens.

    Pipeline: lowercase -> strip URLs / @mentions / #hashtags -> tokenize ->
    keep alphanumeric tokens that are not stop words -> Porter-stem, then
    WordNet-lemmatize each token -> apply the contraction mapping -> join.

    Relies on the module-level ``stemmer``, ``lemmatizer`` and ``stop_words``.

    Parameters:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    # Case folding
    lowered = str(text).lower()

    # Remove URLs, mentions and hashtags
    cleaned = re.sub(r'http\S+|www\S+|https\S+|@\S+|#\S+', '', lowered)

    # Tokenize, then drop punctuation / non-alphanumeric tokens and stop words in one pass
    kept = [
        token
        for token in word_tokenize(cleaned)
        if token.isalnum() and token not in stop_words
    ]

    # Stem first, then lemmatize the stem (same order as applying the two passes separately)
    reduced = [lemmatizer.lemmatize(stemmer.stem(token)) for token in kept]

    # Contraction mapping.
    # NOTE(review): every key contains an apostrophe, and tokens that are not
    # purely alphanumeric were already removed by the isalnum() filter above, so
    # this mapping appears to never match — confirm, and consider applying it
    # right after tokenization if the expansion is actually wanted.
    normalization_dict = {
        "n't": "not", "’re": "are", "’s": "is", "’d": "would", "’ll": "will", "’t": "not", "’ve": "have", "’m": "am"
    }

    return ' '.join(normalization_dict.get(token, token) for token in reduced)

# Run the text preprocessing over 'title', 'description' and 'categories',
# storing each result in a matching 'processed_<column>' column
for source_col in ('title', 'description', 'categories'):
    df[f'processed_{source_col}'] = df[source_col].apply(preprocess_text)

# Concatenate the three processed columns (space-separated) into the text used for content analysis
df['content'] = df['processed_title'].str.cat(
    [df['processed_description'], df['processed_categories']],
    sep=' ',
)

# Show the raw and processed columns side by side
df[['title', 'processed_title', 'description', 'processed_description', 'categories', 'processed_categories', 'content']].head()

Unnamed: 0,title,processed_title,description,processed_description,categories,processed_categories,content
0,The Real Trial of Oscar Wilde,real trial oscar wild,Oscar Wilde had one of literary history's most...,oscar wild one literari histori explos love af...,Biography & Autobiography,biographi autobiographi,real trial oscar wild oscar wild one literari ...
1,A Year in the Life of William Shakespeare,year life william shakespear,1599 was an epochal year for Shakespeare and E...,1599 epoch year shakespear england shakespear ...,Biography & Autobiography,biographi autobiographi,year life william shakespear 1599 epoch year s...
2,Travels,travel,Often I feel I go to some distant region of th...,often feel go distant region world remind real...,Biography & Autobiography,biographi autobiographi,travel often feel go distant region world remi...
3,Walt Whitman,walt whitman,"Whitman's genius, passions, poetry, and androg...",whitman geniu passion poetri androgyn sensibl ...,Biography & Autobiography,biographi autobiographi,walt whitman whitman geniu passion poetri andr...
4,How to Make Love Like a Porn Star,make love like porn star,When the stewardess brought me off the plane i...,stewardess brought plane wheelchair lower head...,Biography & Autobiography,biographi autobiographi,make love like porn star stewardess brought pl...


In [None]:
# Summarise columns, non-null counts and dtypes after preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2422 entries, 0 to 2421
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   isbn13                 2422 non-null   int64  
 1   isbn10                 2422 non-null   object 
 2   title                  2422 non-null   object 
 3   subtitle               900 non-null    object 
 4   authors                2385 non-null   object 
 5   categories             2422 non-null   object 
 6   thumbnail              2374 non-null   object 
 7   description            2422 non-null   object 
 8   published_year         2422 non-null   float64
 9   average_rating         2416 non-null   float64
 10  num_pages              2416 non-null   float64
 11  ratings_count          2416 non-null   float64
 12  processed_title        2422 non-null   object 
 13  processed_description  2422 non-null   object 
 14  processed_categories   2422 non-null   object 
 15  cont

## **TF-IDF dan Cosine Similarity**

In [None]:
# Build the TF-IDF model (with scikit-learn's English stop-word list) and
# vectorise the combined 'content' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])

# Pairwise cosine similarity between all books
# (with a single argument the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)

# Print the similarity between the first two books as a spot check
print(cosine_sim[0, 1])


0.025113356206671936


**Menampilkan Buku dengan Cosine Similarity Tertinggi**

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Find the pair of distinct books with the highest cosine similarity.
# The diagonal holds each book's similarity with itself, so it is masked with -inf.
# The mask is applied to a COPY: filling `cosine_sim` in place would leave -inf on
# the diagonal of the matrix that is pickled at the end of this notebook.
masked_sim = cosine_sim.copy()
np.fill_diagonal(masked_sim, -np.inf)

# Flat argmax converted back to (row, column) coordinates
max_sim_idx = np.unravel_index(np.argmax(masked_sim), masked_sim.shape)

# Positions of the two most similar books
book1_idx, book2_idx = max_sim_idx

# Look up both titles and the similarity value for the report
book1_title = df.iloc[book1_idx]['title']
book2_title = df.iloc[book2_idx]['title']
max_similarity = masked_sim[book1_idx, book2_idx]

print(f"The two most similar books are '{book1_title}' and '{book2_title}' with a cosine similarity of {max_similarity:.4f}.")


The two most similar books are 'The Diary of Virginia Woolf' and 'The Diary of Virginia Woolf: 1931-1935' with a cosine similarity of 1.0000.


## **Fungsi untuk Rekomendasi Berdasarkan Keyword Inputan User**

### **1. Menampilkan Rekomendasi Berdasarkan Top 10 Cosine Similarity**

In [None]:
# Fungsi rekomendasi berdasarkan keyword
def get_recommendations_by_keyword(keyword, cosine_sim=cosine_sim, top_n=10):
    """Return the ``top_n`` books most similar to a free-text keyword.

    The keyword is preprocessed with ``preprocess_text``, vectorised with the
    module-level ``tfidf_vectorizer`` and compared against the module-level
    ``tfidf_matrix``; rows are taken from the module-level ``df``.

    Parameters:
        keyword (str): Search text entered by the user.
        cosine_sim (array): Not used by this function; kept so existing
            callers that pass it keep working.
        top_n (int): Number of books to return (default 10, the previous
            hard-coded value).

    Returns:
        DataFrame: Columns 'isbn13', 'title', 'description', 'categories' and
        'cosine_similarity', ordered by descending similarity.
    """
    # Preprocess the keyword and project it into the fitted TF-IDF space
    processed_keyword = preprocess_text(keyword)
    keyword_vector = tfidf_vectorizer.transform([processed_keyword])

    # Similarity between the keyword and every book
    sim_scores = cosine_similarity(keyword_vector, tfidf_matrix).flatten()

    # Stable sort on the negated scores gives descending order while keeping
    # tied books in their original row order
    top_books_indices = np.argsort(-sim_scores, kind='stable')[:top_n]

    # Copy the selection so adding a column does not write into a slice of df
    recommended_books = df[['isbn13', 'title', 'description', 'categories']].iloc[top_books_indices].copy()
    recommended_books['cosine_similarity'] = sim_scores[top_books_indices]

    return recommended_books

# Example run with a sample keyword
keyword = "art"
recommended_books = get_recommendations_by_keyword(keyword)
print(recommended_books)

             isbn13                      title  \
1913  9780439736466  Where We Are, what We See   
2144  9780024266408           The Presocratics   
2282  9781590302255             The Art of War   
2279  9781570623950                    Lao Tzu   
2397  9781567996036                     Angels   
1056  9780143036395          The Stone Diaries   
2192  9780394704371          The Will to Power   
59    9780156027915           A Writer's Diary   
1535  9780786808397        Baby Einstein: Dogs   
1556  9780786808373       Baby Einstein: Birds   

                                            description  \
1913  A collection of poems, stories, essays, and ar...   
2144                     Library of Liberal Arts title.   
2282  Written around the 6th century BC, The Art of ...   
2279  A new translation of the classic "Book of the ...   
2397  A pictorial examination of the use of angels i...   
1056  Carol Shields' The Stone Diaries irrevocably t...   
2192  Offers a selection from the au

### **Evaluasi Sistem dengan Precision dan Recall**

In [None]:
def evaluate_recommendations(keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, k=10, threshold=0.3):
    """
    Score the keyword recommendations with precision and recall.

    A book counts as relevant when its cosine similarity to the keyword is at
    least ``threshold``; the recommended set is the first ``k`` rows returned by
    ``get_recommendations_by_keyword``.

    Parameters:
        keyword (str): Keyword used for the recommendation search.
        df (DataFrame): Book table; its 'isbn13' column identifies books.
        cosine_sim (array): Cosine similarity matrix, forwarded to
            ``get_recommendations_by_keyword``.
        tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF model.
        tfidf_matrix (sparse matrix): TF-IDF matrix of the book contents.
        k (int): Number of recommendations considered.
        threshold (float): Minimum cosine similarity for a book to be relevant.

    Returns:
        precision (float): Share of recommended books that are relevant.
        recall (float): Share of relevant books that were recommended.
        Both are 0.0 when no book reaches the threshold.
    """
    # 1. ISBNs of the top-k recommended books
    top_k = get_recommendations_by_keyword(keyword, cosine_sim).head(k)
    recommended_books = top_k['isbn13'].tolist()

    # 2. Score every book against the keyword and collect the relevant ones
    keyword_vector = tfidf_vectorizer.transform([preprocess_text(keyword)])
    sim_scores = cosine_similarity(keyword_vector, tfidf_matrix).flatten()
    relevant_books_indices = [pos for pos, score in enumerate(sim_scores) if score >= threshold]
    relevant_books = df['isbn13'].iloc[relevant_books_indices].tolist()

    # Nothing is relevant at this threshold: report and bail out early
    if not relevant_books:
        print("Tidak ada buku relevan ditemukan dengan threshold similarity.")
        return 0.0, 0.0

    # 3. Precision and recall from the overlap of the two ISBN sets
    true_positives = len(set(recommended_books).intersection(relevant_books))
    precision = true_positives / len(recommended_books) if recommended_books else 0
    # relevant_books is known to be non-empty here because of the guard above
    recall = true_positives / len(relevant_books)

    # 4. Report the evaluation details
    print(f"Buku relevan ({len(relevant_books)}): {relevant_books}")
    print(f"Buku rekomendasi ({len(recommended_books)}): {recommended_books}")
    print(f"True Positives: {true_positives}")

    return precision, recall


# Example evaluation with a similarity threshold of 0.3
precision, recall = evaluate_recommendations(keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, k=10, threshold=0.3)
print(f"Precision: {precision}, Recall: {recall}")


Buku relevan (3): [9780439736466, 9780024266408, 9781590302255]
Buku rekomendasi (10): [9780439736466, 9780024266408, 9781590302255, 9781570623950, 9781567996036, 9780143036395, 9780394704371, 9780156027915, 9780786808397, 9780786808373]
True Positives: 3
Precision: 0.3, Recall: 1.0


### **2. Menampilkan Rekomendasi Berdasarkan Threshold=0,3**

In [None]:
def get_recommendations_by_keyword(keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, threshold=0.3):
    """
    Recommend every book whose similarity to the keyword reaches a threshold.

    Parameters:
        keyword (str): Search keyword.
        df (DataFrame): Book table with 'isbn13', 'title', 'description' and 'categories'.
        cosine_sim (array): Cosine similarity matrix (not used in the body).
        tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF model.
        tfidf_matrix (sparse matrix): TF-IDF matrix of the book contents.
        threshold (float): Minimum cosine similarity for a book to be returned.

    Returns:
        recommended_books (DataFrame): Matching books with a 'cosine_similarity'
        column, ordered by descending similarity.
    """
    # Preprocess the keyword and project it into the TF-IDF space
    keyword_vector = tfidf_vectorizer.transform([preprocess_text(keyword)])

    # Similarity between the keyword and every book
    sim_scores = cosine_similarity(keyword_vector, tfidf_matrix).flatten()

    # Keep (position, score) pairs at or above the threshold, best score first;
    # sorted() is stable, so tied books stay in row order
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(sim_scores) if score >= threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )
    positions = [pos for pos, _ in ranked]
    scores = [score for _, score in ranked]

    # Pull the matching rows and attach their similarity scores
    recommended_books = df[['isbn13', 'title', 'description', 'categories']].iloc[positions].copy()
    recommended_books['cosine_similarity'] = scores

    return recommended_books


# Example run with a sample keyword
keyword = "art"
recommended_books = get_recommendations_by_keyword(keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, threshold=0.3)
print(recommended_books)


             isbn13                      title  \
1913  9780439736466  Where We Are, what We See   
2144  9780024266408           The Presocratics   
2282  9781590302255             The Art of War   

                                            description           categories  \
1913  A collection of poems, stories, essays, and ar...  Juvenile Nonfiction   
2144                     Library of Liberal Arts title.           Philosophy   
2282  Written around the 6th century BC, The Art of ...           Philosophy   

      cosine_similarity  
1913           0.374073  
2144           0.336053  
2282           0.317158  


### **Evaluasi Sistem dengan Precision dan Recall**

In [None]:
from sklearn.metrics import precision_score, recall_score

def evaluate_recommendations(keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, threshold=0.7):
    """
    Compute precision and recall for the threshold-based recommendations.

    NOTE(review): the recommended set comes from
    ``get_recommendations_by_keyword(..., threshold)`` and the relevant set is
    built below with the same ``score >= threshold`` rule on the same kind of
    similarity scores, so the two sets look identical by construction —
    confirm whether an independent relevance ground truth was intended.

    Parameters:
        keyword (str): Keyword used to look up recommendations.
        df (DataFrame): Book table; 'isbn13' identifies books.
        cosine_sim (array): Cosine similarity matrix (forwarded to the recommender).
        tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF model.
        tfidf_matrix (sparse matrix): TF-IDF matrix of the book contents.
        threshold (float): Minimum cosine similarity for a book to count as relevant.

    Returns:
        precision (float): Precision of the recommendations (1.0 if nothing was recommended).
        recall (float): Recall of the recommendations (1.0 if nothing is relevant).
    """
    # ISBNs recommended for the keyword
    recommendations = get_recommendations_by_keyword(
        keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, threshold
    )
    recommended_isbns = set(recommendations['isbn13'])

    # Score every book against the keyword
    keyword_vector = tfidf_vectorizer.transform([preprocess_text(keyword)])
    sim_scores = cosine_similarity(keyword_vector, tfidf_matrix).flatten()

    # ISBNs of the books whose similarity reaches the threshold
    relevant_positions = [pos for pos, score in enumerate(sim_scores) if score >= threshold]
    relevant_isbns = set(df['isbn13'].iloc[relevant_positions])

    # Precision / recall from the overlap; an empty denominator yields 1.0
    hit_count = len(recommended_isbns & relevant_isbns)
    if recommended_isbns:
        precision = hit_count / len(recommended_isbns)
    else:
        precision = 1.0
    if relevant_isbns:
        recall = hit_count / len(relevant_isbns)
    else:
        recall = 1.0

    return precision, recall

# Example evaluation run (threshold 0.3, same keyword as above)
precision, recall = evaluate_recommendations(keyword, df, cosine_sim, tfidf_vectorizer, tfidf_matrix, threshold=0.3)
print(f"Precision: {precision:}, Recall: {recall:}")


Precision: 1.0, Recall: 1.0


## **Menyimpan Model TF-IDF dan Matriks Cosine Similarity**

In [None]:
import pickle

# Persist the fitted TF-IDF model and the cosine similarity matrix as pickle files
artifacts = (
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('cosine_similarity_matrix.pkl', cosine_sim),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)
