In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored on the Drive can be
# read through ordinary file paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Loading the dataset
def loaddata(path='/content/drive/MyDrive/keywords.csv'):
    """Load the book keywords dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the comma-separated file. Defaults to the copy stored on
        the mounted Google Drive, so calling ``loaddata()`` with no arguments
        reads that file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # The file is decoded as latin-1 rather than pandas' default encoding.
    return pd.read_csv(path, sep=',', encoding='latin-1')

keywords   = loaddata()

In [None]:
keywords.shape

(9959, 11)

In [None]:
# Vectorize the keywords summary using TF-IDF.
# TfidfVectorizer computes the TF-IDF score for each term in each document;
# here every entry of keywords["keywords"] is treated as one document.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer = 'word',
                        min_df=3,  # ignore terms that occur in fewer than 3 documents
                        max_df = 0.6,  # ignore terms that occur in more than 60% of the documents
                        stop_words="english",  # drop scikit-learn's built-in English stop words
                        encoding = 'utf-8',
                        token_pattern=r"(?u)\S\S+")  # a token is any run of two or more non-whitespace characters
# Learn the vocabulary and build the document-term matrix in a single step.
tfidf_encoding = tfidf.fit_transform(keywords["keywords"])

In [None]:
tfidf_encoding.shape

(9959, 6328)

In [None]:
print(tfidf.get_feature_names_out()[1:100])

In [None]:
# Sparse matrix in which rows represent books and columns represent terms.
# Each entry is the TF-IDF weight of that term for that book, a numeric
# representation suitable for ML purposes.
tfidf_encoding

<9959x6328 sparse matrix of type '<class 'numpy.float64'>'
	with 137323 stored elements in Compressed Sparse Row format>

In [None]:
tfidf_encoding.toarray().shape

(9959, 6328)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between the TF-IDF vectors of every pair of books.
# Entry [i, j] of the result is the similarity between book i and book j.
book_cosine_sim = cosine_similarity(tfidf_encoding, tfidf_encoding)

In [None]:
# Preview Similarity Matrix
book_cosine_sim.shape
# print(book_cosine_sim)

(9959, 9959)

Recommendation

In [None]:
# Titles of all books. Position i in this Series corresponds to row/column i
# of the similarity matrix, because both are derived from the rows of `keywords`.
books = pd.Series(keywords['title'])


def recommend_books_similar_to(book_name, n=5, cosine_sim_mat=book_cosine_sim, verbose=True):
    """Return the titles of the ``n`` books most similar to ``book_name``.

    Parameters
    ----------
    book_name : str
        Exact title of the query book as it appears in ``books``. If the
        title occurs more than once, the first occurrence is used.
    n : int, optional
        Number of recommendations to return. Values below 1 yield an empty
        list.
    cosine_sim_mat : 2-D array-like, optional
        Square similarity matrix whose row ``i`` holds the similarity of book
        ``i`` to every book. The default is bound to ``book_cosine_sim`` at
        the time this function is defined.
    verbose : bool, optional
        When true (the default), print the position of the query book, its
        row of similarity scores and the positions of the selected books.

    Returns
    -------
    list of str
        Titles ordered by decreasing similarity to the query book. The query
        book itself is never included.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``books``.
    """
    # Locate the query by position rather than by index label, so the lookup
    # stays aligned with the matrix rows even if `books` has a non-default index.
    matches = (books == book_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"book not found in catalogue: {book_name!r}")
    input_idx = int(matches[0])

    similarities = pd.Series(cosine_sim_mat[input_idx])

    # Remove the query book explicitly instead of assuming it sorts first:
    # another book with identical keywords ties at 1.0, and a book whose
    # TF-IDF vector is all zeros has a self-similarity of 0. In both cases
    # skipping "the first result" could return the query book itself.
    # A stable sort keeps tied scores in their original (catalogue) order.
    ranked = similarities.drop(input_idx).sort_values(ascending=False, kind="stable")
    top_n_books_idx = list(ranked.iloc[:max(n, 0)].index)

    if verbose:
        print("index id: ", input_idx)
        print("cosine matrix: ", similarities)
        print("book indices: ", top_n_books_idx)

    return books.iloc[top_n_books_idx].tolist()

In [None]:
recomm = recommend_books_similar_to("Read the Bible for a Change: Understanding and Responding to God's Word", 3)
recomm

index id:  1493
cosine matrix:  0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.007343
          ...   
9954    0.000000
9955    0.009286
9956    0.000000
9957    0.009778
9958    0.000000
Length: 9959, dtype: float64
book indices:  [5968, 1607, 9219]


['A High View of Scripture?: The Authority of the Bible and the Formation of the New Testament Canon',
 'Words of Delight: A Literary Introduction to the Bible',
 'Dominion and Dynasty: A Theology of the Hebrew Bible']

In [None]:
def calculate_precision_recall(recommended_books, ground_truth_books):
    """Score a list of recommendations against a list of relevant items.

    Both inputs are de-duplicated before scoring.

    Parameters
    ----------
    recommended_books : iterable
        Items produced by the recommender.
    ground_truth_books : iterable
        Items considered relevant.

    Returns
    -------
    tuple
        ``(precision, recall)``. Precision is the share of distinct
        recommended items that are relevant; recall is the share of distinct
        relevant items that were recommended. Either value is ``0`` when its
        denominator would be zero.
    """
    recommended = set(recommended_books)
    relevant = set(ground_truth_books)

    # Number of distinct items that appear in both collections
    hits = len(recommended & relevant)

    if recommended:
        precision = hits / len(recommended)
    else:
        precision = 0

    if relevant:
        recall = hits / len(relevant)
    else:
        recall = 0

    return precision, recall

# Hypothetical list of relevant titles used as the reference for scoring
ground_truth_books = [
    "A Light to the Nations: The Missional Church and the Biblical Story",
    "Read the Bible for a Change: Understanding and Responding to God's Word",
    "The HarperCollins Study Bible: Fully Revised & Updated",
    "Why Our Church Switched to the ESV",
    "Where To Find It In The Bible The Ultimate A To Z Resource",
    "Words of Delight: A Literary Introduction to the Bible",
]

# Request ten recommendations for one title and score them against the list above
recommended_books = recommend_books_similar_to(
    "A High View of Scripture?: The Authority of the Bible and the Formation of the New Testament Canon",
    10,
)

precision, recall = calculate_precision_recall(recommended_books, ground_truth_books)
print(f'Precision: {precision}')
print(f'Recall: {recall}')


Precision: 0.4
Recall: 0.6666666666666666
