In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

In [3]:
x1 = np.array([1, 0, 0]).reshape(1, -1)
x2 = np.array([-1, 0, 0]).reshape(1, -1)
x3 = np.array([0, 1, 0]).reshape(1, -1)
x4 = np.array([1, 0, 0]).reshape(1, -1)

def distance(x1, x2):
    return 1 - cosine_similarity(x1, x2)

distance(x1, x4)

array([[0.]])

# Load Datasets

In [47]:
df = pd.read_csv('GoodReads_100k_books.csv.gz', compression='gzip')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB
None


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [48]:
# Handling empty data

df = df.fillna("").rename({'desc': 'description'}, axis=1)[:10_000]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        10000 non-null  object 
 1   bookformat    10000 non-null  object 
 2   description   10000 non-null  object 
 3   genre         10000 non-null  object 
 4   img           10000 non-null  object 
 5   isbn          10000 non-null  object 
 6   isbn13        10000 non-null  object 
 7   link          10000 non-null  object 
 8   pages         10000 non-null  int64  
 9   rating        10000 non-null  float64
 10  reviews       10000 non-null  int64  
 11  title         10000 non-null  object 
 12  totalratings  10000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 1015.8+ KB


In [49]:
# Item-to-item recommendation system based description only.

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # Lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # Tokenize document
    tokens = nltk.word_tokenize(doc)
    # Filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Re-create docuement from filtered tokens
    doc = ' '.join(filtered_tokens)
    
    return doc

# Assuming `df['description']` is the column with text data
descriptions = list(df['description'])

# Use tqdm to show progress bar while normalizing
norm_corpus = []
for doc in tqdm(descriptions, desc="Normalizing descriptions"):
    norm_corpus.append(normalize_document(doc))

print(f"Total normalized documents: {len(norm_corpus)}")

Normalizing descriptions: 100%|████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2387.95it/s]

Total normalized documents: 10000





# Keyword Based Recommendation: Using BM25 Okapi

In [50]:
"""
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""

import math
from six import iteritems
from six.moves import xrange


PARAM_K1 = 2.5
PARAM_B = 0.85
EPSILON = 0.2

class BM25(object):
  """Implementation of Best Matching 25 ranking function.
  Attributes
  ----------
  corpus_size : int
      Size of corpus (number of documents).
  avgdl : float
      Average length of document in `corpus`.
  corpus : list of list of str
      Corpus of documents.
  f : list of dicts of int
      Dictionary with terms frequencies for each document in `corpus`. Words used as keys and frequencies as values.
  df : dict
      Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
  idf : dict
      Dictionary with inversed terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
  """

  def __init__(self, corpus):
    """
    Parameters
    ----------
    corpus : list of list of str
      Given corpus.
    """
    self.corpus_size = len(corpus)
    self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
    self.corpus = corpus
    self.f = []
    self.df = {}
    self.idf = {}
    self.doc_len = []
    self.initialize()

  def initialize(self):
    """Calculates frequencies of terms in documents and in corpus.
       Also computes inverse document frequencies."""
    for document in self.corpus:
      frequencies = {}
      self.doc_len.append(len(document))

      for word in document:
        if word not in frequencies:
          frequencies[word] = 0
        frequencies[word] += 1
      self.f.append(frequencies)

      for word, freq in iteritems(frequencies):
        if word not in self.df:
          self.df[word] = 0
        self.df[word] += 1

    for word, freq in iteritems(self.df):
      self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)


  def get_score(self, document, index, average_idf):
    """Computes BM25 score of given `document` in relation to item of corpus
    corpus
      selected by `index`

    Parameters
    ----------
    document : list of str
      Document to be scored.
    index : int
      Index of document in corpus selected to score with `document`.
    average_idf : float
      Average idf in corpus.
    Returns
    -------
    float
      BM25 score.
    """
    score = 0
    for word in document:
      if word not in self.f[index]:
        continue
      idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
      score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B +
                PARAM_B * self.doc_len[index] / self.avgdl)))
    return score

  def get_scores(self, document, average_idf):
    """Computes and returns BM25 scores of given `document` in
    relation to every item in corpus.
    Parameters
    ----------
    document : list of str
      Document to be scored.
    average_idf : float
      Average idf in corpus.
    Returns
    -------
    list of float
      BM25 scores.
    """
    scores = []
    for index in xrange(self.corpus_size):
      score = self.get_score(document, index, average_idf)
      scores.append(score)
    return scores

In [51]:
def get_bm25_weights(corpus):
  """Returns BM25 scores (weights) of documents in corpus.
     Each document has to be weighted with every document in given corpus.

  Parameters
  ----------
  corpus : list of list of str
    Corpus of documents.

  Returns
  -------
  list of list of float
    BM25 scores.

  Examples
  --------
  >>> from gensim.summarization.bm25 import get_bm25_weights
  >>> corpus = [
    ... ["black", "cat", "white", "cat"],
    ... ["cat", "outer", "space"],
    ... ["wag", "dog"]
    ... ]
  >>> result = get_bm25_weights(corpus)
  """
  bm25 = BM25(corpus)
  average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
  weights = []
  for doc in corpus:
    scores = bm25.get_scores(doc, average_idf)
    weights.append(scores)
  return weights


In [52]:
norm_corpus_tokens = list([nltk.word_tokenize(doc) for doc in norm_corpus])

# Display the first three tokenized documents
print(norm_corpus_tokens[:3])

[['reveals', 'several', 'hundred', 'thousand', 'indians', 'affected', 'civil', 'war', 'twenty', 'thousand', 'indians', 'enlisted', 'sides', 'attempt', 'gain', 'legitimacy', 'autonomy', 'simply', 'land'], ['fashion', 'sourcebook', '1920s', 'first', 'book', 'brandnew', 'series', 'fiell', 'publishing', 'documents', 'comprehensively', 'seasonal', 'fashion', 'styles', '20th', 'century', 'decade', 'decade', 'sumptuously', 'illustrated', '600', 'original', 'photographs', 'drawings', 'prints', 'title', 'musthave', 'reference', 'work', 'students', 'fashion', 'fashionistas', 'fashion', 'sourcebook', '1920s', 'focuses', 'art', 'deco', 'period', 'beautiful', 'beaded', 'dresses', 'cloche', 'hats', 'tbar', 'shoes', 'worn', 'fashionable', 'flappers', 'bright', 'young', 'things', 'time', 'accompanying', 'introduction', 'outlines', 'major', 'themes', 'within', 'fashion', 'period', 'introduces', 'famous', 'designers', 'assesses', 'creative', 'contributions', 'text', 'english', 'french', 'german', 'also'

In [53]:
# %%time
# wts = get_bm25_weights(norm_corpus_tokens)

In [54]:
# # Word Embedding

# bm25_wts_df = pd.DataFrame(wts)
# bm25_wts_df.head()

In [55]:
# # Testing

# sample = "Fashion Sourcebook 1920s"

# books_list = df['title'].values
# print(books_list[:10])
# # Find Book ID


# Keyword Based Recommendation: Using TfidfVectorizer

In [56]:
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix.shape

(10000, 82396)

**Handling Large Similarity with Compute Row-Wise Similarities**

Instead of building a full matrix, calculate and store only the top-k similarities for each document. This is memory-efficient and aligns with most practical use cases.

In [57]:
tfidf_matrix[0]

<1x82396 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [66]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

top_k = 10  # Keep only top 10 similar documents per document
top_k_similarities = {}

# Total number of documents
num_docs = tfidf_matrix.shape[0]  # Total number of documents
chunk_size = 100  # Adjust based on memory constraints

# Calculate number of chunks
num_chunks = int(np.ceil(num_docs / chunk_size))

# Adjust loop to correctly handle the number of chunks
for i in tqdm(range(num_chunks), desc="Processing rows for top-k similarities"):
    start_i = i * chunk_size
    end_i = min((i + 1) * chunk_size, num_docs)  # Ensure end_i does not exceed num_docs
    
    # Skip empty chunks
    if start_i >= num_docs:
        break
    
    chunk_i = tfidf_matrix[start_i:end_i]
    
    # Compute similarities for the chunk
    similarities = cosine_similarity(chunk_i, tfidf_matrix)
    
    # Extract top-k similar documents for each row
    for idx, row in enumerate(similarities):
        row_idx = start_i + idx
        top_k_indices = np.argsort(row)[-(top_k + 1):-1]  # Indices of top-k similarities
        top_k_values = row[top_k_indices]
        top_k_similarities[row_idx] = list(zip(top_k_indices, top_k_values))


Processing rows for top-k similarities: 100%|████████████████████████████████████████| 100/100 [00:06<00:00, 15.62it/s]


In [67]:
print(len(top_k_similarities))
top_k_similarities

10000


{0: [(1957, 0.09526654048526988),
  (9025, 0.09647535483267751),
  (2151, 0.09739569591521013),
  (8224, 0.0986196736552857),
  (6129, 0.10161241930401158),
  (6581, 0.10727028772477624),
  (3060, 0.11728116175837082),
  (524, 0.12135034032380136),
  (1210, 0.12364829695072695),
  (9182, 0.13590589814666232)],
 1: [(6790, 0.15957181598901896),
  (9075, 0.15973626476692288),
  (7936, 0.1660300081248206),
  (8687, 0.16618312250653317),
  (8059, 0.17078548281761188),
  (3108, 0.17096866608740444),
  (1019, 0.18871794509686352),
  (1083, 0.20290667953942748),
  (7449, 0.22204093003814204),
  (223, 0.25284716527364526)],
 2: [(6115, 0.08221309577092634),
  (7455, 0.08708609465201922),
  (936, 0.08713273629637185),
  (8005, 0.08903261524992902),
  (2689, 0.09113674756159755),
  (1009, 0.093725425451669),
  (146, 0.09569828066342667),
  (6184, 0.09595504619550255),
  (1001, 0.10387602225872128),
  (8587, 0.1385704156711971)],
 3: [(941, 0.10208177805623532),
  (3191, 0.10790007060146159),
  (

In [72]:
# Testing

sample = "Fashion Sourcebook 1920s"

books_list = df['title'].values
# Find Book ID
book_idx = np.where(books_list == sample)[0][0]
print(f"Sample movies {sample} id: {book_idx}")

# Get top 10 similar books
book_similarities_idxs = [item[0] for item in top_k_similarities[book_idx]]
similar_books = books_list[book_similarities_idx]
print("Top 10 Recommendations:")
for i, movie in enumerate(similar_books):
    print(i+1, movie)

Sample movies Fashion Sourcebook 1920s id: 1
Top 10 Recommendations:
1 The Fashion World of Jean Paul Gaultier: From the Sidewalk to the Catwalk
2 The Business of Fashion: Designing, Manufacturing and Marketing
3 Pattern-drafting for Fashion: The Basics: The Basics
4 The Art of Dress: Fashion in England and France 1750 to 1820
5 Fashion Flair for Portrait and Wedding Photography
6 Graphic Design for Fashion
7 Figure Drawing for Fashion Design
8 Chronicle of Western Fashion
9 Vintage Fashion
10 Icons of Fashion: The 20th Century


**Handling Large Similarity Metrics with Sparse Matrices**

If most of the similarity values are zero (or near zero), you can use sparse matrices to save memory. Libraries like scipy.sparse are well-suited for this.

In [74]:
import numpy as np
from scipy.sparse import lil_matrix

# Chunk size
chunk_size = 1000
num_chunks = int(np.ceil(tfidf_matrix.shape[0] / chunk_size))

# Sparse matrix to store similarities
full_sim_matrix = lil_matrix((tfidf_matrix.shape[0], tfidf_matrix.shape[0]))

# Compute similarity by chunks
for i in tqdm(range(num_chunks), desc="Processing chunks"):
    start_i = i * chunk_size
    end_i = min((i + 1) * chunk_size, tfidf_matrix.shape[0])
    chunk_i = tfidf_matrix[start_i:end_i]
    
    # Self-similarity for the chunk
    self_sim = 1 - cosine_similarity(chunk_i)
    full_sim_matrix[start_i:end_i, start_i:end_i] = self_sim
    
    # Inter-chunk similarity
    for j in range(i + 1, num_chunks):
        start_j = j * chunk_size
        end_j = min((j + 1) * chunk_size, tfidf_matrix.shape[0])
        chunk_j = tfidf_matrix[start_j:end_j]
        
        inter_sim = 1 - cosine_similarity(chunk_i, chunk_j)
        full_sim_matrix[start_i:end_i, start_j:end_j] = inter_sim
        full_sim_matrix[start_j:end_j, start_i:end_i] = inter_sim.T

# Convert to a compressed sparse format for efficiency
full_sim_matrix = full_sim_matrix.tocsr()

Processing chunks: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.81s/it]


In [80]:
# Testing

sample = "Fashion Sourcebook 1920s"

books_list = df['title'].values
# Find Book ID
book_idx = np.where(books_list == sample)[0][0]
print(f"Sample movies {sample} id: {book_idx}")

# Get Movie Similarities
row = full_sim_matrix[book_idx] # Retrieves the row as a sparse matrix
row_dense = row.toarray().flatten() # Convert row to a dense 1D array

# Get top 10 similar books
book_similarities_idxs = np.argsort(-row_dense)[1:11]
similar_books = books_list[book_similarities_idx]
print("Top 10 Recommendations:")
for i, movie in enumerate(similar_books):
    print(i+1, movie)

Sample movies Fashion Sourcebook 1920s id: 1
Top 10 Recommendations:
1 The Fashion World of Jean Paul Gaultier: From the Sidewalk to the Catwalk
2 The Business of Fashion: Designing, Manufacturing and Marketing
3 Pattern-drafting for Fashion: The Basics: The Basics
4 The Art of Dress: Fashion in England and France 1750 to 1820
5 Fashion Flair for Portrait and Wedding Photography
6 Graphic Design for Fashion
7 Figure Drawing for Fashion Design
8 Chronicle of Western Fashion
9 Vintage Fashion
10 Icons of Fashion: The 20th Century
