In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
import re
from tqdm import tqdm

import spacy

from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Prepare Model

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hugging Face checkpoint that both the tokenizer and the encoder are loaded from.
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The encoder is moved to `device` once here; inputs must be placed on the same device.
model = AutoModel.from_pretrained(checkpoint).to(device)


def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Parameters
    ----------
    model_output
        Object exposing a ``last_hidden_state`` tensor.
    attention_mask : torch.Tensor
        Mask that is broadcast over the last axis of ``last_hidden_state``;
        positions with value 0 contribute nothing to the average.

    Returns
    -------
    torch.Tensor
        ``last_hidden_state`` reduced over axis 1 by a mask-weighted mean.
    """
    hidden = model_output.last_hidden_state
    # Give the mask a trailing axis and stretch it to the embedding shape.
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_sum = torch.sum(hidden * mask, 1)
    # Clamp the denominator so a fully masked row yields zeros, not a division by zero.
    kept_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / kept_count

def get_embeddings(documents, device=device):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    documents : str or list of str
        Text handed directly to ``tokenizer`` (padded and truncated).
    device : torch.device or str, optional
        Device the tokenized inputs are moved to before the forward pass.
        Defaults to the module-level ``device``; it must be the device the
        model lives on, otherwise the forward pass will fail.

    Returns
    -------
    numpy.ndarray
        Mean-pooled embeddings produced by ``mean_pooling``, on the CPU.
    """
    # Accept plain strings such as "cpu" or "cuda:1" as well as torch.device.
    device = torch.device(device)

    encoded_input = tokenizer(documents,
                              padding=True,
                              truncation=True,
                              return_tensors='pt')
    # Move the inputs to the requested device itself rather than a hard-coded
    # 'cuda', so an explicit device index is honoured. On the CPU this is a no-op.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    # .cpu() returns the tensor unchanged when it is already on the CPU, so a
    # single return path serves both devices.
    return sentence_embedding.cpu().numpy()



In [3]:
# Load the spaCy English pipeline; it is used later to read each token's part-of-speech tag.
nlp = spacy.load('en_core_web_sm')

# Load Dataset

In [4]:
# Read the gzip-compressed GoodReads export into a DataFrame.
df = pd.read_csv('GoodReads_100k_books.csv.gz', compression='gzip')

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB
None


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [5]:
# Replace missing values with empty strings, expose the `desc` column under
# the name `description`, and keep only the first 10,000 rows.

df = df.fillna("").rename(columns={'desc': 'description'}).iloc[:10_000]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        10000 non-null  object 
 1   bookformat    10000 non-null  object 
 2   description   10000 non-null  object 
 3   genre         10000 non-null  object 
 4   img           10000 non-null  object 
 5   isbn          10000 non-null  object 
 6   isbn13        10000 non-null  object 
 7   link          10000 non-null  object 
 8   pages         10000 non-null  int64  
 9   rating        10000 non-null  float64
 10  reviews       10000 non-null  int64  
 11  title         10000 non-null  object 
 12  totalratings  10000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 1015.8+ KB


In [6]:
# Item-to-item recommendation system based on the description only.

# English stop-word list from NLTK.
# NOTE(review): assumes the NLTK "stopwords" corpus is already available locally — TODO confirm.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Return a cleaned, stop-word-free version of ``doc``.

    Steps: remove every character that is not an ASCII letter, digit or
    whitespace; lowercase; trim surrounding whitespace; tokenize with
    ``nltk.word_tokenize``; drop tokens found in the module-level
    ``stop_words``; re-join the remaining tokens with single spaces.

    Parameters
    ----------
    doc : str
        Raw text to normalize.

    Returns
    -------
    str
        The normalized text (empty when nothing survives filtering).
    """
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I | re.A` would be read as a cap
    # on the number of substitutions and the flags would never be applied.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # Tokenize the document
    tokens = nltk.word_tokenize(doc)
    # Filter stop words out of the document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Re-create the document from the filtered tokens
    doc = ' '.join(filtered_tokens)

    return doc

# Raw description strings taken from the `description` column, in row order
descriptions = list(df['description'])

# Normalize every description; tqdm wraps the iterable to show a progress bar
norm_corpus = [
    normalize_document(raw_text)
    for raw_text in tqdm(descriptions, desc="Normalizing descriptions")
]

print(f"Total normalized documents: {len(norm_corpus)}")

Normalizing descriptions: 100%|█████████████████████████████████████████████████| 10000/10000 [00:12<00:00, 784.99it/s]

Total normalized documents: 10000





# Semantic-Based Recommendation

In [12]:
# Extract important words

def preprocessing(text):
    """Keep only the nouns, proper nouns, verbs and adjectives of ``text``.

    The text is run through the module-level ``nlp`` pipeline; the text of
    every token whose ``pos_`` is NOUN, PROPN, VERB or ADJ is kept, in the
    original order, and the kept tokens are joined with single spaces.
    """
    wanted_pos = ('NOUN', 'PROPN', 'VERB', 'ADJ')
    return " ".join(tok.text for tok in nlp(text) if tok.pos_ in wanted_pos)

# Reduce every raw description to its nouns, proper nouns, verbs and adjectives
important_corpus = [
    preprocessing(raw_text)
    for raw_text in tqdm(descriptions, desc="Extract Important Word Descriptions")
]

print(f"Total important documents: {len(important_corpus)}")

Extract Important Word Descriptions: 100%|███████████████████████████████████████| 10000/10000 [07:21<00:00, 22.64it/s]

Total important documents: 10000





In [31]:
# Compare one reference book against a hand-picked list of titles, once per
# text-preparation variant (important-word corpus first, normalized corpus second).
corpus = [important_corpus, norm_corpus]
books_list = df['title'].values

titles = ["The Fashion World of Jean Paul Gaultier: From the Sidewalk to the Catwalk",
          "The Business of Fashion: Designing, Manufacturing and Marketing",
          "Pattern-drafting for Fashion: The Basics: The Basics",
          "The Art of Dress: Fashion in England and France 1750 to 1820",
          "Fashion Flair for Portrait and Wedding Photography",
          "Graphic Design for Fashion",
          "Figure Drawing for Fashion Design",
          "Chronicle of Western Fashion",
          "Vintage Fashion",
          "Icons of Fashion: The 20th Century",
         ]

sample = "Fashion Sourcebook 1920s"

# Row positions depend only on the titles, not on the corpus variant, so they
# are resolved once. Each lookup takes the first row whose title matches and
# raises IndexError when the title is absent from the dataset.
sample_idx = np.where(books_list == sample)[0][0]
title_idxs = [np.where(books_list == title)[0][0] for title in titles]

# scores[i][j] is the cosine similarity, under corpus variant i, between the
# sample's description and the description of titles[j].
scores = []
for c in corpus:
    # The sample embedding is the same for every candidate: compute it once per corpus.
    sample_embedding = get_embeddings(c[sample_idx])
    scores.append([
        cosine_similarity(sample_embedding, get_embeddings(c[idx]))[0][0]
        for idx in title_idxs
    ])

In [32]:
# Transpose `scores` so each row is a candidate title and each column a corpus variant.
pd.DataFrame(np.array(scores).T, columns=['important', 'normal'], index=titles)

Unnamed: 0,important,normal
The Fashion World of Jean Paul Gaultier: From the Sidewalk to the Catwalk,0.614356,0.680296
"The Business of Fashion: Designing, Manufacturing and Marketing",0.519874,0.4853
Pattern-drafting for Fashion: The Basics: The Basics,0.625809,0.622278
The Art of Dress: Fashion in England and France 1750 to 1820,0.649171,0.637237
Fashion Flair for Portrait and Wedding Photography,0.504274,0.447996
Graphic Design for Fashion,0.591846,0.606061
Figure Drawing for Fashion Design,0.587002,0.542397
Chronicle of Western Fashion,0.674326,0.689146
Vintage Fashion,0.644016,0.626385
Icons of Fashion: The 20th Century,0.812408,0.779049
