In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
import re
import json

from tqdm import tqdm

import spacy

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

In [2]:
pd.set_option('display.max_colwidth', 2000)

# Prepare Model For Semantic Based Recommendation

In [3]:
# Load nlp model
nlp = spacy.load('en_core_web_sm')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)


def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9)

def get_embeddings(documents, device=device):
    encoded_input = tokenizer(documents, 
                              padding=True, 
                              truncation=True, 
                              return_tensors='pt')
    if device.type == 'cuda':
        encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}

    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    if device.type == 'cuda':
        return sentence_embedding.cpu().numpy()
        
    return sentence_embedding.numpy()



# Load Dataset

In [5]:
df = pd.read_csv('GoodReads_100k_books.csv.gz', compression='gzip')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB
None


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,"Reveals that several hundred thousand Indians were affected by the Civil War and that twenty thousand Indians enlisted on both sides in an attempt to gain legitimacy, autonomy, or simply land.","History,Military History,Civil War,American History,American Civil War,Nonfiction,North American Hi...,American History,Native Americans",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1387738765l/1001053.jpg,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Between_Two_Fires,0,3.52,5,Between Two Fires: American Indians in the Civil War,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,"Fashion Sourcebook - 1920s is the first book in a brand-new series by Fiell Publishing that documents comprehensively the seasonal fashion styles of the 20th century, decade by decade. Sumptuously illustrated with over 600 original photographs, drawings and prints, this title is a must-have reference work for not only students of fashion, but for all fashionistas. Fashion Sourcebook - 1920s focuses on the Art Deco period with its beautiful beaded dresses, cloche hats and t-bar shoes as worn by the fashionable flappers and the ""bright young things"" of the time. An accompanying introduction outlines the major themes within fashion during this period and introduces its most famous designers and assesses their creative contributions. Text in English, French & German. Also Available: Fashion Sourcebook - 1930s ISBN: 9781906863586 24.95""","Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1421011497l/10010552.jpg,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashion-sourcebook-1920s,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,"The seminal history and analysis of the Hungarian Revolution and the workers' councils, perhaps the single most important revolutionary event ever, and this is simply the best book on it.","Politics,History",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1348117708l/1001077.jpg,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungary_56,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life and work of Joseph A. Labadie (1850-1933), Detroit's prominent labor organizer and one of early labor's most influential activists. A dynamic participant in the major social reform movements of the Gilded Age, Labadie was a central figure in the pervasive struggle for a new social order as the American Midwest underwent rapid industrialization at the end of the 19th century. This engaging biography follows Labadie's colorful career from a childhood among a Pottawatomi tribe in the Michigan woods through his local and national involvement in a maze of late 19th-century labor and reform activities, including participation in the Socialist Labor party, Knights of Labor, Greenback movement, trades councils, typographical union, eight-hour-day campaigns, and the rise of the American Federation of Labor. In writing this biography of her grandfather, Carlotta R. Anderson consulted the renowned Labadie Collection at the University of Michigan, a unique collection of protest literature which extensively documents pivotal times in American labor history and radical history.","Labor,History",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1356461214l/1001079.jpg,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_American_Anarchist,324,3.83,1,All-American Anarchist: Joseph A. Labadie and the Labor Movement,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa table, table surprenante par sa diversitÃ© et son originalitÃ©. Tous initient leurs petits Ã la vie gourmande en puisant dans un panier aux ressources immenses. Pour y parvenir, lâ€™oiseau a modifiÃ© son anatomie, sa morphologie, mais surtout il a radicalement adaptÃ© son organisme Ã ses choix. Par ses photos magnifiques et ses textes fascinants, lâ€™auteur nous invite Ã dÃ©couvrir les innombrables et subtiles facettes de lâ€™alimentation des oiseaux., - ,www.jeanleveille.org",,https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1293221069l/10010880.jpg,2761920813,,https://goodreads.com/book/show/10010880-les-oiseaux-gourmands,177,4.0,1,Les oiseaux gourmands,1


In [6]:
# Handling empty data

df = df.fillna("").rename({'desc': 'description'}, axis=1)[:10_000]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        10000 non-null  object 
 1   bookformat    10000 non-null  object 
 2   description   10000 non-null  object 
 3   genre         10000 non-null  object 
 4   img           10000 non-null  object 
 5   isbn          10000 non-null  object 
 6   isbn13        10000 non-null  object 
 7   link          10000 non-null  object 
 8   pages         10000 non-null  int64  
 9   rating        10000 non-null  float64
 10  reviews       10000 non-null  int64  
 11  title         10000 non-null  object 
 12  totalratings  10000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 1015.8+ KB


In [7]:
# Item-to-item recommendation system based description only.

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # Lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # Tokenize document
    tokens = nltk.word_tokenize(doc)
    # Filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Re-create docuement from filtered tokens
    doc = ' '.join(filtered_tokens)
    
    return doc

# Assuming `df['description']` is the column with text data
descriptions = list(df['description'])

# Use tqdm to show progress bar while normalizing
norm_corpus = []
for doc in tqdm(descriptions, desc="Normalizing descriptions"):
    norm_corpus.append(normalize_document(doc))

print(f"Total normalized documents: {len(norm_corpus)}")

Normalizing descriptions: 100%|█████████████████████████████████████████████████| 10000/10000 [00:14<00:00, 686.69it/s]

Total normalized documents: 10000





In [8]:
# Extract important words

def preprocessing(text):
    # Get token of words
    doc = nlp(text)
    result = []
    for token in doc:
        # If the token is adjective, noun, propn, or verb
        if token.pos_ in ['NOUN', 'PROPN', 'VERB', 'ADJ']:
            result.append(token.text)
        else:
            continue
    return " ".join(result)

important_corpus = []
for doc in tqdm(descriptions, desc="Extract Important Word Descriptions"):
    important_corpus.append(preprocessing(doc))

print(f"Total important documents: {len(important_corpus)}")

Extract Important Word Descriptions: 100%|███████████████████████████████████████| 10000/10000 [06:25<00:00, 25.96it/s]

Total important documents: 10000





# Prepare Keyword Based Recommendation

In [66]:
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(important_corpus)

tfidf_matrix.shape

(10000, 79672)

In [80]:
def scoring_similarity_keyword(sentence1, sentence2, tfidf=tf):
    return cosine_similarity(tfidf.transform([sentence1])[0],
                                 tfidf.transform([sentence2])[0])[0][0]

# Testing

In [91]:
corpus = [important_corpus]
books_list = df['title'].values
genre_list = df['genre'].values

titles = ["The Fashion World of Jean Paul Gaultier: From the Sidewalk to the Catwalk",
          "The Business of Fashion: Designing, Manufacturing and Marketing",
          "Pattern-drafting for Fashion: The Basics: The Basics",
          "The Art of Dress: Fashion in England and France 1750 to 1820",
          "Fashion Flair for Portrait and Wedding Photography",
          "Graphic Design for Fashion",
          "Figure Drawing for Fashion Design",
          "Chronicle of Western Fashion",
          "Vintage Fashion",
          "Icons of Fashion: The 20th Century",
         ]

sample = "Fashion Sourcebook 1920s"
scores = {'genre': [], 'score_keyword': [], 'score_semantic': [], 'score_genre': []}

for c in corpus:
    s = []
    for title in titles:
        desc_1 = c[np.where(books_list==sample)[0][0]]
        desc_2 = c[np.where(books_list==title)[0][0]]
        gen_1 = genre_list[np.where(books_list==sample)[0][0]]
        gen_2 = genre_list[np.where(books_list==title)[0][0]]

        scores['genre'].append(gen_2)
        scores['score_genre'].append(cosine_similarity(get_embeddings(gen_1),
                                                 get_embeddings(gen_2))[0][0])
        scores['score_keyword'].append(scoring_similarity_keyword(desc_1, desc_2))
        scores['score_semantic'].append(cosine_similarity(get_embeddings(desc_1),
                                                    get_embeddings(desc_2))[0][0])


In [93]:
temp = pd.DataFrame(scores, index=titles)
temp['score_ovl'] = 0.5 * temp['score_keyword'] + 0.5 * (0.25 * temp['score_genre'] + 0.75 * temp['score_semantic'])
temp.sort_values(by='score_ovl', ascending=False)

Unnamed: 0,genre,score_keyword,score_semantic,score_genre,score_ovl
Icons of Fashion: The 20th Century,"Couture,Fashion,Nonfiction",0.265076,0.812408,0.943938,0.555183
Chronicle of Western Fashion,"Couture,Fashion,History,Nonfiction,Reference,Reference,Research",0.207279,0.674326,0.911028,0.47039
Vintage Fashion,"Couture,Fashion,Nonfiction,History,Art,Reference",0.216514,0.644016,0.948082,0.468273
Graphic Design for Fashion,"Couture,Fashion,Design,Nonfiction",0.174701,0.591846,0.951263,0.428201
The Art of Dress: Fashion in England and France 1750 to 1820,"Couture,Fashion,History,Literature,18th Century,Nonfiction,Art,Art History,Cultural,France,European Literature,British Literature,Literature,19th Century,Art",0.175039,0.649171,0.756652,0.42554
Figure Drawing for Fashion Design,"Couture,Fashion,Art",0.192598,0.587002,0.774646,0.413256
The Fashion World of Jean Paul Gaultier: From the Sidewalk to the Catwalk,"Couture,Fashion,Art",0.167355,0.614356,0.774646,0.410892
Pattern-drafting for Fashion: The Basics: The Basics,"Crafts,Sewing,Couture,Fashion",0.157673,0.625809,0.531064,0.379898
Fashion Flair for Portrait and Wedding Photography,"Art,Photography,Nonfiction",0.180618,0.504274,0.782738,0.377254
"The Business of Fashion: Designing, Manufacturing and Marketing","Couture,Fashion",0.166307,0.519874,0.662918,0.360971


In [101]:
# Open the file with explicit encoding
with open("testing_reccomendation.json", "r", encoding="utf-8") as f:
    data = json.load(f)

testing = pd.DataFrame.from_dict(data).T
testing

Unnamed: 0,title,genre,description
1,Decades of Fashion,"Fashion, Nonfiction, Art, Photography","From the glamour of the '30s to the extremes of the '80s, from the corsets of the Belle Époque to the kaftans of the '70s, this volume offers an impressive pictorial overview of the fashion world of the entire twentieth century. Chanel’s jersey suits, Dior’s “new look”, Mary Quant’s minis, and Alaïa’s stretch bodies are high fashion developments that also had an impact on everyday clothing. In addition to the great designers, prominent icons like Marlene Dietrich and Madonna provided impulses. People are not the only source of fashion political events also make their mark on what we wear, and thus the book presents many fashions in terms of their historical context. Decades of Fashion captures the world of fashion in hundreds of photographs drawn both from the fashion runways and from everyday life. Brilliant pictures of every situation and area of life relate the exciting history of fashion."
2,1920s Fashion: The Definitive Sourcebook,"Fashion, Nonfiction, Fashion History, Historical, Art, History","Featuring 600 totally original, period photographs and illustrations completely redefining the appraisal of 1920s fashion Inspiration for the vintage trend still dominating high streets and catwalks Includes many re-discovered images of not only Hollywood stars in the latest flapper fashions but also elegant illustrations from the leading fashion houses Contains biographies of renowned designers and fashion houses Companion work to 1930s Fashion: The Definitive Sourcebook ISBN 9781783130153 and 1940s Fashion: The Definitive Sourcebook ISBN 9781847960467 also by Emmanuelle Dirix and Charlotte FiellContaining page after page of fabulous Art Deco fashions, 1920s Fashion documents comprehensively the seasonal fashion styles of the Twenties. Sumptuously illustrated with over 600 original photographs, drawings, and prints, this title is a must-have reference work for not only students of fashion and vintage collectors, but for all fashionistas. 1920s Fashion focuses on the Art Deco period with its beautiful beaded dresses, cloche hats and strappy shoes as worn by the fashionable flappers and the 'bright young things' of the time. The book is divided up into different themed sections, such as daywear, eveningwear, accessories, etc. for easy inspirational reference.Featuring the whole gamut of Art Deco ensembles, from couturier dresses to everyday mail-order fashions, 1920s Fashion includes a fascinating introduction outlining the major themes within fashion during this period, introducing its most famous designers and assessing their creative contributions. A cornucopia of beautiful clothes with exquisite detailing, this book provides a rich source of inspiration through an in-depth and important survey of Art Deco fashion.Contents: Introduction by Emmanuelle Dirix; Catalogue section: Daywear; Outerwear; Eveningwear; Accessories; Other; End matter: Index; Biographies of designers.'"
3,"Vintage Fashion: Collecting and Wearing Designer Classics, 1900-1990","Fashion, Nonfiction, History, Art, Fashion History, Reference","The definitive word on the most influential designers and looks of the twentieth century, Vintage Fashion is a gorgeous resource for fashionistas, vintage clothing fanatics, and designers of all ages and varieties. Whether an aspiring fashion professional or simply a lover of vintage styles, Emma Baxter Wright’s acclaimed guide to wearing and collecting vintage articles is an unmatched and invaluable guide to discovering a more exciting life-of-style today."
4,Art Deco Fashion,"Fashion, Nonfiction, Art, Design, History, Reference","The most glamorous time for fashion in the 20th century--the Jazz Age--shimmers with trademark exuberance in this first-ever compendium of the style of the Art Deco era. From flapper dresses to feathers, fashion exploded during the Roaring '20s, when clothes became a symbol of a more liberated lifestyle and epitomized the glamour and youthful excitement of the Jazz Age. Hemlines and waistlines slowly crept toward each other as the motto for style--and life--became Anything Goes! In Art Deco Fashion the world of Hollywood and F. Scott Fitzgerald comes to life in images of beaded evening dresses for dancing the Charleston; sporty outfits for golf, tennis, and swimming; and clothes designed for traveling in luxury liners, trains, or in streamlined cars. Accented with posters, photographs, and images from fashion magazines of the era, this sumptuous volume presents a thorough and stunning review of Deco fashion."
5,Fashion: The Definitive History of Costume and Style,"Fashion, History, Nonfiction, Art, Reference, Fashion History, Historical, Art Design, Design, Research","Tracing the evolution of fashion — from the early draped fabrics of ancient times to the catwalk couture of today — Fashion: The Definitive History of Costume and Style is a stunningly illustrated guide to more than three thousand years of shifting trends and innovative developments in the world of clothing. Containing everything you need to know about changing fashion and style — from ancient Egyptian dress to Space Age Fashion and Grunge — and information on icons like Marie Antoinette, Clara Bow, Jacqueline Kennedy, and Alexander McQueen, Fashion catalogs the history of what people wear, revealing how Western fashion has been influenced by design from around the world and celebrating costume and haute couture. Fashion will captivate anyone interested in style — whether it's the fashion-mad teen in Tokyo, the wannabe designer in college, or the fashionista intrigued by the violent origins of the stiletto and the birth of bling."
6,100 Years of Fashion Illustration,"Fashion, Art, Nonfiction, History, Design, Fashion History, Reference, Fashion Design, Art Design","A visual feast of 400 dazzling images, this is a comprehensive survey of the genre over the last century. The book also offers an overview of the development of fashion, as seen through the eyes of the greatest illustrators of the day. Early in the century fashion illustration reflected new, liberating currents in art and culture, such as the exoticism of the Ballets Russes, while the postwar period saw inspiration from the great Parisian couturiers. After the dominance of the celebrity fashion photographer in the '60s, a new generation of illustrators emerged, embracing the medium of the computer, while many returned to more traditional techniques."


In [103]:
sample = "Fashion Sourcebook 1920s"
base_desc = important_corpus[np.where(books_list==sample)[0][0]]
base_genre = genre_list[np.where(books_list==sample)[0][0]]

scores = {'genre': [], 'score_keyword': [], 'score_semantic': [], 'score_genre': []}

for _, row in testing.iterrows():
    scores['genre'].append(row['genre'])
    scores['score_keyword'].append(scoring_similarity_keyword(base_desc, row['description']))
    scores['score_semantic'].append(cosine_similarity(get_embeddings(base_desc),
                                                      get_embeddings(row['description']))[0][0])
    scores['score_genre'].append(cosine_similarity(get_embeddings(base_genre),
                                                      get_embeddings(row['genre']))[0][0])

In [109]:
temp = pd.DataFrame(scores, index=testing['title'].values)
temp['score_ovl'] = 0.25 * temp['score_keyword'] + 0.75 * (0.25 * temp['score_genre'] + 0.75 * temp['score_semantic'])
temp.sort_values(by='score_ovl', ascending=False)

Unnamed: 0,genre,score_keyword,score_semantic,score_genre,score_ovl
1920s Fashion: The Definitive Sourcebook,"Fashion, Nonfiction, Fashion History, Historical, Art, History",0.4179,0.803752,0.877079,0.721038
Decades of Fashion,"Fashion, Nonfiction, Art, Photography",0.126531,0.739505,0.858426,0.608559
Art Deco Fashion,"Fashion, Nonfiction, Art, Design, History, Reference",0.171482,0.654879,0.885364,0.577246
Fashion: The Definitive History of Costume and Style,"Fashion, History, Nonfiction, Art, Reference, Fashion History, Historical, Art Design, Design, Research",0.158078,0.677165,0.793478,0.569202
"Vintage Fashion: Collecting and Wearing Designer Classics, 1900-1990","Fashion, Nonfiction, History, Art, Fashion History, Reference",0.104215,0.658455,0.851357,0.556064
100 Years of Fashion Illustration,"Fashion, Art, Nonfiction, History, Design, Fashion History, Reference, Fashion Design, Art Design",0.074249,0.584929,0.787087,0.495163
