In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_mistralai import MistralAIEmbeddings
from langchain_chroma import Chroma # opensource vector database

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
import pandas as pd

books = pd.read_csv('books_cleaned.csv')

In [4]:
books["tagged description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5193    9788172235222 On A Train Journey Home To North...
5194    9788173031014 This book tells the tale of a ma...
5195    9788179921623 Wisdom to Create a Life of Passi...
5196    9788185300535 This collection of the timeless ...
5197    9789027712059 Since the three volume edition o...
Name: tagged description, Length: 5198, dtype: object

In [5]:
books["tagged description"].to_csv("tagged_descriptions.txt", index=False, header=False)

In [6]:
raw_document = TextLoader("tagged_descriptions.txt", encoding="utf-8").load()

In [7]:
text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_document)

Created a chunk of size 2010, which is longer than the specified 1500
Created a chunk of size 1637, which is longer than the specified 1500
Created a chunk of size 2012, which is longer than the specified 1500
Created a chunk of size 2834, which is longer than the specified 1500
Created a chunk of size 2510, which is longer than the specified 1500
Created a chunk of size 1814, which is longer than the specified 1500
Created a chunk of size 1830, which is longer than the specified 1500
Created a chunk of size 1644, which is longer than the specified 1500
Created a chunk of size 1932, which is longer than the specified 1500
Created a chunk of size 2008, which is longer than the specified 1500
Created a chunk of size 2285, which is longer than the specified 1500
Created a chunk of size 1914, which is longer than the specified 1500
Created a chunk of size 2616, which is longer than the specified 1500
Created a chunk of size 1580, which is longer than the specified 1500
Created a chunk of s

In [10]:
documents[0]

Document(metadata={'source': 'tagged_descriptions.txt'}, page_content='"9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, 

In [None]:
embedding = MistralAIEmbeddings(
    model="mistral-embed",
    api_key= os.getenv("MISTRALAI_API_KEY")
)

db_books = Chroma.from_documents(documents, embedding=embedding) 

In [16]:
query = "A book about love"
docs = db_books.similarity_search(query, k=5)

In [17]:
docs

[Document(id='73849b9e-48ca-4f18-b17f-5d7445233ade', metadata={'source': 'tagged_descriptions.txt'}, page_content='"9780140442755 Set in the idyllic countryside outside Athens, the Phraedrusis a dialogue between the philosopher Socrates and his friend Phaedrus, inspired by their reading of a clumsy speech by the writer Lysias on the nature of love. Their conversation develops into a wide-ranging discussion on such subjects as the pursuit of beauty, the immortality of the soul and the attainment of truth, and ends with an in-depth consideration of the principles of rhetoric. Probably a work of Plato\'s maturity, the Phaedrusrepresents a high point in his achievement as a writer. This volume also contains two of his letters, which discuss his involvement in politics, in particular his role as adviser to Dionysius II of Syracuse, which are crucial documents for our understanding of Plato\'s life and career."\n"9780140443073 In 1818, when he was in his mid-thirties, Stendhal met and fell p

In [18]:
query = "A book about medicine and psychology"
docs = db_books.similarity_search(query, k=5)
docs

[Document(id='8a070593-8b36-4123-86ce-7cd98485cc97', metadata={'source': 'tagged_descriptions.txt'}, page_content='"9780520231511 ""Undertaker of the Mind is the most splendid piece of original research for many a year on the early history of British psychiatry. Brilliantly exploiting hitherto unused documentation, Andrews and Scull bring the once murky world of the eighteenth- century mad-doctor to life, and dispel many deeply embedded myths in the process. Absolutely essential reading!""--Roy Porter, author of The Creation of the Modern World ""This is a wonderfully well-written work... The authors reconstruct, in rich and convincing detail, the dilemmas faced by Monro, his patients, their families, and the broader culture when confronted with psychological distress.""--Joel Braslow, author of Mental Ills and Bodily Cures ""A telling reconstruction of the ideas and practice of probably the most famous psychiatrist in eighteenth-century Britain.... The analyses of Monro\'s more famous

In [19]:
# lets return the titles and authors using the isbn in the tagged descriptons

# extract the isbn from the returned docs and convert it to int ( remove the initial ")
isbn =  docs[0].page_content.split()[0].strip()
isbn = int(isbn[1:])

books[books["isbn13"] == isbn]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged description
2584,9780520231511,520231511,Undertaker of the Mind,Jonathan Andrews;Andrew T. Scull,Biography & Autobiography,http://books.google.com/books/content?id=_zYTn...,"""Undertaker of the Mind is the most splendid p...",2001.0,3.42,386.0,12.0,Undertaker of the Mind: John Monro and Mad-doc...,"9780520231511 ""Undertaker of the Mind is the m..."


In [13]:
# Recommendation System
def retrive_semantic_recommendation(query: str, top_k: int = 10) -> pd.DataFrame:
    results = []
    docs = db_books.similarity_search(query, top_k)
    for i in range(len(docs)):
        results += [int(docs[i].page_content.strip('"').split()[0])]
    
    return books[books["isbn13"].isin(results)].head(top_k) # stopped at 1:15 -- finished the theory

In [14]:
retrive_semantic_recommendation("A book about Friendship", 10)


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged description
71,9780007189953,7189958,Where Rainbows End,Cecelia Ahern,Friendship,http://books.google.com/books/content?id=PA7t6...,The new warm and absorbing story from the auth...,2004.0,3.94,454.0,642.0,Where Rainbows End,9780007189953 The new warm and absorbing story...
199,9780060734015,60734019,Bridge to Terabithia (rack),Katherine Paterson,Juvenile Fiction,http://books.google.com/books/content?id=LgLT0...,Jess Aarons' greatest ambition is to be the fa...,2004.0,3.99,191.0,1764.0,Bridge to Terabithia (rack),9780060734015 Jess Aarons' greatest ambition i...
215,9780060760441,60760443,The Reading Group,Elizabeth Noble,Fiction,http://books.google.com/books/content?id=IagWj...,The Reading Group follows the trials and tribu...,2005.0,3.34,429.0,6408.0,The Reading Group: A Novel,9780060760441 The Reading Group follows the tr...
368,9780061127762,61127760,Charlotte's Web Signature Edition,E. B. White,Juvenile Fiction,http://books.google.com/books/content?id=oi9BP...,This is the story of a little girl named Fern ...,2006.0,4.16,224.0,226.0,Charlotte's Web Signature Edition,9780061127762 This is the story of a little gi...
400,9780062700254,62700251,Bulfinch's Mythology,Richard P. Martin,Reference,http://books.google.com/books/content?id=eev4u...,A beautiful gift edition of Thomas Bulfinch's ...,1991.0,4.1,768.0,64.0,"Bulfinch's Mythology: The Age of the Fable, Th...",9780062700254 A beautiful gift edition of Thom...
673,9780140442755,140442758,Phaedrus,Plato,Philosophy,http://books.google.com/books/content?id=fNHF-...,"Set in the idyllic countryside outside Athens,...",1973.0,4.12,160.0,216.0,"Phaedrus: And, The Seventh and Eighth Letters",9780140442755 Set in the idyllic countryside o...
2927,9780618506910,618506918,Entre Amis,Michael Oates;Larbi Oukada,Foreign Language Study,http://books.google.com/books/content?id=M91wP...,Entre amis is a performance-oriented program d...,2005.0,3.52,528.0,21.0,Entre Amis: An Interactive Approach,9780618506910 Entre amis is a performance-orie...
3218,9780689869037,689869037,Invisible,Pete Hautman,Juvenile Fiction,http://books.google.com/books/content?id=-uZWP...,"You could say that my railroad, the Madham Lin...",2006.0,3.84,160.0,2269.0,Invisible,"9780689869037 You could say that my railroad, ..."
3348,9780743271325,743271327,Brokeback Mountain,Annie Proulx,Fiction,http://books.google.com/books/content?id=tO5Un...,The friendship between Ennis del Mar and Jack ...,2005.0,3.94,55.0,23424.0,Brokeback Mountain: Now a Major Motion Picture,9780743271325 The friendship between Ennis del...
3478,9780747574651,747574650,Old School,Tobias Wolff,Authors,http://books.google.com/books/content?id=qNrFw...,"It's 1960, in America, at a prestigious boys' ...",2005.0,3.82,195.0,7763.0,Old School,"9780747574651 It's 1960, in America, at a pres..."


In [11]:
books["categories"].value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Human-animal relationships,1
475,Imperialism,1
476,Aged women,1
477,Humorous stories,1


In [10]:
books["categories"].value_counts().reset_index().query("count > 50")

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Religion,117
6,Philosophy,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [20]:
books[books["categories"] == "Juvenile Fiction"]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged description
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006 Joe and his parents are enjoying...
79,9780020442608,0020442602,The voyage of the Dawn Treader,Clive Staples Lewis,Juvenile Fiction,http://books.google.com/books/content?id=fDD3C...,"The ""Dawn Treader"" is the first ship Narnia ha...",1970.0,4.09,216.0,2869.0,The voyage of the Dawn Treader,"9780020442608 The ""Dawn Treader"" is the first ..."
85,9780030547744,0030547741,Where the Red Fern Grows,Wilson Rawls,Juvenile Fiction,http://books.google.com/books/content?id=IHpRw...,A young boy living in the Ozarks achieves his ...,2000.0,4.37,288.0,95.0,Where the Red Fern Grows: The Story of Two Dog...,9780030547744 A young boy living in the Ozarks...
86,9780060000141,0060000147,Poppy's Return,Avi,Juvenile Fiction,http://books.google.com/books/content?id=XbcMJ...,"There's trouble at Gray House, the girlhood ho...",2006.0,3.99,256.0,1086.0,Poppy's Return,"9780060000141 There's trouble at Gray House, t..."
87,9780060001537,0060001534,Diary of a Spider,Doreen Cronin,Juvenile Fiction,http://books.google.com/books/content?id=UWvZo...,This is the diary ... of a spider. But don't b...,2005.0,4.25,40.0,7903.0,Diary of a Spider,9780060001537 This is the diary ... of a spide...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4846,9781590385814,1590385810,Fablehaven,Brandon Mull,Juvenile Fiction,http://books.google.com/books/content?id=tbVIP...,When Kendra and Seth go to stay at their grand...,2006.0,4.09,351.0,111896.0,Fablehaven,9781590385814 When Kendra and Seth go to stay ...
4931,9781596792500,1596792507,Sherlock Holmes and the Case of the Hound of t...,Arthur Conan Doyle;Malvina G. Vogel,Juvenile Fiction,http://books.google.com/books/content?id=EWgWP...,Sherlock Holmes and Dr. Watson travel to the b...,2005.0,4.51,237.0,28.0,Sherlock Holmes and the Case of the Hound of t...,9781596792500 Sherlock Holmes and Dr. Watson t...
4943,9781599900056,159990005X,The Drift House,Dale Peck,Juvenile Fiction,http://books.google.com/books/content?id=kbwPY...,Sent to stay with their uncle in a ship-like h...,2006.0,3.64,437.0,595.0,The Drift House: The First Voyage,9781599900056 Sent to stay with their uncle in...
5011,9781844580514,1844580512,Attack of the Jaguar,M. A. Harvey,Juvenile Fiction,http://books.google.com/books/content?id=3HUdt...,This training manual for operatives of Xtreme ...,2004.0,3.40,125.0,4.0,Attack of the Jaguar,9781844580514 This training manual for operati...


In [21]:
category_mapping = {
    "Fiction": "Fiction",
    "Juvenile Fiction": "Children's Fiction",
    "Biography & Autobiography": "Nonfiction",
    "History": "Nonfiction",
    "Literary Criticism": "Nonfiction",
    "Philosophy": "Nonfiction",
    "Religion": "Nonfiction",
    "Comics & Graphic Novels": "Fiction",
    "Drama": "Fiction",
    "Juvenile Nonfiction": "Childrens's Nonfiction",
    "Science": "Nonfiction",
    "Poetry": "Fiction"
}

In [22]:
books["simple_categories"] = books["categories"].map(category_mapping)

In [23]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5193,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...,
5194,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,
5195,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,
5196,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [24]:
books[~books["simple_categories"].isnull()]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079 Tricked once more by his wily ha...,Fiction
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006 Joe and his parents are enjoying...,Children's Fiction
46,9780007121014,0007121016,Taken at the Flood,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,Taken at the Flood,9780007121014 A Few Weeks After Marrying An At...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5179,9781933648279,1933648279,Night Has a Thousand Eyes,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,Night Has a Thousand Eyes,"9781933648279 ""Cornell Woolrich's novels defin...",Fiction
5189,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,Coin Locker Babies,9784770028969 Rescued from the lockers in whic...,Fiction
5190,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,"Cry, the Peacock",9788122200850 This book is the story of a youn...,Fiction
5196,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [None]:
# importing out text classfier using transformers
from transformers import pipeline
fiction_categories = ["Fiction", "Nonfiction"]
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]