In [1]:
import numpy as np
import pandas as pd

from gensim.models.doc2vec import TaggedDocument, Doc2Vec

from datetime import datetime

from IPython.core.display import HTML
import tba3102



# Notebook-wide presentation settings.
# Inject CSS that forces <pre> blocks to use `white-space: pre`.
pre_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(pre_css))

# Project helper that configures pandas display options; the column width is
# raised to 300 here.
tba3102.set_default_pandas_options(max_colwidth=300)

# Record when the run started (paired with the "ended" banner in the last cell).
started_at = datetime.now()
print('Text processing started at {}'.format(started_at))

Text processing started at 2023-04-01 20:27:16.602561


In [2]:
# Load the books table; the first CSV column is used as the row index.
books_csv_path = '../data/57000-books-with-cleaned-blurbs.csv'
df = pd.read_csv(books_csv_path, index_col=0)

# Array of titles, in row order, used later to look books up by title.
books_list = df.loc[:, 'Title'].array

# Leave the frame as the cell's last expression so it is rendered.
df

Unnamed: 0,ISBN,Title,Blurb,Cleaned_Blurb
0,0425176428,What If?: The World's Foremost Military Historians Imagine What Might Have Been,"Historians and inquisitive laymen alike love to ponder the dramatic what-its of history. In these twenty never-before-published essays, some of the keenest minds of our time ask the big, tantalizing questions:, Where might we be if history had not unfolded the way it did? , Why, how, and when wa...",historians inquisitive laymen alike love ponder dramatic whatits history twenty neverbeforepublished essays keenest minds time ask big tantalizing questions might history unfolded way fortune made real answers surprising sometimes frightening always entertaining
1,1841721522,New Vegetarian: Bold and Beautiful Recipes for Every Occasion,"Filled with fresh and eclectic recipes by Celia Brooks Brown -- one of the talented team of chefs at Books for Cooks, the world-famous bookshop-restaurant in London's Notting Hill -- New Vegetarian presents an innovative approach to vegetarian cooking. No longer the exclusive domain of vegetaria...",filled fresh eclectic recipes celia brooks brown one talented team chefs books cooks worldfamous bookshoprestaurant londons notting hill new vegetarian presents innovative approach vegetarian cooking longer exclusive domain vegetarians meatfree food appreciated bright assertive flavors marvelous...
2,0971880107,Wild Animus,"Newly graduated from college, Sam Altman is gripped by an inexplicable urge to lose himself in the wilderness and teams up with an enigmatic young woman who seems bent on helping him realize his dreams.",newly graduated college sam altman gripped inexplicable urge lose wilderness teams enigmatic young woman seems bent helping realize dreams
3,0345417623,Timeline,"In an Arizona desert, a man wanders in a daze, speaking words that make no sense. Within twenty-four hours he is dead, his body swiftly cremated by his only known associates. Halfway around the world, archaeologists make a shocking discovery at a medieval site. Suddenly they are swept off to the...",arizona desert man wanders daze speaking words make sense within twentyfour hours dead body swiftly cremated known associates halfway around world archaeologists make shocking discovery medieval site suddenly swept headquarters secretive multinational corporation developed astounding technology ...
4,0375759778,Prague : A Novel,"A novel of startling scope and ambition, , depicts an intentionally lost Lost Generation as it follows five American expats who come to Budapest in the early 1990s to seek their fortune. They harbor the vague suspicion that their counterparts in Prague have it better, but still they hope to find...",novel startling scope ambition depicts intentionally lost lost generation follows five american expats come budapest early 1990s seek fortune harbor vague suspicion counterparts prague better still hope find adventure inspiration gold rush history making
...,...,...,...,...
18154,0312749511,The Space Merchants,"In a vastly overpopulated near-future world, businesses have taken the place of governments and now hold all political power. States exist merely to ensure the survival of huge transnational corporations. Advertising has become hugely aggressive and boasts some of the world’s most powerful execu...",vastly overpopulated nearfuture world businesses taken place governments hold political power states exist merely ensure survival huge transnational corporations advertising become hugely aggressive boasts worlds powerful executivesthrough advertising public constantly deluded thinking products ...
18155,0671318470,Agent of Vega &amp; Other Stories,"Four years after dropping out of Harlem society, David McKay, a handsome young lawyer from a prominent Strivers' Row family, returns home, devastated by the news of his sister Lilian's suicide. What caused his once stable, gentle sister to take her own life? Why did she marry Jameson Sweet, givi...",four years dropping harlem society david mckay handsome young lawyer prominent strivers row family returns home devastated news sister lilians suicide caused stable gentle sister take life marry jameson sweet giving man barely knew claim family home caused flamboyant twin gem return harlem paris...
18156,0451458877,Tainted Trail,"Ukiah Oregon, half-man and half-alien raised by wolves, is hired to track a missing hiker. When her trail grows cold, Ukiah discovers the woman has actually been kidnapped. He then stumbles upon another mystery related to the unsolved disappearance in 1933 of a legendary wolf boy. The hiker's ab...",ukiah oregon halfman halfalien raised wolves hired track missing hiker trail grows cold ukiah discovers woman actually kidnapped stumbles upon another mystery related unsolved disappearance 1933 legendary wolf boy hikers abduction may hold key ukiahs past
18157,0399148736,Twelve Mile Limit,"On a Friday in early November, four people head out from the west coast of Florida to dive a deep-water wreck fifty-six miles offshore. Two days later, one of them is found alive, standing atop a 160-foot light tower in the Gulf of Mexico, naked and waving her wetsuit. But the other three appear...",friday early november four people head west coast florida dive deepwater wreck fiftysix miles offshore two days later one found alive standing atop 160foot light tower gulf mexico naked waving wetsuit three appear swept edge earth one diver fords friend janet mueller rumors beginwhispers everyth...


In [3]:
# The 'Cleaned_Blurb' column is the corpus the model is trained on.
norm_corpus = df.loc[:, 'Cleaned_Blurb'].array
corpus_size = len(norm_corpus)
print('Number of documents/books in corpus: ', corpus_size)

Number of documents/books in corpus:  18159


In [4]:
# Train the Doc2Vec model used for blurb-to-blurb similarity.
#
# `documents` keeps every cleaned blurb as ONE raw string, tagged with its
# positional row number.  It is deliberately left in that shape because the
# evaluation cell prints it and calls `.words.split()` on it.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(norm_corpus)]

# gensim obtains tokens by iterating over `TaggedDocument.words`, so handing
# it a raw string makes it learn *character* vectors instead of word vectors.
# Train on a whitespace-tokenised copy instead.  Non-string entries (e.g. a
# NaN produced by an empty CSV cell) become empty token lists so they keep
# their tag without crashing vocabulary building.
training_documents = [
    TaggedDocument(doc.words.split() if isinstance(doc.words, str) else [], doc.tags)
    for doc in documents
]

model = Doc2Vec(vector_size=100, window=10, min_count=5, workers=11, alpha=0.025, epochs=100)
model.build_vocab(training_documents)
model.train(training_documents, total_examples=model.corpus_count, epochs=model.epochs)

# Persist the trained model next to the data so it can be reloaded later.
model.save('../data/model_docsimilarity.doc2vec')
model

<gensim.models.doc2vec.Doc2Vec at 0x2313698b2b0>

In [5]:
# Test the model with 10 random documents/books.
#
# Books are sampled by *position* so that one integer consistently addresses
# `df` (through .iloc), `documents`, and the model's document tags (which were
# assigned with enumerate).  Re-locating a sampled row by its title would
# resolve duplicate titles to the first occurrence, and label-based .loc is
# only correct when the CSV index happens to be exactly 0..n-1.
sample_size = min(10, len(df))
random_positions = np.random.choice(len(df), size=sample_size, replace=False)
random_books = df.iloc[random_positions]
random_books_titles = random_books.Title.tolist()

titles = df['Title']
blurbs = df['Blurb']

for book_index in random_positions:

    book_index = int(book_index)
    print('Book ID:', book_index)
    print('Book:', titles.iloc[book_index])
    print('Blurb:', blurbs.iloc[book_index])
    print('Cleaned Blurb:', documents[book_index])
    print('-' * 80)

    # Accept either a raw string or an already tokenised list of words.
    raw_words = documents[book_index].words
    doc_words = raw_words.split() if isinstance(raw_words, str) else list(raw_words)

    # NOTE(review): epochs=10000 is kept from the original cell; it is very
    # high for inference and dominates this cell's runtime -- confirm it is
    # intended.
    doc_vec = model.infer_vector(doc_words, epochs=10000)

    # Request one extra neighbour so the query book itself can be dropped
    # from its own recommendations while still showing up to five results.
    neighbours = model.dv.most_similar(positive=[doc_vec], topn=6)
    results = [(tag, score) for tag, score in neighbours if tag != book_index][:5]

    print('Top 5 recommended Books:')

    for tag, score in results:

        # Tags are positional row numbers, hence .iloc rather than .loc.
        print(tag, score, titles.iloc[tag])

    print('=' * 80)

Book ID: 13147
Book: Armageddon (Left Behind #11)
Blurb: The scattered Tribulation Force is drawn inexorably toward the Middle East, as are all the armies of the world, when human history culminates in the battle of the ages. During the last year of the Great Tribulation, safe houses are no longer safe, and the cast of characters dramatically changes. By the time of the war of the great day of God the Almighty, the globe has become a powder keg of danger. Except those already in Petra, everyone has been forced to relocate as Antichrist ratchets up the pressure in the world's most treacherous game. Who will be left standing when the battle leaves the Tribulation Force on the brink of the end of time and the Glorious Appearing?
Cleaned Blurb: TaggedDocument(scattered tribulation force drawn inexorably toward middle east armies world human history culminates battle ages last year great tribulation safe houses longer safe cast characters dramatically changes time war great day god almighty

In [6]:
# Record when the run finished (pairs with the "started" banner in cell 1).
finished_at = datetime.now()
print('Text processing ended at {}'.format(finished_at))

Text processing ended at 2023-04-01 20:31:10.204054
