## Content-based recommendation

In [87]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from joblib import load
from joblib import dump

import spacy

#### 1. Data preprocessing and cleaning

In [88]:
df = pd.read_csv("../data/goodreads_data.csv")

In [89]:
# df.head()

In [90]:
df.duplicated().sum()

0

In [91]:
df.isnull().sum()

Unnamed: 0      0
Book            0
Author          0
Description    77
Genres          0
Avg_Rating      0
Num_Ratings     0
URL             0
dtype: int64

In [92]:
# 77 descriptions are nan
# Descriptions of the books are key components for the recommendation sys

df = df.dropna(axis=0)
df.isnull().sum()

Unnamed: 0     0
Book           0
Author         0
Description    0
Genres         0
Avg_Rating     0
Num_Ratings    0
URL            0
dtype: int64

In [93]:
df["Genres"].dtype

dtype('O')

In [94]:
df["Genres"][0]

"['Classics', 'Fiction', 'Historical Fiction', 'School', 'Literature', 'Young Adult', 'Historical']"

In [95]:
# Turn the stringified list "['A', 'B']" into the plain string "A, B".
# "[" and "]" are regex metacharacters and the default of `regex` differs
# between pandas versions, so request literal replacement explicitly.
# Note: an empty list "[]" becomes the empty string "", which pandas reads
# back as NaN after a CSV round trip.
df["Genres"] = (
    df["Genres"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
)

In [96]:
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical Fiction, School,...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...
1,1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"Fantasy, Fiction, Young Adult, Magic, Children...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...
2,2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","Classics, Fiction, Romance, Historical Fiction...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...
3,3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"Classics, Nonfiction, History, Biography, Memo...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...
4,4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"Classics, Fiction, Dystopia, Fantasy, Politics...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...


In [97]:
df = df.drop(labels="Unnamed: 0", axis=1)

In [98]:
df.head()

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical Fiction, School,...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"Fantasy, Fiction, Young Adult, Magic, Children...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","Classics, Fiction, Romance, Historical Fiction...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"Classics, Nonfiction, History, Biography, Memo...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"Classics, Fiction, Dystopia, Fantasy, Politics...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...


In [99]:
df.dtypes

Book            object
Author          object
Description     object
Genres          object
Avg_Rating     float64
Num_Ratings     object
URL             object
dtype: object

In [100]:
df["Num_Ratings"].dtype

dtype('O')

In [101]:
def format_ratings(rating_str):
    """Convert a ratings count such as ``"5,691,311"`` into an ``int``.

    Parameters
    ----------
    rating_str : str or int
        Count with optional comma thousands separators. Values that are
        already integers are accepted as well, so applying this function a
        second time to an already-converted column does not raise
        ``AttributeError``.

    Returns
    -------
    int
        The count with all commas removed.

    Raises
    ------
    ValueError
        If the value is not a whole number once the commas are removed.
    """
    # str() makes the function idempotent: int -> "123" -> 123.
    return int(str(rating_str).replace(",", ""))

df["Num_Ratings"]  = df["Num_Ratings"].apply(format_ratings)

In [102]:
df["Num_Ratings"].dtype

dtype('int64')

In [103]:
df.describe()

Unnamed: 0,Avg_Rating,Num_Ratings
count,9923.0,9923.0
mean,4.067502,93772.06
std,0.331937,343376.6
min,0.0,0.0
25%,3.88,561.5
50%,4.07,16170.0
75%,4.26,65226.5
max,5.0,9278135.0


In [104]:
df[df["Num_Ratings"]<1]

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
3747,Broken: The Failed Promise of Muslim Inclusion,Evelyn Alsultany,PROSE Award- Media and Cultural Studies Finali...,,0.0,0,https://www.goodreads.com/book/show/75268277-b...
6184,The Unknown She,Brooksie D. Thompson,"In Brooksie D. Thompson's The Unknown She, poe...",,0.0,0,https://www.goodreads.com/book/show/27640441-t...
7270,What Have We Done?,Zoe Moor,"“Your decisions, no matter how right or wrong,...",,0.0,0,https://www.goodreads.com/book/show/32708750-w...
9720,"I,Legal in the U.S.A. - a memoir",Alejandra Campos,Autobiographical account of a young Hispanic w...,,0.0,0,https://www.goodreads.com/book/show/18462053-i...
9793,About Love and Joy of Life: The Struggle for S...,Franz Josef Kaps,A touching and powerful story of love describi...,,0.0,0,https://www.goodreads.com/book/show/19127889-a...
9861,The Sense of a Deity,Cale Rainer,**Out of Print**Children have an intimate rela...,,0.0,0,https://www.goodreads.com/book/show/18128366-t...
9945,Ballochmyle,DIEL,A group of social outcasts have been forced to...,,0.0,0,https://www.goodreads.com/book/show/21000600-b...
9986,The Vision of the Evening and the Morning,John Magallan Lopez,This book is about the corrilation which exist...,,0.0,0,https://www.goodreads.com/book/show/22065143-t...


In [105]:
df = df.drop(labels=df[df["Num_Ratings"]<1].index, axis=0)
len(df[df["Num_Ratings"]<1])

0

In [106]:
df[df["Avg_Rating"]<1]

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL


In [107]:
df.describe()

Unnamed: 0,Avg_Rating,Num_Ratings
count,9915.0,9915.0
mean,4.070784,93847.72
std,0.311303,343504.8
min,1.64,1.0
25%,3.88,570.5
50%,4.07,16211.0
75%,4.26,65273.5
max,5.0,9278135.0


In [109]:
en_model = spacy.load("en_core_web_sm")

In [115]:
# Function to preprocess the description column
def process_description(desc, nlp_model=en_model):
    """Reduce a free-text description to its content-bearing lemmas.

    The text is run through ``nlp_model``; every token flagged as a stop
    word or as punctuation is discarded, and the lemmas of the remaining
    tokens are returned joined by single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp_model(desc)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

# Testing this function
process_description(desc="Hi! My gaming name is Scythe! I won't tell you my real name")

'hi gaming Scythe will tell real'

In [116]:
# Preprocessing the descriptions of books
df["processed_desc"] = df["Description"].apply(process_description)

In [117]:
df.head()

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL,processed_desc
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical Fiction, School,...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...,unforgettable novel childhood sleepy southern ...
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"Fantasy, Fiction, Young Adult, Magic, Children...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...,Harry Potter think ordinary boy rescue owl tak...
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","Classics, Fiction, Romance, Historical Fiction...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...,immediate success 1813 Pride Prejudice remain ...
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"Classics, Nonfiction, History, Biography, Memo...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...,discover attic spend year life Anne Frank rema...
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"Classics, Fiction, Dystopia, Fantasy, Politics...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...,Librarian note Alternate Cover Edition edition...


In [118]:
df.to_csv("../data/processed.csv", index=False)

In [134]:
df = pd.read_csv("../data/processed.csv")
df.isnull().sum()

Book                0
Author              0
Description         0
Genres            915
Avg_Rating          0
Num_Ratings         0
URL                 0
processed_desc      1
dtype: int64

In [None]:
# Why nulls show up here: the frame had no nulls before it was saved, but
# empty strings are written to CSV as empty fields and pd.read_csv parses
# empty fields as NaN.
#   - Genres: presumably books whose genre list was "[]", which became ""
#     once the brackets were stripped.
#   - processed_desc: presumably a description that was left empty after
#     stop words and punctuation were removed.
# The affected rows are dropped.
# NOTE(review): this removes ~900 books only because their genre is empty,
# although genres are not used by the similarity model below -- consider
# filling Genres with "" instead of dropping.
df = df.dropna(axis=0)

In [136]:
df.isnull().sum()

Book              0
Author            0
Description       0
Genres            0
Avg_Rating        0
Num_Ratings       0
URL               0
processed_desc    0
dtype: int64

In [137]:
df.to_csv("../data/processed.csv", index=False)

#### 2. Creating vocabulary for descriptions of books

In [139]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from joblib import load
from joblib import dump

import spacy

In [140]:
df = pd.read_csv("../data/processed.csv")

In [142]:
p_desc = df["processed_desc"]

# Learn the vocabulary and build the document-term count matrix in one pass.
vec = CountVectorizer()
p_desc_vec = vec.fit_transform(p_desc)
p_desc_vec

<8999x57878 sparse matrix of type '<class 'numpy.int64'>'
	with 632826 stored elements in Compressed Sparse Row format>

In [144]:
# This is what the bag of words of the vocabulary looks like
vec.get_feature_names_out()[2000:2050]

array(['albertine', 'alberto', 'albie', 'albino', 'albinoni', 'albinus',
       'albolabris', 'albom', 'albrand', 'album', 'albuquerque', 'albus',
       'alcaide', 'alcan', 'alcance', 'alcanza', 'alcança', 'alcasan',
       'alcatraz', 'alceste', 'alchemical', 'alchemist', 'alchemists',
       'alchemy', 'alchemyinspired', 'alchemyst', 'alcibiadas', 'alcina',
       'alcoa', 'alcohol', 'alcoholic', 'alcoholics', 'alcoholism',
       'alcohólico', 'alcorn', 'alcott', 'alcunha', 'aldbourne', 'alden',
       'aldens', 'alderley', 'alderman', 'alderson', 'aldiss', 'aldous',
       'aldrich', 'aldridge', 'aldrig', 'aldur', 'aldırmadan'],
      dtype=object)

#### 3. Using the concept of cosine similarity for a recommendation system

In [145]:
# We have the sparse matrix
# Let's calculate the cosine sim of each vec with other vec

all_sim = cosine_similarity(p_desc_vec)

In [None]:
# This is similarity of first  with every other description
all_sim[0] 

array([1.        , 0.02152654, 0.06755523, ..., 0.09928551, 0.06942101,
       0.03958844])

In [162]:
threshold = 0.2
# Positions of every description whose similarity to the first description
# exceeds the threshold.
sim_index = [ind for ind, sim in enumerate(all_sim[0]) if sim > threshold]

# Position 0 is the first book itself (similarity 1.0), so start at 1.
sim_index[1:7]

[52, 142, 439, 451, 896, 1420]

In [163]:
df["Description"][0]

'The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, "To Kill A Mockingbird" takes readers to the roots of human behavior - to innocence and experience, kindness and cruelty, love and hatred, humor and pathos. Now with over 18 million copies in print and translated into forty languages, this regional story by a young Alabama woman claims universal appeal. Harper Lee always considered her book to be a simple love story. Today it is regarded as a masterpiece of American literature.'

In [164]:
df["Description"][1420]

'From Harper Lee comes a landmark new novel set two decades after her beloved Pulitzer Prize-winning masterpiece, To Kill a Mockingbird. Maycomb, Alabama. Twenty-six-year-old Jean Louise Finch—"Scout"—returns home from New York City to visit her aging father, Atticus. Set against the backdrop of the civil rights tensions and political turmoil that were transforming the South, Jean Louise\'s homecoming turns bittersweet when she learns disturbing truths about her close-knit family, the town and the people dearest to her. Memories from her childhood flood back, and her values and assumptions are thrown into doubt. Featuring many of the iconic characters from To Kill a Mockingbird, Go Set a Watchman perfectly captures a young woman, and a world, in a painful yet necessary transition out of the illusions of the past—a journey that can be guided only by one\'s conscience. Written in the mid-1950s, Go Set a Watchman imparts a fuller, richer understanding and appreciation of Harper Lee. Here 

**Both books are about American literature and culture — the second is Harper Lee's follow-up to the first.**

In [214]:
# Copy paste of same function we made for preprocessing
def process_description(desc, nlp_model=en_model):
    """Reduce a free-text description to its content-bearing lemmas.

    The text is run through ``nlp_model``; tokens flagged as stop words or
    punctuation are dropped and the lemmas of the remaining tokens are
    joined with single spaces.
    """
    kept_lemmas = (
        tok.lemma_
        for tok in nlp_model(desc)
        if not (tok.is_stop or tok.is_punct)
    )
    return " ".join(kept_lemmas)


# Function for matching our description with the description present in data

def similar_description(desc, nlp_model=en_model, 
                        vectorizer=vec, desc_sparse=p_desc_vec):
    """Find stored descriptions that are similar to a free-text query.

    The query is preprocessed with ``process_description``, vectorised with
    ``vectorizer`` and compared against every row of ``desc_sparse`` using
    cosine similarity. Thresholds 0.5, 0.4, 0.3, 0.2 and 0.1 are tried in
    that order; the matches for the first threshold that yields any are
    returned.

    Parameters
    ----------
    desc : str
        Query text.
    nlp_model
        Pipeline passed on to ``process_description``.
    vectorizer
        Fitted vectoriser exposing ``transform``.
    desc_sparse
        Matrix with one vectorised description per row.

    Returns
    -------
    list of int
        Row positions in ``desc_sparse`` whose similarity ``s`` satisfies
        ``thresh <= s < 1.0`` (rows with similarity 1.0 or above are
        excluded). An empty list is returned when no threshold produces a
        match, instead of an implicit ``None``.
    """
    # Preprocessing and vectorising do not depend on the threshold, so they
    # are done once instead of once per threshold.
    ptext = process_description(desc=desc, nlp_model=nlp_model)
    ptext_vec = vectorizer.transform([ptext])

    # A single call gives the similarity to every stored description;
    # the result has shape (1, n_rows), hence the [0].
    sims = cosine_similarity(ptext_vec, desc_sparse)[0]

    for thresh in (0.5, 0.4, 0.3, 0.2, 0.1):
        matched_inds = [
            ind for ind, sim in enumerate(sims) if thresh <= sim < 1.0
        ]
        if matched_inds:
            return matched_inds

    return []

In [215]:
similar_description(desc="Wars of the acient times fought very fiercely")

[1198, 3717, 5277, 5834, 6409, 8301]

In [216]:
def collect_data(matched_inds, df=df):
    """Collect display information for the books at the given row positions.

    Parameters
    ----------
    matched_inds : iterable of int
        Positional (0-based) row numbers in ``df``.
    df : pandas.DataFrame
        Frame with the columns ``Book``, ``Description``, ``Author``,
        ``Genres``, ``Avg_Rating`` and ``URL``.

    Returns
    -------
    dict
        Maps each book title to a dict with the keys ``description``,
        ``author``, ``genres``, ``avg_rating`` and ``url``. Because the
        title is the key, a later row with the same title replaces an
        earlier one.
    """
    field_to_column = {
        "description": "Description",
        "author": "Author",
        "genres": "Genres",
        "avg_rating": "Avg_Rating",
        "url": "URL",
    }

    res_dict = {}
    for ind in matched_inds:
        # Every field, including the URL, is read by position (.iloc) so the
        # lookup stays correct when the frame's index labels are not 0..n-1.
        res_dict[df["Book"].iloc[ind]] = {
            field: df[column].iloc[ind]
            for field, column in field_to_column.items()
        }

    return res_dict

In [217]:
res = collect_data(matched_inds=[1198, 3717, 5277, 5834, 6409, 8301])
# res

first_key = list(res.keys())[0]
res[first_key]

{'description': 'In 1936 George Orwell travelled to Spain to report on the Civil War and instead joined the fight against the Fascists. This famous account describes the war and Orwell’s own experiences. Introduction by Lionel Trilling.',
 'author': 'George Orwell',
 'genres': 'History, Nonfiction, Politics, Classics, War, Memoir, Biography',
 'avg_rating': 4.09,
 'url': 'https://www.goodreads.com/book/show/9646.Homage_to_Catalonia'}

In [212]:
# Persist the fitted components so they can be reloaded later without
# re-running the preprocessing and fitting steps.
dump(value=vec, filename="../saved_components/vectorizer.pickle")  # fitted CountVectorizer
dump(value=p_desc_vec, filename="../saved_components/pdesc_sparse.pickle")  # sparse document-term matrix
# NOTE(review): this pickles the whole spaCy pipeline object; reloading it by
# name with spacy.load("en_core_web_sm") may be more robust -- confirm.
dump(value=en_model, filename="../saved_components/en_model_sm.pickle")
# NOTE(review): all_sim is the full pairwise similarity matrix (one row and
# one column per book) and is not reloaded by the loading cell that follows
# -- confirm it needs to be saved at all.
dump(value=all_sim, filename="../saved_components/all_sim.pickle")

['../saved_components/all_sim.pickle']

#### 4. Loading the components and using them

In [227]:
# Loading the components
loaded_vec = load(filename="../saved_components/vectorizer.pickle")
loaded_sparse = load(filename="../saved_components/pdesc_sparse.pickle")
loaded_nlp_en = load(filename="../saved_components/en_model_sm.pickle")

In [218]:
# Copy paste of same function we made for preprocessing and for recommendation
def process_description(desc, nlp_model=loaded_nlp_en):
    """Reduce a free-text description to its content-bearing lemmas.

    The text is run through ``nlp_model`` (by default the pipeline loaded
    from disk); tokens flagged as stop words or punctuation are dropped and
    the lemmas of the remaining tokens are joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp_model(desc)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

def similar_description(desc, nlp_model=loaded_nlp_en, 
                        vectorizer=loaded_vec, desc_sparse=loaded_sparse):
    """Find stored descriptions that are similar to a free-text query.

    The query is preprocessed with ``process_description``, vectorised with
    ``vectorizer`` and compared against every row of ``desc_sparse`` using
    cosine similarity. Thresholds 0.5, 0.4, 0.3, 0.2 and 0.1 are tried in
    that order and a progress line is printed for each threshold tried; the
    matches for the first threshold that yields any are returned.

    Parameters
    ----------
    desc : str
        Query text.
    nlp_model
        Pipeline passed on to ``process_description``.
    vectorizer
        Fitted vectoriser exposing ``transform``.
    desc_sparse
        Matrix with one vectorised description per row.

    Returns
    -------
    list of int
        Row positions in ``desc_sparse`` whose similarity ``s`` satisfies
        ``thresh <= s < 1.0`` (rows with similarity 1.0 or above are
        excluded). An empty list is returned when no threshold produces a
        match, instead of an implicit ``None``.
    """
    # Preprocessing and vectorising do not depend on the threshold, so they
    # are done once instead of once per threshold.
    ptext = process_description(desc=desc, nlp_model=nlp_model)
    ptext_vec = vectorizer.transform([ptext])

    # A single call gives the similarity to every stored description;
    # the result has shape (1, n_rows), hence the [0].
    sims = cosine_similarity(ptext_vec, desc_sparse)[0]

    for thresh in (0.5, 0.4, 0.3, 0.2, 0.1):
        print(f"Searching in thresh range - [{thresh} - 1.00]")

        matched_inds = [
            ind for ind, sim in enumerate(sims) if thresh <= sim < 1.0
        ]
        if matched_inds:
            return matched_inds

    return []

In [223]:
# Testing the function with loaded components
similar_description(desc="Samurais fighting with each other in war to take revenge")

Searching in thresh range - [0.5 - 1.00]
Searching in thresh range - [0.4 - 1.00]
Searching in thresh range - [0.3 - 1.00]


[593, 971, 1198, 1665, 3717, 4103, 4761, 5834, 6409, 6617, 8301, 8397]

In [224]:
def collect_data(matched_inds, df=df):
    """Collect display information for the books at the given row positions.

    Parameters
    ----------
    matched_inds : iterable of int
        Positional (0-based) row numbers in ``df``.
    df : pandas.DataFrame
        Frame with the columns ``Book``, ``Description``, ``Author``,
        ``Genres``, ``Avg_Rating`` and ``URL``.

    Returns
    -------
    dict
        Maps each book title to a dict with the keys ``description``,
        ``author``, ``genres``, ``avg_rating`` and ``url``. Because the
        title is the key, a later row with the same title replaces an
        earlier one.
    """
    field_to_column = {
        "description": "Description",
        "author": "Author",
        "genres": "Genres",
        "avg_rating": "Avg_Rating",
        "url": "URL",
    }

    res_dict = {}
    for ind in matched_inds:
        # Every field, including the URL, is read by position (.iloc) so the
        # lookup stays correct when the frame's index labels are not 0..n-1.
        res_dict[df["Book"].iloc[ind]] = {
            field: df[column].iloc[ind]
            for field, column in field_to_column.items()
        }

    return res_dict