In [0]:
# Using pandarallel help parallelize Dataframe function
# Divide processing load into multiple cores
!pip install requests pandarallel

In [14]:
# Import libraries
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandarallel import pandarallel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK "stopwords" corpus so stopwords.words(...) can be used later
nltk.download("stopwords")

# Display matplotlib figures inline in the notebook output
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Configure pandarallel: spread the work over NB_WORKERS worker processes and
# exchange data between the main process and the workers through the
# in-memory file system.
NB_WORKERS = 8
pandarallel.initialize(
    nb_workers=NB_WORKERS,
    use_memory_fs=True,
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Step 01: Read data
# Load the books spreadsheet from the mounted Drive folder. The "isbn" column
# is read as text rather than as a number. The builtin `str` is used because
# the `np.str` alias was deprecated in NumPy 1.20 and removed in later
# releases, where it raises AttributeError.
df = pd.read_excel(
    "/content/drive/My Drive/EHTP/AI/ML/project/books.xlsx",
    dtype={"isbn": str},
).reset_index(drop=True)
print(df.shape)
df.head()

(94528, 8)


Unnamed: 0,isbn,title,publisher,pages,raters,avg_ratings,categories,authors
0,0001360469,Babe Dressing,Harpercollins Publishers,10.0,1,5,Babe (Fictitious Character),Mandy Stanley
1,0001374362,When It'S Time For Bed,Collins,14.0,1,5,Animals,Nick Butterworth
2,0001714236,Spooky Riddles,Harpercollins Publishers,42.0,1,0,Readers,Marc Brown
3,0001821504,The Adventures Of Paddington,Harpercollins Publishers,253.0,1,0,Bears,Michael Bond
4,000184251X,February'S Road,Harpercollins Publishers,192.0,2,0,Children'S Stories | English,John Verney


In [10]:
# Minimum number of raters a book needs to be kept: the 70th percentile of the
# "raters" column.
rate_treshold = df["raters"].quantile(0.70)

# Keep only the books at or above that threshold, as an independent copy so
# later column edits on q_books never touch df.
has_enough_raters = df["raters"] >= rate_treshold
q_books = df.loc[has_enough_raters].copy()
q_books.shape

(31303, 8)

In [0]:
# Columns whose text is combined to describe a book for content-based filtering
features = [
    "publisher",
    "categories",
    "authors",
]

# Start with an empty "features" column; it is filled with the joined text later
q_books["features"] = ""

In [12]:
# Build one text value per book: the non-missing feature values joined with
# spaces, with the "|" separator characters stripped out.
feature_text = q_books[features].parallel_apply(
    lambda row: " ".join(row.dropna()).replace("|", ""),
    axis=1,
)
q_books["features"] = feature_text.fillna("")

# Remove the columns that are no longer needed now that "features" exists
q_books.drop(
    columns=["publisher", "pages", "raters", "avg_ratings", "categories", "authors"],
    inplace=True,
)
q_books.head()

Unnamed: 0,isbn,title,features
29,0002251760,The Forgetting Room,Harpercollins Fiction Nick Bantock
39,0002256886,Araby,Harpercollins Publishers Domestic Fiction Gret...
43,0002258366,Prospero'S Children,Voyager Atlantis (Legendary Place) Jan Siegel
44,000225851X,Godless In Eden,Flamingo Essays Fay Weldon
45,0002258560,Is Shane Macgowan Still Alive?,Flamingo Humor Tim Bradford


In [0]:
# Words CountVectorizer should ignore: the NLTK English stop words followed by
# the NLTK French stop words.
stopwords_list = [
    word
    for language in ("english", "french")
    for word in stopwords.words(language)
]

In [16]:
# Create the CountVectorizer, skipping the English/French stop words
# collected in stopwords_list (e.g. 'the', 'a', 'et').
count = CountVectorizer(stop_words=stopwords_list)

# Fit the vocabulary on the "features" text and build the token-count matrix
# (one row per book, one column per vocabulary term).
count_matrix = count.fit_transform(q_books["features"])

# Show the shape of the count matrix: (number of books, vocabulary size)
count_matrix.shape

(31303, 15255)

In [0]:
# Cosine similarity between every pair of rows of count_matrix. When only one
# matrix is given, scikit-learn compares it against itself, which is the same
# as passing count_matrix twice.
# NOTE(review): the result has one row and one column per book; for a large
# q_books this square matrix can need a lot of memory - confirm it fits.
cosine_sim = cosine_similarity(count_matrix)

In [18]:
# Peek at one random row of the full catalogue. The fixed random_state makes
# the same row come back on every Restart & Run All.
df.sample(n=1, random_state=42)

Unnamed: 0,isbn,title,publisher,pages,raters,avg_ratings,categories,authors
34217,451401948,Slice,New Amer Library (Mm),317.0,2,0,Fiction,Rex Miller
