In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model folders read later in this notebook live under that path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from random import randrange

# Preprocessing and Feature Engineering

In [None]:
import string
from string import digits
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
import re
from nltk.stem.porter import PorterStemmer

def clean_name(txt):
    """Normalise a title or author string before it is vectorised.

    Steps, in order: lowercase, drop digits, strip HTML tags, turn "/" into a
    space, remove ASCII punctuation (quotes included), remove English stop
    words, and collapse whitespace.

    Parameters
    ----------
    txt : object
        Value to clean; it is converted with ``str()``. ``None`` and float NaN
        (what pandas stores for an empty cell) are treated as missing.

    Returns
    -------
    str
        Cleaned text with single spaces between tokens and no leading or
        trailing whitespace; ``''`` for a missing value.
    """
    # a missing cell would otherwise be stringified into the literal token "nan"
    # (NaN is the only float that is not equal to itself)
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''

    # lowercase
    txt = str(txt).lower()
    # remove digits
    txt = txt.translate(str.maketrans('', '', digits))
    # HTML tags
    txt = re.sub(r'<.*?>', '', txt)
    # replace "/" with space so the words on either side stay separate tokens
    txt = txt.replace('/', ' ')
    # remove punctuation; string.punctuation already contains both quote characters
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    # remove stopwords; split() without an argument splits on any whitespace run,
    # so tabs/newlines cannot shield a stop word and the result is already trimmed
    return ' '.join(word for word in txt.split() if word not in stopwords)

def clean_tag(txt):
    """Normalise a tag name and reduce its words to Porter stems.

    Steps, in order: lowercase, turn "-" and "/" into spaces, strip HTML tags,
    remove ASCII punctuation (quotes included), remove English stop words,
    stem every remaining word, and collapse whitespace.

    Parameters
    ----------
    txt : object
        Value to clean; it is converted with ``str()``. ``None`` and float NaN
        (what pandas stores for an empty cell) are treated as missing.

    Returns
    -------
    str
        Space-separated stems with no leading or trailing whitespace; ``''``
        for a missing value.
    """
    # a missing cell would otherwise be stringified into the literal tag "nan"
    # (NaN is the only float that is not equal to itself)
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''

    # lowercase
    txt = str(txt).lower()
    # replace hyphens with space
    txt = txt.replace('-', ' ')
    # HTML tags
    txt = re.sub(r'<.*?>', '', txt)
    # replace "/" with space
    txt = txt.replace('/', ' ')
    # remove punctuation; string.punctuation already contains both quote characters
    txt = txt.translate(str.maketrans('', '', string.punctuation))

    # this function is applied once per row, so build the stemmer on first use
    # and keep it on the function object instead of re-creating it every call
    stemmer = getattr(clean_tag, '_stemmer', None)
    if stemmer is None:
        stemmer = clean_tag._stemmer = PorterStemmer()

    # drop stop words first, then stem what is left; split() handles any whitespace run
    return ' '.join(stemmer.stem(word) for word in txt.split() if word not in stopwords)

In [None]:
folder = "/content/drive/MyDrive/COMP9900 Project/Machine Learning/Dataset/Books/"

# load datasets
# on_bad_lines='skip' is the replacement for error_bad_lines=False, which pandas
# deprecated in 1.3 and removed in 2.0; malformed rows are still skipped silently
books = pd.read_csv(folder + "books_cleaned.csv", sep=",", on_bad_lines='skip', engine='python')
tags = pd.read_csv(folder + "tags.csv", sep=",", on_bad_lines='skip', engine='python')
book_tags = pd.read_csv(folder + "book_tags.csv", sep=",", on_bad_lines='skip', engine='python')
# rename the column name in book tags dataset to make it consistent to the other datasets
book_tags = book_tags.rename(columns={"goodreads_book_id": "book_id"})

# take the important features from the books dataset
# (original title is kept for evaluation purposes); .copy() so the new columns
# below are added to an independent frame rather than a slice of `books`
df_train = books[['book_id','authors','original_publication_year','language_code', 'original_title']].copy()

# preprocess title/authors once per book, before the tag join repeats every
# book row once per tag; the per-row values after the join are the same
df_train['cleaned_title'] = df_train['original_title'].apply(clean_name)
df_train['cleaned_authors'] = df_train['authors'].apply(clean_name)

# join the book tags to the training dataframe
df_train = df_train.join(book_tags.set_index('book_id'), on='book_id')
df_train = df_train.join(tags.set_index('tag_id'), on='tag_id')

# preprocess the tag names (one row per book/tag pair at this point)
df_train['cleaned_tag'] = df_train['tag_name'].apply(clean_tag)

# rollup multiple rows, concatenating the tag names
# NOTE: groupby drops rows whose key columns contain NaN (pandas default
# dropna=True), so books missing year, language or original title are excluded
group_cols = ['book_id','cleaned_title', 'cleaned_authors','original_publication_year','language_code', 'original_title']
df_train = df_train.groupby(group_cols)['cleaned_tag'].apply(', '.join).reset_index()




  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Preview the first rows of the rolled-up frame (one row per book, tags joined with ", ").
df_train.head()

Unnamed: 0,book_id,cleaned_title,cleaned_authors,original_publication_year,language_code,original_title,cleaned_tag
0,1,harry potter halfblood prince,jk rowling mary grandpré,2005.0,eng,Harry Potter and the Half-Blood Prince,"read, fantasi, favorit, current read, young ad..."
1,2,harry potter order phoenix,jk rowling mary grandpré,2003.0,eng,Harry Potter and the Order of the Phoenix,"read, current read, fantasi, favorit, children..."
2,3,harry potter philosophers stone,jk rowling mary grandpré,1997.0,eng,Harry Potter and the Philosopher's Stone,"read, favorit, fantasi, current read, young ad..."
3,5,harry potter prisoner azkaban,jk rowling mary grandpré rufus beck,1999.0,eng,Harry Potter and the Prisoner of Azkaban,"favorit, fantasi, current read, young adult, r..."
4,6,harry potter goblet,jk rowling mary grandpré,2000.0,eng,Harry Potter and the Goblet of Fire,"fantasi, young adult, fiction, harri potter, o..."


In [None]:
# Save the per-book frame to the current working directory; index=False keeps the row index out of the file.
df_train.to_csv('content_based_filtering_df.csv', index=False)

# Content Based Filtering - Generate Matrices

In [None]:
# combine the text features
def concat_texts(df):
    """Build the single text string that is vectorised for one book.

    Parameters
    ----------
    df : mapping or pandas.Series
        One row (this function is applied with ``axis=1``) providing
        ``cleaned_title``, ``cleaned_authors``, ``original_publication_year``,
        ``language_code`` and ``cleaned_tag``.

    Returns
    -------
    str
        The five fields joined with ``' | '``. The year is written as an
        integer; a missing year (``None``/NaN) becomes an empty field instead
        of raising in ``int()``.
    """
    year = df['original_publication_year']
    # NaN is the only value that is not equal to itself
    year_text = '' if year is None or year != year else str(int(year))

    fields = [
        str(df['cleaned_title']),
        str(df['cleaned_authors']),
        year_text,
        str(df['language_code']),
        str(df['cleaned_tag']),
    ]
    return ' | '.join(fields)

# one text document per book, built from title, authors, year, language and tags
df_train['combined_features'] = df_train.apply(concat_texts, axis=1)

# use count and tfidf vectorizers, and compare how the models perform
count_vec = CountVectorizer().fit_transform(df_train['combined_features'])
tfidf_vec = TfidfVectorizer().fit_transform(df_train['combined_features'])

# get the similarity matrices, with combination of vectorizers (count vectorizer, TF-IDF vectorizer) and kernels (cosine, linear)
# each result is a dense (n_books, n_books) array
# NOTE(review): TfidfVectorizer L2-normalises rows by default, so lk_tfidf_vec
# should match cs_tfidf_vec up to float error -- confirm before keeping both
cs_count_vec = cosine_similarity(count_vec, count_vec)
lk_count_vec = linear_kernel(count_vec, count_vec)
cs_tfidf_vec = cosine_similarity(tfidf_vec, tfidf_vec)
lk_tfidf_vec = linear_kernel(tfidf_vec, tfidf_vec)

# include indexing for quick query: book_id -> row position in df_train / the matrices
indices = pd.Series(df_train.index, index=df_train['book_id'])
# drop duplicates on the book_id labels (keep the first row); Series.drop_duplicates()
# would compare the values, i.e. the row positions, which are already unique
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
import pickle
# save to pickle: the four similarity matrices and the book_id lookup are each
# written to their own file in the current working directory
artifacts = {
    'cs_count_vec.pickle': cs_count_vec,
    'lk_count_vec.pickle': lk_count_vec,
    'cs_tfidf_vec.pickle': cs_tfidf_vec,
    'lk_tfidf_vec.pickle': lk_tfidf_vec,
    'book_indices.pickle': indices,
}

for filename, obj in artifacts.items():
  with open(filename, 'wb') as handle:
    pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Generate Recommendations

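Run from here to generate recommendations from the saved model files; the cells above only need to be run when the model files have to be rebuilt.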

In [None]:
import numpy as np 
import pandas as pd 

from random import randrange

import pickle

In [None]:
# load saved files

folder = "/content/drive/MyDrive/COMP9900 Project/Machine Learning/Model Files/"

def _load_pickle(filename):
    """Unpickle ``folder + filename`` and return the stored object."""
    # NOTE: pickle.load can execute arbitrary code -- only load files you created yourself
    with open(folder + filename, 'rb') as handle:
        return pickle.load(handle)

cs_count_vec = _load_pickle('cs_count_vec.pickle')
lk_count_vec = _load_pickle('lk_count_vec.pickle')
cs_tfidf_vec = _load_pickle('cs_tfidf_vec.pickle')
lk_tfidf_vec = _load_pickle('lk_tfidf_vec.pickle')
indices = _load_pickle('book_indices.pickle')

# get_content_based_recommendations reads the titles from df_train, which only
# exists if the preprocessing cells were run in this kernel. When starting from
# this section on a fresh kernel, reload the frame saved by df_train.to_csv.
# NOTE(review): assumes content_based_filtering_df.csv was copied into the model
# folder together with the pickles -- confirm the location
if 'df_train' not in globals():
    df_train = pd.read_csv(folder + 'content_based_filtering_df.csv')

In [None]:
def get_content_based_recommendations(book_id, sim_matrix, num_recommendation = 10, variation_size = 0.2):
    """Return the titles of the books most similar to ``book_id``, with some random variety.

    The result consists of the best-ranked books followed by a few "variation"
    picks drawn at random from the ranks just below them.

    Parameters
    ----------
    book_id : hashable
        Key looked up in the module-level ``indices`` series to find the book's
        row in ``sim_matrix``. An unknown id raises ``KeyError``.
    sim_matrix : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of book ``i``
        against every book.
    num_recommendation : int, default 10
        Number of titles to return (fewer if not enough other books exist).
    variation_size : float, default 0.2
        Fraction of the list filled with random picks:
        ``int(num_recommendation * variation_size)`` titles. Variation is only
        applied when ``num_recommendation > 5``; shorter lists are the plain
        top-n.

    Returns
    -------
    pandas.Series
        ``original_title`` values from the module-level ``df_train``, best
        matches first, variation picks last.
    """
    from random import sample  # local import keeps this cell self-contained

    # get the index of the book that matches the book_id
    idx = indices[book_id]

    # rank every *other* book by similarity, best first. The query book is
    # removed explicitly: with unnormalised kernels or tied scores it is not
    # guaranteed to be at rank 0, so "skip the first entry" can drop a real
    # match and recommend the book itself.
    ranked = sorted(
        (pair for pair in enumerate(sim_matrix[idx]) if pair[0] != idx),
        key=lambda el: el[1],
        reverse=True,
    )

    # split the requested size into top picks and random variation picks
    num_recommendation = max(int(num_recommendation), 0)
    num_of_variation = 0
    if num_recommendation > 5:
        num_of_variation = min(max(int(num_recommendation * variation_size), 0), num_recommendation)
    num_top = num_recommendation - num_of_variation

    # slicing never runs past the end of the ranking, unlike direct indexing
    chosen = ranked[:num_top]

    # variation picks come, without repeats, from a window five times their
    # number located directly below the top picks
    window = ranked[num_top:num_top + num_of_variation * 5]
    chosen += sample(window, min(num_of_variation, len(window)))

    # get the book indices
    book_idx = [int(i) for i, _ in chosen]

    # return the top n most similar books + some variations
    return df_train['original_title'].iloc[book_idx]

In [None]:
# using count vectorizer + cosine similarity
# book_id 3, 10 titles requested, variation_size 0.2 -> int(10 * 0.2) = 2 random variation picks
get_content_based_recommendations(3, cs_count_vec, 10, 0.2)

[1, 2, 3, 4, 5, 6, 7, 8, 16, 17]


1076      Harry Potter and the Chamber of Secrets
3        Harry Potter and the Prisoner of Azkaban
4             Harry Potter and the Goblet of Fire
0          Harry Potter and the Half-Blood Prince
3307         Harry Potter and the Deathly Hallows
1       Harry Potter and the Order of the Phoenix
464                          Howl's Moving Castle
5299                 The Tales of Beedle the Bard
4176       Gregor and the Curse of the Warmbloods
2014      Fantastic Beasts and Where to Find Them
Name: original_title, dtype: object

In [None]:
# using count vectorizer + linear kernel
# same query (book_id 3, 10 titles, variation_size 0.2) against the linear-kernel matrix of the count vectors
get_content_based_recommendations(3, lk_count_vec, 10, 0.2)

[1, 2, 3, 4, 5, 6, 7, 8, 10, 18]


4179                                     Just for You
4761                              Bedtime for Frances
3804                          Just a Mess (Look-Look)
3805                   Just Me and My Dad (Look-Look)
3895                            A Pocket for Corduroy
4149       Lyle, Lyle, Crocodile (Lyle the Crocodile)
4696    The Saggy Baggy Elephant (Little Golden Book)
4246                Yertle the Turtle (Classic Seuss)
1188                               The Enchanted Wood
2483                        The Five Chinese Brothers
Name: original_title, dtype: object

In [None]:
# using TF-IDF Vectorizer + cosine similarity
# same query (book_id 3, 10 titles, variation_size 0.2) against the cosine matrix of the TF-IDF vectors
get_content_based_recommendations(3, cs_tfidf_vec, 10, 0.2)

[1, 2, 3, 4, 5, 6, 7, 8, 18, 16]


1076         Harry Potter and the Chamber of Secrets
4                Harry Potter and the Goblet of Fire
0             Harry Potter and the Half-Blood Prince
3307            Harry Potter and the Deathly Hallows
3           Harry Potter and the Prisoner of Azkaban
1          Harry Potter and the Order of the Phoenix
5       Harry Potter Collection (Harry Potter, #1-6)
4736                 Complete Harry Potter Boxed Set
2486                           The magician's nephew
1444                               The Book of Three
Name: original_title, dtype: object

In [None]:
# using TF-IDF Vectorizer + linear kernel
# same query (book_id 3, 10 titles, variation_size 0.2) against the linear-kernel matrix of the TF-IDF vectors
get_content_based_recommendations(3, lk_tfidf_vec, 10, 0.2)

[1, 2, 3, 4, 5, 6, 7, 8, 14, 11]


1076              Harry Potter and the Chamber of Secrets
4                     Harry Potter and the Goblet of Fire
0                  Harry Potter and the Half-Blood Prince
3307                 Harry Potter and the Deathly Hallows
3                Harry Potter and the Prisoner of Azkaban
1               Harry Potter and the Order of the Phoenix
5            Harry Potter Collection (Harry Potter, #1-6)
4736                      Complete Harry Potter Boxed Set
4336    The Magical Worlds of Harry Potter: A Treasury...
3120                           Quidditch Through the Ages
Name: original_title, dtype: object