# Imports

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import gensim.downloader as api

# Data Loading

In [2]:
## Load the cleaned review data (the file name indicates BERT vectors with 150 clusters)
X_reviews = pd.read_csv(
    filepath_or_buffer="/Users/egmac/code/arostagnat/BookMatch/data/proc_data/cluster_result/X_bert_cluster_150.csv"
)

In [3]:
## Load the raw movie and book metadata (JSON Lines: one JSON record per line)
metadata_movies = pd.read_json(
    "/Users/egmac/code/arostagnat/BookMatch/data/raw_data/raw_movies/metadata.json",
    lines=True,
)
metadata_books = pd.read_json(
    "/Users/egmac/code/arostagnat/BookMatch/data/raw_data/raw_book/metadata.json",
    lines=True,
)

In [4]:
# Suffix the generic metadata column names with the media type so they match X_reviews
metadata_movies = metadata_movies.rename(
    columns={"item_id": "item_id_movie", "title": "title_movie"}
)
metadata_books = metadata_books.rename(
    columns={"item_id": "item_id_book", "title": "title_book", "img": "img_book", "url": "url_book"}
)

In [5]:
## Cast both item_id columns of X_reviews to float so they can be merged with the metadata ids.
## Note that the X_reviews import is preformatted as a float.
## The "$$$" -> 0 replacement below is kept for reference but is currently disabled.
# X_reviews = X_reviews.replace({'$$$': 0}, regex=False)
X_reviews = X_reviews.astype({"item_id_movie": float, "item_id_book": float})

In [7]:
## Cast the metadata ids to float as well, so both sides of the merge share a dtype
metadata_movies = metadata_movies.astype({"item_id_movie": float})
metadata_books = metadata_books.astype({"item_id_book": float})

In [8]:
## Attach movie titles and book title/url/image to every review row.
## Both joins are left joins, so every X_reviews row is kept even without matching metadata.
X_all = (
    X_reviews
    .merge(metadata_movies[["title_movie", "item_id_movie"]], how="left", on="item_id_movie")
    .merge(metadata_books[["title_book", "item_id_book", "url_book", "img_book"]], how="left", on="item_id_book")
)

In [9]:
## Check import: preview the first merged row to confirm the metadata columns were attached
X_all.head(1)

Unnamed: 0,item_id_movie,is_movie,item_id_book,clustering_label_bert,vector,title_movie,title_book,url_book,img_book
0,132692.0,1.0,-1.0,0,[-1.79571323e-02 3.01178787e-02 -2.63748504e-...,Frontier Rangers (1959),,,


In [10]:
## Dimensions (rows, columns) of the merged frame
X_all.shape

(27532, 9)

In [11]:
## Load the raw book ratings (used later to filter recommendations) and rename the id
## column so it matches the item_id_book naming used elsewhere in this notebook
ratings_books = pd.read_json(
    "/Users/egmac/code/arostagnat/BookMatch/data/raw_data/raw_book/ratings.json",
    lines=True,
).rename(columns={"item_id": "item_id_book"})

In [12]:
## Calculate the average rating for each book.
## Only the rating column is aggregated: averaging user_id is meaningless, and selecting the
## column explicitly keeps the result a one-column frame indexed by item_id_book.
avg_ratings_books = ratings_books.groupby("item_id_book")[["rating"]].mean()
print(f"Original ratings df: {ratings_books.shape} | Average ratings df: {avg_ratings_books.shape}")

Original ratings df: (5152656, 3) | Average ratings df: (9374, 1)


# Post-processing

In [13]:
## Import relevant packages

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
## Build X_vectors: one row per X_all row, one column per vector component.
## The vector column stores each embedding as a string ("[v0 v1 ...]"), so every string is
## stripped of its brackets/quotes/newlines, split on whitespace and converted to floats.

vectors = X_all.vector.tolist()
vectors_revised = [
    [float(component) for component in raw.strip('[]').replace("'","").replace("\n","").split()]
    for raw in vectors
]

X_vectors = pd.DataFrame(vectors_revised)

In [15]:
# svd = TruncatedSVD(n_components=X_vectors.shape[1])
# svd_result = svd.fit_transform(X_vectors)

In [16]:
## Plot variance as a function of the number of components.
## Based on the below figure, nearly 100% of the variance is explained by 250 components
# plt.plot(svd.explained_variance_ratio_.cumsum())
# plt.xlabel('Number of singular value components')
# plt.ylabel('Cumulative percent of variance')   
# plt.grid()
# plt.show()

In [17]:
## Reshape vectors to 250 components, which will help reduce computational time
# n = 250
# X_vectors_revised = pd.DataFrame(X_vectors.iloc[:,0:n])
# print(f'X_all shape: {X_all.shape} | X_vectors shape: {X_vectors.shape}')

In [18]:
## Attach the id / type / cluster columns to X_vectors, then split it into one frame per media type.
## The movie frame is used to look up user-inputted movies; the book frame is used for the calculations.
## Each frame is indexed by its own item id and keeps clustering_label_bert next to the vector components.

X_vectors[["item_id_movie","item_id_book","is_movie","clustering_label_bert"]] = X_all[
    ["item_id_movie","item_id_book","is_movie","clustering_label_bert"]
]
X_vectors_movies = (
    X_vectors.loc[X_vectors["is_movie"] == 1]
    .drop(columns=["item_id_book","is_movie"])
    .set_index("item_id_movie")
)
X_vectors_books = (
    X_vectors.loc[X_vectors["is_movie"] == 0]
    .drop(columns=["item_id_movie","is_movie"])
    .set_index("item_id_book")
)

In [19]:
## Keep only the books whose average rating is at least 4.
## Books without any rating get NaN from the left join and are therefore filtered out too.
X_vectors_good_books = (
    X_vectors_books
    .merge(avg_ratings_books, how="left", on="item_id_book")
    .loc[lambda merged: merged["rating"] >= 4]
    .drop(columns=["rating"])
)
X_vectors_good_books.head(2)

  X_vectors_good_books = pd.merge(X_vectors_books,avg_ratings_books,how="left",on="item_id_book")


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,clustering_label_bert
item_id_book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45524554.0,-0.047604,0.034708,-0.04214,0.023047,0.029068,-0.003054,0.122946,0.034663,0.016583,0.012794,...,-0.076962,0.074651,0.008275,-0.015889,0.030595,0.049729,-0.015802,0.001648,-0.044022,1
44707128.0,-0.042919,-0.050687,0.070148,-0.017286,-0.00512,0.042632,0.030215,-0.024918,0.116753,-0.010577,...,-0.010735,-0.023382,0.011517,0.038426,0.049266,-0.036537,-0.039882,-0.009105,-0.108453,2


In [20]:
## Compare the number of book rows before and after the rating filter
print(f"X_vectors_books: {X_vectors_books.shape} | X_vectors_books.ratings:{X_vectors_good_books.shape}")

X_vectors_books: (5076, 385) | X_vectors_books.ratings:(2149, 385)


# Recommendations

In [21]:
## Method 1 ("local"): for each film in the user list, calculate the cosine similarity between
## that film and every book, then keep **the closest** book for that film.
## The result therefore holds one recommendation per recognised film.

def get_local_reccs(user_movies:list):
    """Recommend the single closest book for each film in ``user_movies``.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.

    Returns
    -------
    pandas.Series
        ``title_book`` of the most similar book for each recognised film, in
        input order, indexed 0..k-1. Empty when no id is recognised.

    Also prints the recognised film ids together with their titles.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    # Loop-invariant inputs: every film is compared against the same set of books
    books_vectors = X_vectors_books.drop(columns=["clustering_label_bert"])
    book_details = X_all[["title_book","img_book","url_book","item_id_book"]]

    top_books = []
    for movie_id in verified_movies:
        movie_vector = X_vectors_movies[X_vectors_movies.index == movie_id].drop(columns=["clustering_label_bert"])

        # One similarity value per book, best match first
        sim_books = cosine_similarity(books_vectors,movie_vector)
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_details,on="item_id_book",how="left")

        top_books.append(sim_books_detail.iloc[0])

    # Assemble the frame once rather than growing it with concat on every iteration
    recommendations = pd.DataFrame(
        top_books, columns=["similarity","title_book","img_book","url_book"]
    ).reset_index(drop=True)

    print("Inputted films")
    print(movies)
    return recommendations["title_book"]


In [22]:
## Method 2 ("global"): calculate the average vector of all films in the user list, then
## calculate its cosine similarity with every book (clusters are not used in this variant).
## Finally, sort the books by their cosine similarity and take the **top 5 closest** books.

def get_global_reccs(user_movies:list, top_n:int=5):
    """Recommend the books closest to the average vector of ``user_movies``.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``title_book`` of the ``top_n`` most similar books, best first. Empty
        when no id is recognised.

    Also prints the titles of the recognised films.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    if not verified_movies:
        # The mean of zero vectors is NaN, which cosine_similarity rejects
        return pd.Series(name="title_book", dtype=object)

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie").drop(columns=["clustering_label_bert"])
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)])
    books_vectors = X_vectors_books.drop(columns=["clustering_label_bert"])

    ## One similarity value per book, best match first, with the book details attached
    sim_books = cosine_similarity(books_vectors,avg_movie_vector)
    sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
    sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
    sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","img_book","url_book","item_id_book"]],on="item_id_book",how="left")

    ## Take the top books and show results
    recommendations = sim_books_detail.head(top_n)
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print(f"Top {top_n} book recommendations")
    return recommendations["title_book"]

# Recommendations with ratings

In [23]:
def get_local_reccs_rating (user_movies:list):
    """Recommend the single closest well-rated book for each film in ``user_movies``.

    Works like the plain local recommender, but the candidates are the rows of
    ``X_vectors_good_books`` instead of all books.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.

    Returns
    -------
    pandas.Series
        ``title_book`` of the most similar candidate for each recognised film,
        in input order, indexed 0..k-1. Empty when no id is recognised.

    Also prints the recognised film ids together with their titles.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    # Loop-invariant inputs: every film is compared against the same set of books
    books_vectors = X_vectors_good_books.drop(columns=["clustering_label_bert"])
    book_details = X_all[["title_book","img_book","url_book","item_id_book"]]

    top_books = []
    for movie_id in verified_movies:
        movie_vector = X_vectors_movies[X_vectors_movies.index == movie_id].drop(columns=["clustering_label_bert"])

        # One similarity value per book, best match first
        sim_books = cosine_similarity(books_vectors,movie_vector)
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_details,on="item_id_book",how="left")

        top_books.append(sim_books_detail.iloc[0])

    # Assemble the frame once rather than growing it with concat on every iteration
    recommendations = pd.DataFrame(
        top_books, columns=["similarity","title_book","img_book","url_book"]
    ).reset_index(drop=True)

    print("Inputted films")
    print(movies)
    return recommendations["title_book"]


In [24]:
def get_global_reccs_rating (user_movies:list, top_n:int=5):
    """Recommend the well-rated books closest to the average vector of ``user_movies``.

    Works like the plain global recommender, but the candidates are the rows of
    ``X_vectors_good_books`` instead of all books.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``title_book`` of the ``top_n`` most similar candidates, best first.
        Empty when no id is recognised.

    Also prints the titles of the recognised films.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    if not verified_movies:
        # The mean of zero vectors is NaN, which cosine_similarity rejects
        return pd.Series(name="title_book", dtype=object)

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie").drop(columns=["clustering_label_bert"])
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)])
    books_vectors = X_vectors_good_books.drop(columns=["clustering_label_bert"])

    ## One similarity value per book, best match first, with the book details attached
    sim_books = cosine_similarity(books_vectors,avg_movie_vector)
    sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
    sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
    sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","img_book","url_book","item_id_book"]],on="item_id_book",how="left")

    ## Take the top books and show results
    recommendations = sim_books_detail.head(top_n)
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print(f"Top {top_n} book recommendations")
    return recommendations["title_book"]

# Recommendations with clustering

In [25]:
def get_local_reccs_cluster(user_movies:list):
    """Recommend, for each film, the closest book that shares the film's cluster.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.

    Returns
    -------
    pandas.Series
        ``title_book`` of the most similar same-cluster book for each
        recognised film, in input order, indexed 0..k-1. The entry is missing
        (None/NaN) when the film's cluster contains no book. Empty when no id
        is recognised.

    Also prints the recognised film ids together with their titles.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    book_details = X_all[["title_book","img_book","url_book","item_id_book"]]

    top_books = []
    for movie_id in verified_movies:

        # Vector and cluster of the film, and the vectors of the books in that same cluster
        movie_row = X_vectors_movies[X_vectors_movies.index == movie_id]
        movie_cluster = movie_row.clustering_label_bert.values[0]
        movie_vector = movie_row.drop(columns=["clustering_label_bert"])
        books_vectors = X_vectors_books[X_vectors_books.clustering_label_bert == movie_cluster].drop(columns=["clustering_label_bert"])

        if books_vectors.empty:
            # cosine_similarity rejects a 0-row matrix; keep a placeholder so that
            # the results stay aligned with the inputted films
            top_books.append(pd.Series({"similarity": float("nan"), "title_book": None,
                                        "img_book": None, "url_book": None}))
            continue

        # One similarity value per candidate book, best match first
        sim_books = cosine_similarity(books_vectors,movie_vector)
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_details,on="item_id_book",how="left")

        top_books.append(sim_books_detail.iloc[0])

    # Assemble the frame once rather than growing it with concat on every iteration
    recommendations = pd.DataFrame(
        top_books, columns=["similarity","title_book","img_book","url_book"]
    ).reset_index(drop=True)

    print("Inputted films")
    print(movies)
    return recommendations["title_book"]


In [26]:
def get_global_reccs_cluster(user_movies:list, top_n:int=5):
    """Recommend books from the cluster nearest to the average vector of ``user_movies``.

    The average film vector is compared with every movie; the cluster of the
    closest movie is then used to restrict the candidate books.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``title_book`` of the ``top_n`` most similar books of that cluster,
        best first. Empty when no id is recognised or the cluster holds no book.

    Also prints the titles of the recognised films.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    if not verified_movies:
        # The mean of zero vectors is NaN, which cosine_similarity rejects
        return pd.Series(name="title_book", dtype=object)

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie").drop(columns=["clustering_label_bert"])
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)])
    all_movies_vectors = X_vectors_movies.drop(columns=["clustering_label_bert"])

    ## Find cluster of nearest item (film)
    sim_movies = cosine_similarity(all_movies_vectors,avg_movie_vector)
    sim_movies_detail = pd.DataFrame(sim_movies,
                                     index=all_movies_vectors.index,
                                     columns=["similarity"]).sort_values("similarity",ascending=False)
    closest_movie_id = sim_movies_detail.index[0]
    closest_cluster = X_vectors_movies[X_vectors_movies.index == closest_movie_id].clustering_label_bert.values[0]
    books_vectors = X_vectors_books[X_vectors_books.clustering_label_bert == closest_cluster].drop(columns=["clustering_label_bert"])

    if books_vectors.empty:
        # cosine_similarity rejects a 0-row matrix; an empty cluster has nothing to recommend
        recommendations = pd.DataFrame(columns=["similarity","title_book","img_book","url_book"])
    else:
        ## One similarity value per candidate book, best match first, with the book details attached
        sim_books = cosine_similarity(books_vectors,avg_movie_vector)
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","img_book","url_book","item_id_book"]],on="item_id_book",how="left")
        recommendations = sim_books_detail.head(top_n)

    ## Show results
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print(f"Top {top_n} book recommendations")
    return recommendations["title_book"]

# Recommendations with clusters and ratings

In [27]:
def get_local_reccs_cluster_rating(user_movies:list):
    """Recommend, for each film, the closest well-rated book that shares the film's cluster.

    Candidates are the rows of ``X_vectors_good_books`` whose
    ``clustering_label_bert`` equals the film's cluster.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.

    Returns
    -------
    pandas.Series
        ``title_book`` of the most similar candidate for each recognised film,
        in input order, indexed 0..k-1. The entry is missing (None/NaN) when
        the film's cluster contains no candidate. Empty when no id is recognised.

    Also prints the recognised film ids together with their titles.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    book_details = X_all[["title_book","img_book","url_book","item_id_book"]]

    top_books = []
    for movie_id in verified_movies:

        # Vector and cluster of the film
        movie_row = X_vectors_movies[X_vectors_movies.index == movie_id]
        movie_cluster = movie_row.clustering_label_bert.values[0]
        movie_vector = movie_row.drop(columns=["clustering_label_bert"])

        # The mask must come from X_vectors_good_books itself so that it lines up
        # row-for-row with the frame being filtered
        in_cluster = X_vectors_good_books.clustering_label_bert == movie_cluster
        books_vectors = X_vectors_good_books[in_cluster].drop(columns=["clustering_label_bert"])

        if books_vectors.empty:
            # cosine_similarity rejects a 0-row matrix; keep a placeholder so that
            # the results stay aligned with the inputted films
            top_books.append(pd.Series({"similarity": float("nan"), "title_book": None,
                                        "img_book": None, "url_book": None}))
            continue

        # One similarity value per candidate book, best match first
        sim_books = cosine_similarity(books_vectors,movie_vector)
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_details,on="item_id_book",how="left")

        top_books.append(sim_books_detail.iloc[0])

    # Assemble the frame once rather than growing it with concat on every iteration
    recommendations = pd.DataFrame(
        top_books, columns=["similarity","title_book","img_book","url_book"]
    ).reset_index(drop=True)

    print("Inputted films")
    print(movies)
    return recommendations["title_book"]


In [28]:
def get_global_reccs_cluster_rating(user_movies:list, top_n:int=5):
    """Recommend well-rated books from the cluster nearest to the average vector of ``user_movies``.

    The average film vector is compared with every movie; the cluster of the
    closest movie is then used to restrict the candidates, which are taken
    from ``X_vectors_good_books``.

    Parameters
    ----------
    user_movies : list
        Movie ids (``item_id_movie`` values). Ids that have no row in
        ``X_vectors_movies`` are ignored.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``title_book`` of the ``top_n`` most similar candidates, best first.
        Empty when no id is recognised or the cluster holds no candidate.

    Also prints the titles of the recognised films.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the movie-vector index guarantees each accepted id has a vector.
    known_movie_ids = set(X_vectors_movies.index)
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    if not verified_movies:
        # The mean of zero vectors is NaN, which cosine_similarity rejects
        return pd.Series(name="title_book", dtype=object)

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie").drop(columns=["clustering_label_bert"])
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)])
    all_movies_vectors = X_vectors_movies.drop(columns=["clustering_label_bert"])

    ## Find cluster of nearest item (film)
    sim_movies = cosine_similarity(all_movies_vectors,avg_movie_vector)
    sim_movies_detail = pd.DataFrame(sim_movies,
                                     index=all_movies_vectors.index,
                                     columns=["similarity"]).sort_values("similarity",ascending=False)
    closest_movie_id = sim_movies_detail.index[0]
    closest_cluster = X_vectors_movies[X_vectors_movies.index == closest_movie_id].clustering_label_bert.values[0]
    books_vectors = X_vectors_good_books[X_vectors_good_books.clustering_label_bert == closest_cluster].drop(columns=["clustering_label_bert"])

    if books_vectors.empty:
        # cosine_similarity rejects a 0-row matrix; an empty cluster has nothing to recommend
        recommendations = pd.DataFrame(columns=["similarity","title_book","img_book","url_book"])
    else:
        ## One similarity value per candidate book, best match first, with the book details attached
        sim_books = cosine_similarity(books_vectors,avg_movie_vector)
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","img_book","url_book","item_id_book"]],on="item_id_book",how="left")
        recommendations = sim_books_detail.head(top_n)

    ## Show results
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print(f"Top {top_n} book recommendations")
    return recommendations["title_book"]

# Illustrative results

In [77]:
## Example film titles, resolved to their item_id_movie; titles not present in X_all are skipped
films = [
    "Harry Potter and the Goblet of Fire (2005)",
    "Maze Runner, The (2014)",
    "Harry Potter and the Deathly Hallows: Part 2 (2011)",
]

suggested_film_ids = []

for film in films:
    matching_ids = X_all.loc[X_all.title_movie == film, "item_id_movie"].values
    if matching_ids.size:
        # Keep the first id when several rows carry the same title
        movie_id = matching_ids[0]
        suggested_film_ids.append(movie_id)

In [78]:
## Movie ids resolved from the example titles
suggested_film_ids

[40815.0, 114180.0, 88125.0]

### Local approach

In [79]:
## Closest book per film, searched across all books
get_local_reccs(suggested_film_ids)

Inputted films
   item_id_movie                                        title_movie
0        40815.0         Harry Potter and the Goblet of Fire (2005)
1       114180.0                            Maze Runner, The (2014)
2        88125.0  Harry Potter and the Deathly Hallows: Part 2 (...


0    Harry Potter and the Prisoner of Azkaban (Harr...
1              House of Secrets (House of Secrets, #1)
2               Frost Like Night (Snow Like Ashes, #3)
Name: title_book, dtype: object

In [80]:
## Closest book per film, searched across the rating-filtered books only
get_local_reccs_rating(suggested_film_ids)

Inputted films
   item_id_movie                                        title_movie
0        40815.0         Harry Potter and the Goblet of Fire (2005)
1       114180.0                            Maze Runner, The (2014)
2        88125.0  Harry Potter and the Deathly Hallows: Part 2 (...


0    Harry Potter and the Prisoner of Azkaban (Harr...
1                   The Short Drop (Gibson Vaughn, #1)
2    Harry Potter and the Sorcerer's Stone (Harry P...
Name: title_book, dtype: object

In [81]:
## Closest book per film, searched within each film's own cluster
get_local_reccs_cluster(suggested_film_ids)

Inputted films
   item_id_movie                                        title_movie
0        40815.0         Harry Potter and the Goblet of Fire (2005)
1       114180.0                            Maze Runner, The (2014)
2        88125.0  Harry Potter and the Deathly Hallows: Part 2 (...


0    Harry Potter and the Prisoner of Azkaban (Harr...
1              House of Secrets (House of Secrets, #1)
2    Harry Potter and the Sorcerer's Stone (Harry P...
Name: title_book, dtype: object

In [32]:
# get_local_reccs_cluster_rating([1,2,3,4,5,6,7])

### Global approach

In [82]:
## Books closest to the average film vector, searched across all books
get_global_reccs(suggested_film_ids)

Inputted films
0           Harry Potter and the Goblet of Fire (2005)
1                              Maze Runner, The (2014)
2    Harry Potter and the Deathly Hallows: Part 2 (...
Name: title_movie, dtype: object
Top 5 book recommendations


0    Fantastic Beasts and Where to Find Them: The O...
1    Harry Potter and the Cursed Child - Parts One ...
2    Harry Potter and the Prisoner of Azkaban (Harr...
3              The Magician's Land (The Magicians, #3)
4               Frost Like Night (Snow Like Ashes, #3)
Name: title_book, dtype: object

In [83]:
## Books closest to the average film vector, searched across the rating-filtered books only
get_global_reccs_rating(suggested_film_ids)

Inputted films
0           Harry Potter and the Goblet of Fire (2005)
1                              Maze Runner, The (2014)
2    Harry Potter and the Deathly Hallows: Part 2 (...
Name: title_movie, dtype: object
Top 5 book recommendations


0    Fantastic Beasts and Where to Find Them: The O...
1    Harry Potter and the Prisoner of Azkaban (Harr...
2              The Magician's Land (The Magicians, #3)
3    Harry Potter and the Deathly Hallows (Harry Po...
4                  Blood Rites (The Dresden Files, #6)
Name: title_book, dtype: object

In [84]:
## Books closest to the average film vector, restricted to the cluster of the nearest film
get_global_reccs_cluster(suggested_film_ids)

Inputted films
0           Harry Potter and the Goblet of Fire (2005)
1                              Maze Runner, The (2014)
2    Harry Potter and the Deathly Hallows: Part 2 (...
Name: title_movie, dtype: object
Top 5 book recommendations


0    Fantastic Beasts and Where to Find Them: The O...
1    Harry Potter and the Cursed Child - Parts One ...
2    Harry Potter and the Prisoner of Azkaban (Harr...
3              The Magician's Land (The Magicians, #3)
4              House of Secrets (House of Secrets, #1)
Name: title_book, dtype: object

In [85]:
## Rating-filtered books closest to the average film vector, restricted to the cluster of the nearest film
get_global_reccs_cluster_rating(suggested_film_ids)

Inputted films
0           Harry Potter and the Goblet of Fire (2005)
1                              Maze Runner, The (2014)
2    Harry Potter and the Deathly Hallows: Part 2 (...
Name: title_movie, dtype: object
Top 5 book recommendations


0    Fantastic Beasts and Where to Find Them: The O...
1    Harry Potter and the Prisoner of Azkaban (Harr...
2              The Magician's Land (The Magicians, #3)
3    Harry Potter and the Deathly Hallows (Harry Po...
4    Harry Potter and the Sorcerer's Stone (Harry P...
Name: title_book, dtype: object

# Identification of good book recommendation

In [37]:
def get_local_reccs_movies(user_books:list):
    """Recommend, for each book, the closest movie that shares the book's cluster.

    Parameters
    ----------
    user_books : list
        Book ids (``item_id_book`` values). Ids that have no row in
        ``X_vectors_books`` are ignored.

    Returns
    -------
    pandas.Series
        ``title_movie`` of the most similar same-cluster movie for each
        recognised book, in input order, indexed 0..k-1. The entry is missing
        (None/NaN) when the book's cluster contains no movie. Empty when no id
        is recognised.

    Also prints the recognised book ids together with their titles.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the book-vector index guarantees each accepted id has a vector.
    known_book_ids = set(X_vectors_books.index)
    verified_books = [book_id for book_id in user_books if book_id in known_book_ids]

    books = pd.DataFrame(verified_books,columns=["item_id_book"])
    books = pd.merge(books,X_all[["title_book","item_id_book"]],on="item_id_book",how="left")

    movie_details = X_all[["title_movie","item_id_movie"]]

    top_movies = []
    for book_id in verified_books:

        # Vector and cluster of the book, and the vectors of the movies in that same cluster
        book_row = X_vectors_books[X_vectors_books.index == book_id]
        book_cluster = book_row.clustering_label_bert.values[0]
        book_vector = book_row.drop(columns=["clustering_label_bert"])
        movies_vectors = X_vectors_movies[X_vectors_movies.clustering_label_bert == book_cluster].drop(columns=["clustering_label_bert"])

        if movies_vectors.empty:
            # cosine_similarity rejects a 0-row matrix; keep a placeholder so that
            # the results stay aligned with the inputted books
            top_movies.append(pd.Series({"similarity": float("nan"), "title_movie": None}))
            continue

        # One similarity value per candidate movie, best match first
        sim_movies = cosine_similarity(movies_vectors,book_vector)
        sim_movies_detail = pd.DataFrame(sim_movies,index=movies_vectors.index,columns=["similarity"])
        sim_movies_detail = sim_movies_detail.sort_values("similarity",ascending=False)
        sim_movies_detail = pd.merge(sim_movies_detail,movie_details,on="item_id_movie",how="left")

        top_movies.append(sim_movies_detail.iloc[0])

    # Assemble the frame once rather than growing it with concat on every iteration
    recommendations = pd.DataFrame(
        top_movies, columns=["similarity","title_movie"]
    ).reset_index(drop=True)

    # The inputs here are books, so label them as such
    print("Inputted books")
    print(books)
    return recommendations["title_movie"]

In [38]:
def get_global_reccs_movies(user_books:list, top_n:int=5):
    """Recommend movies from the cluster nearest to the average vector of ``user_books``.

    The average book vector is compared with every book; the cluster of the
    closest book is then used to restrict the candidate movies.

    Parameters
    ----------
    user_books : list
        Book ids (``item_id_book`` values). Ids that have no row in
        ``X_vectors_books`` are ignored.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``title_movie`` of the ``top_n`` most similar movies of that cluster,
        best first. Empty when no id is recognised or the cluster holds no movie.

    Also prints the titles of the recognised books.
    """
    # Build the lookup set once instead of materialising a list for every input id.
    # Checking against the book-vector index guarantees each accepted id has a vector.
    known_book_ids = set(X_vectors_books.index)
    verified_books = [book_id for book_id in user_books if book_id in known_book_ids]

    if not verified_books:
        # The mean of zero vectors is NaN, which cosine_similarity rejects
        return pd.Series(name="title_movie", dtype=object)

    ## Collect vectors of all inputted books and calculate average vector
    books_id = pd.DataFrame(verified_books,columns=["item_id_book"])
    books_vectors = pd.merge(books_id,
                             X_vectors_books,
                             how="left",
                             on="item_id_book").set_index("item_id_book").drop(columns=["clustering_label_bert"])
    avg_book_vector = pd.DataFrame([books_vectors.mean(numeric_only=True)])
    all_books_vectors = X_vectors_books.drop(columns=["clustering_label_bert"])

    ## Find cluster of nearest item (book)
    sim_books = cosine_similarity(all_books_vectors,avg_book_vector)
    sim_books_detail = pd.DataFrame(sim_books,
                                    index=all_books_vectors.index,
                                    columns=["similarity"]).sort_values("similarity",ascending=False)
    closest_book_id = sim_books_detail.index[0]
    closest_cluster = X_vectors_books[X_vectors_books.index == closest_book_id].clustering_label_bert.values[0]
    movies_vectors = X_vectors_movies[X_vectors_movies.clustering_label_bert == closest_cluster].drop(columns=["clustering_label_bert"])

    if movies_vectors.empty:
        # cosine_similarity rejects a 0-row matrix; an empty cluster has nothing to recommend
        recommendations = pd.DataFrame(columns=["similarity","title_movie"])
    else:
        ## One similarity value per candidate movie, best match first, with the movie title attached
        sim_movies = cosine_similarity(movies_vectors,avg_book_vector)
        sim_movies_detail = pd.DataFrame(sim_movies,index=movies_vectors.index,columns=["similarity"])
        sim_movies_detail = sim_movies_detail.sort_values("similarity",ascending=False)
        sim_movies_detail = pd.merge(sim_movies_detail,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")
        recommendations = sim_movies_detail.head(top_n)

    ## Show results
    book_titles = pd.merge(books_id,X_all[["title_book","item_id_book"]],how="inner",on="item_id_book")
    print("Inputted books")
    print(book_titles.title_book)
    print(f"Top {top_n} movie recommendations")
    return recommendations["title_movie"]

In [42]:
## Alphabetically sorted list of the titles of all book rows (is_movie == 0)
books = X_all.loc[X_all.is_movie == 0, "title_book"].sort_values().to_list()
# print(*books, sep='\n')

In [74]:
## Book titles that contain "Potter"
harry_potter = list(filter(lambda title: "Potter" in title, books))
harry_potter

['Harry Potter Boxset (Harry Potter, #1-7)',
 'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',
 'Harry Potter and the Cursed Child - Parts One and Two (Harry Potter, #8)',
 'Harry Potter and the Deathly Hallows (Harry Potter, #7)',
 'Harry Potter and the Half-Blood Prince (Harry Potter, #6)',
 'Harry Potter and the Methods of Rationality',
 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',
 "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
 'Harry Potter: The Prequel (Harry Potter, #0.5)',
 "James Potter and the Hall of Elders' Crossing (James Potter, #1)",
 'Short Stories from Hogwarts of Power, Politics and Pesky Poltergeists (Pottermore Presents, #2)']

In [None]:
## Alternative set of example book titles.
## NOTE(review): this list is not referenced by any later cell shown here - confirm whether it is still needed.
suggested_books = ["Treasure Island",
                   "Moby-Dick or, The Whale",
                   "Twenty Thousand Leagues Under the Sea"]

In [69]:
## Book title(s) used as the query for the book -> movie recommenders
potter_books = ["Fantastic Beasts and Where to Find Them"]

In [70]:
## Resolve the query titles to their item_id_book; titles not present in X_all are skipped
suggested_book_ids = []

for book in potter_books:
    matching_ids = X_all.loc[X_all.title_book == book, "item_id_book"].values
    if matching_ids.size:
        # Keep the first id when several rows carry the same title
        book_id = matching_ids[0]
        suggested_book_ids.append(book_id)

In [71]:
## Book ids resolved from the query titles
suggested_book_ids

[4195128.0]

In [72]:
## Closest movie per book, searched within each book's own cluster
get_local_reccs_movies(suggested_book_ids)

Inputted films
   item_id_book                               title_book
0     4195128.0  Fantastic Beasts and Where to Find Them


0    Harry Potter and the Goblet of Fire (2005)
Name: title_movie, dtype: object

In [73]:
## Movies closest to the average book vector, restricted to the cluster of the nearest book
get_global_reccs_movies(suggested_book_ids)

Inputted books
0    Fantastic Beasts and Where to Find Them
Name: title_book, dtype: object
Top 5 movie recommendations


0           Harry Potter and the Goblet of Fire (2005)
1    Harry Potter and the Deathly Hallows: Part 2 (...
2                              Maze Runner, The (2014)
3                                  Going Postal (2010)
4    Percy Jackson & the Olympians: The Lightning T...
Name: title_movie, dtype: object