# Imports

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import gensim.downloader as api

# Data Loading

In [2]:
## Load the cleaned dataset produced by the clustering step
## NOTE(review): absolute, machine-specific path - this cell only runs where this exact file exists
x_all_csv = "/Users/egmac/code/arostagnat/BookMatch/data/proc_data/cluster_result/X_all.csv"
X_all = pd.read_csv(x_all_csv)

In [61]:
X_all.shape

(61455, 8)

In [5]:
## Import book ratings for recommendation filtering.
## The file is read as JSON lines, and "item_id" is renamed to "item_id_book" in the same step.
ratings_books = (
    pd.read_json("/Users/egmac/code/arostagnat/BookMatch/data/raw_data/raw_book/ratings.json", lines=True)
    .rename(columns={"item_id":"item_id_book"})
)

In [7]:
## Calculate average rating for each book.
## The mean is taken over every column per book; the averaged user_id carries no meaning and is dropped.
per_book_means = ratings_books.groupby("item_id_book").mean()
avg_ratings_books = per_book_means.drop(columns="user_id")
print(f"Original ratings df: {ratings_books.shape} | Average ratings df: {avg_ratings_books.shape}")

Original ratings df: (5152656, 3) | Average ratings df: (9374, 1)


# Post-processing

In [8]:
## Import relevant packages

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
## Build X_vectors: one row per item in X_all, one column per vector component.
## The "vector" column holds each vector as a string, so every entry is stripped of its
## brackets, quotes and newlines, split on whitespace and converted to a list of floats.

vectors = X_all.vector.tolist()
vectors_revised = [
    [float(component) for component in raw.strip('[]').replace("'","").replace("\n","").split()]
    for raw in vectors
]

X_vectors = pd.DataFrame(vectors_revised)

In [11]:
# svd = TruncatedSVD(n_components=X_vectors.shape[1])
# svd_result = svd.fit_transform(X_vectors)

In [44]:
## Plot variance as a function of the number of components.
## Based on the below figure, nearly 100% of the variance is explained by 250 components
# plt.plot(svd.explained_variance_ratio_.cumsum())
# plt.xlabel('Number of singular value components')
# plt.ylabel('Cumulative percent of variance')   
# plt.grid()
# plt.show()

In [23]:
## Reshape vectors to 250 components, which will help reduce computational time
# n = 250
# X_vectors_revised = pd.DataFrame(X_vectors.iloc[:,0:n])
# print(f'X_all shape: {X_all.shape} | X_vectors shape: {X_vectors.shape}')

In [10]:
## Attach the identifier and label columns from X_all to X_vectors, then split the result into
## one dataframe per item type, each indexed by its own item id.
## Movie dataframe to be used to lookup user-inputted movies. Book dataframe to be used for calculations.

meta_columns = ["item_id_movie","item_id_book","is_movie","clustering_label_bert"]
X_vectors[meta_columns] = X_all[meta_columns]

movie_rows = X_vectors.loc[X_vectors["is_movie"] == 1]
book_rows = X_vectors.loc[X_vectors["is_movie"] == 0]

X_vectors_movies = movie_rows.drop(columns=["item_id_book","is_movie"]).set_index("item_id_movie")
X_vectors_books = book_rows.drop(columns=["item_id_movie","is_movie"]).set_index("item_id_book")

In [11]:
## Keep only the books whose average rating is at least 4.
## Books without any rating get NaN from the left join and are removed by the comparison.
min_avg_rating = 4
books_with_rating = X_vectors_books.merge(avg_ratings_books,how="left",on="item_id_book")
is_well_rated = books_with_rating["rating"] >= min_avg_rating
X_vectors_good_books = books_with_rating.loc[is_well_rated].drop(columns="rating")
X_vectors_good_books.head(2)

  X_vectors_good_books = pd.merge(X_vectors_books,avg_ratings_books,how="left",on="item_id_book")


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,clustering_label_bert
item_id_book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49566885.0,-0.067562,-0.01381,0.087461,0.010315,-0.03419,0.044712,-0.042676,-0.003757,-0.01911,-0.066859,...,-0.013075,-0.029805,0.044801,-0.076599,0.070121,0.072632,0.016956,-0.016898,-0.009537,2048
48125855.0,-0.021348,-0.130383,0.065956,0.012416,0.047087,0.028791,-0.077499,-0.062951,0.042728,0.001619,...,-0.011231,-0.09278,0.053753,-0.033295,0.061573,0.027644,-0.020969,-0.093529,0.010398,769


In [12]:
print(f"X_vectors_books: {X_vectors_books.shape} | X_vectors_books.ratings:{X_vectors_good_books.shape}")

X_vectors_books: (9374, 385) | X_vectors_books.ratings:(3945, 385)


# Recommendations

In [22]:
## Method 1: For each film in user list, calculate the cosine similarity with all books in the cluster
## Then, sort the books by their cosine similarity to identify **the** **closest** book for each film
## Finally, collect the closest book found for each film into the list of recommendations (one book per film)

def get_local_reccs(user_movies:list):
    """Recommend the single closest book for each film in ``user_movies``.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. For
    each remaining film, the cosine similarity between its vector and every
    book vector in ``X_vectors_books`` is computed and the title of the
    highest-scoring book is kept. The matched films are printed as a side
    effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series named ``title_book`` holding one title per verified film,
        in the order the films were supplied.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    # The candidate books are the same for every film, so prepare them once.
    # Clusters are not used for time being.
    books_vectors = X_vectors_books.drop(columns=["clustering_label_bert"])
    book_titles = X_all[["title_book","item_id_book"]]

    # Titles are accumulated in a plain list; the Series is built once at the end.
    top_titles = []
    for movie_id in verified_movies:

        movie_vector = X_vectors_movies[X_vectors_movies.index == movie_id].drop(columns=["clustering_label_bert"])

        # Calculate cosine similarity
        sim_books = cosine_similarity(books_vectors,movie_vector)

        # Create summary table of books with their similarity and relevant details
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_titles,on="item_id_book",how="left")

        # First row after the descending sort is the closest book
        top_titles.append(sim_books_detail["title_book"].iloc[0])

    print("Inputted films")
    print(movies)
    return pd.Series(top_titles,name="title_book",dtype=object)


In [23]:
## Method 2: Calculate the average vector for all films in the user list
## Then, calculate the cosine similarity between that average vector and all books (clusters are not used in this version)
## Finally, sort the books by their cosine similarities and take the **top 5 closest** books

def get_global_reccs(user_movies:list):
    """Recommend the five books closest to the average of the supplied films.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. The
    vectors of the remaining films are averaged, the cosine similarity
    between that average and every book vector in ``X_vectors_books`` is
    computed, and the five highest-scoring titles are returned. The matched
    film titles are printed as a side effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series of at most five ``title_book`` values, most similar first.

    Raises:
        ValueError: if none of the supplied ids is a known film id.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]
    if not verified_movies:
        # Without any film there is no vector to average; fail with a clear message.
        raise ValueError("None of the supplied film ids were found in X_all.item_id_movie")

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie").drop(columns=["clustering_label_bert"])
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)])
    books_vectors = X_vectors_books.drop(columns=["clustering_label_bert"])

    ## Calculate cosine similarity
    sim_books = cosine_similarity(books_vectors,avg_movie_vector)

    ## Create summary table of books with their similarity and relevant details
    sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
    sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
    sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","item_id_book"]],on="item_id_book",how="left")

    ## Take top 5 books and show results
    recommendations = sim_books_detail.head(5)
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print ("Top 5 book recommendations")
    return recommendations["title_book"]

# Recommendations with ratings

In [24]:
def get_local_reccs_rating (user_movies:list):
    """Recommend the closest well-rated book for each film in ``user_movies``.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored.
    Candidate books are limited to ``X_vectors_good_books``. For each
    remaining film, the cosine similarity between its vector and every
    candidate book vector is computed and the title of the highest-scoring
    book is kept. The matched films are printed as a side effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series named ``title_book`` holding one title per verified film,
        in the order the films were supplied.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    # The candidate books are the same for every film, so prepare them once.
    # Clusters are not used for time being.
    books_vectors = X_vectors_good_books.drop(columns=["clustering_label_bert"])
    book_titles = X_all[["title_book","item_id_book"]]

    # Titles are accumulated in a plain list; the Series is built once at the end.
    top_titles = []
    for movie_id in verified_movies:

        movie_vector = X_vectors_movies[X_vectors_movies.index == movie_id].drop(columns=["clustering_label_bert"])

        # Calculate cosine similarity
        sim_books = cosine_similarity(books_vectors,movie_vector)

        # Create summary table of books with their similarity and relevant details
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_titles,on="item_id_book",how="left")

        # First row after the descending sort is the closest book
        top_titles.append(sim_books_detail["title_book"].iloc[0])

    print("Inputted films")
    print(movies)
    return pd.Series(top_titles,name="title_book",dtype=object)


In [25]:
def get_global_reccs_rating (user_movies:list):
    """Recommend the five well-rated books closest to the average of the supplied films.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. The
    vectors of the remaining films are averaged, the cosine similarity
    between that average and every book vector in ``X_vectors_good_books``
    is computed, and the five highest-scoring titles are returned. The
    matched film titles are printed as a side effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series of at most five ``title_book`` values, most similar first.

    Raises:
        ValueError: if none of the supplied ids is a known film id.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]
    if not verified_movies:
        # Without any film there is no vector to average; fail with a clear message.
        raise ValueError("None of the supplied film ids were found in X_all.item_id_movie")

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie").drop(columns=["clustering_label_bert"])
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)])
    books_vectors = X_vectors_good_books.drop(columns=["clustering_label_bert"])

    ## Calculate cosine similarity
    sim_books = cosine_similarity(books_vectors,avg_movie_vector)

    ## Create summary table of books with their similarity and relevant details
    sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
    sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
    sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","item_id_book"]],on="item_id_book",how="left")

    ## Take top 5 books and show results
    recommendations = sim_books_detail.head(5)
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print ("Top 5 book recommendations")
    return recommendations["title_book"]

# Recommendations with clustering

In [26]:
def get_local_reccs_cluster(user_movies:list):
    """Recommend, for each film, the closest book that shares the film's cluster.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. For
    each remaining film, candidate books are the rows of ``X_vectors_books``
    whose ``clustering_label_bert`` equals the film's label; the title of
    the candidate with the highest cosine similarity to the film is kept.
    The matched films are printed as a side effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series named ``title_book`` with one entry per verified film, in
        the order the films were supplied. The entry is ``None`` when no
        book shares the film's cluster.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    book_titles = X_all[["title_book","item_id_book"]]

    # Titles are accumulated in a plain list; the Series is built once at the end.
    top_titles = []
    for movie_id in verified_movies:

        # Obtain the film's cluster and vector, and the book vectors of that cluster.
        movie_row = X_vectors_movies[X_vectors_movies.index == movie_id]
        movie_cluster = movie_row.clustering_label_bert.values[0]
        movie_vector = movie_row.drop(columns=["clustering_label_bert"])
        books_vectors = X_vectors_books[X_vectors_books.clustering_label_bert == movie_cluster].drop(columns=["clustering_label_bert"])

        if books_vectors.empty:
            # cosine_similarity rejects an empty matrix; keep a placeholder so the
            # result stays aligned with the printed film table.
            top_titles.append(None)
            continue

        # Calculate cosine similarity
        sim_books = cosine_similarity(books_vectors,movie_vector)

        # Create summary table of books with their similarity and relevant details
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_titles,on="item_id_book",how="left")

        # First row after the descending sort is the closest book
        top_titles.append(sim_books_detail["title_book"].iloc[0])

    print("Inputted films")
    print(movies)
    return pd.Series(top_titles,name="title_book",dtype=object)


In [27]:
def get_global_reccs_cluster(user_movies:list):
    """Recommend up to five books from the cluster nearest to the average film vector.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. The
    vectors of the remaining films are averaged; the film in
    ``X_vectors_movies`` most similar to that average determines the
    cluster. Books of ``X_vectors_books`` with that cluster label are ranked
    by cosine similarity to the average vector. The matched film titles are
    printed as a side effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series of at most five ``title_book`` values, most similar first;
        empty when the selected cluster contains no book.

    Raises:
        ValueError: if none of the supplied ids is a known film id.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]
    if not verified_movies:
        # Without any film there is no vector to average; fail with a clear message.
        raise ValueError("None of the supplied film ids were found in X_all.item_id_movie")

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie")
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)]).drop(columns=["clustering_label_bert"])
    all_movies_vectors = X_vectors_movies.drop(columns=["clustering_label_bert"])

    ## Find cluster of nearest item (film)
    sim_movies = cosine_similarity(all_movies_vectors,avg_movie_vector)
    sim_movies_detail = pd.DataFrame(sim_movies,
                                     index=all_movies_vectors.index,
                                     columns=["similarity"]).sort_values("similarity",ascending=False).reset_index()
    closest_movie_id = sim_movies_detail.loc[0].item_id_movie
    closest_cluster = X_vectors_movies[X_vectors_movies.index == closest_movie_id].clustering_label_bert.values[0]
    books_vectors = X_vectors_books[X_vectors_books.clustering_label_bert== closest_cluster].drop(columns=["clustering_label_bert"])

    if books_vectors.empty:
        # cosine_similarity rejects an empty matrix; report no recommendations instead.
        recommendations = pd.DataFrame(columns=["similarity","title_book"])
    else:
        ## Calculate cosine similarity
        sim_books = cosine_similarity(books_vectors,avg_movie_vector)

        ## Create summary table of books with their similarity and relevant details
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","item_id_book"]],on="item_id_book",how="left")

        ## Take top 5 books
        recommendations = sim_books_detail.head(5)

    ## Show results
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print ("Top 5 book recommendations")
    return recommendations["title_book"]

# Recommendations with clusters and ratings

In [28]:
def get_local_reccs_cluster_rating(user_movies:list):
    """Recommend, for each film, the closest well-rated book that shares the film's cluster.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. For
    each remaining film, candidate books are the rows of
    ``X_vectors_good_books`` whose ``clustering_label_bert`` equals the
    film's label; the title of the candidate with the highest cosine
    similarity to the film is kept. The matched films are printed as a side
    effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series named ``title_book`` with one entry per verified film, in
        the order the films were supplied. The entry is ``None`` when no
        well-rated book shares the film's cluster.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]

    movies = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies = pd.merge(movies,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    book_titles = X_all[["title_book","item_id_book"]]

    # Titles are accumulated in a plain list; the Series is built once at the end.
    top_titles = []
    for movie_id in verified_movies:

        # Obtain the film's cluster and vector, and the book vectors of that cluster.
        movie_row = X_vectors_movies[X_vectors_movies.index == movie_id]
        movie_cluster = movie_row.clustering_label_bert.values[0]
        movie_vector = movie_row.drop(columns=["clustering_label_bert"])
        # The mask must come from the frame being filtered (X_vectors_good_books),
        # not from the larger X_vectors_books.
        in_cluster = X_vectors_good_books.clustering_label_bert == movie_cluster
        books_vectors = X_vectors_good_books[in_cluster].drop(columns=["clustering_label_bert"])

        if books_vectors.empty:
            # cosine_similarity rejects an empty matrix; keep a placeholder so the
            # result stays aligned with the printed film table.
            top_titles.append(None)
            continue

        # Calculate cosine similarity
        sim_books = cosine_similarity(books_vectors,movie_vector)

        # Create summary table of books with their similarity and relevant details
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,book_titles,on="item_id_book",how="left")

        # First row after the descending sort is the closest book
        top_titles.append(sim_books_detail["title_book"].iloc[0])

    print("Inputted films")
    print(movies)
    return pd.Series(top_titles,name="title_book",dtype=object)


In [29]:
def get_global_reccs_cluster_rating(user_movies:list):
    """Recommend up to five well-rated books from the cluster nearest to the average film vector.

    Film ids that do not appear in ``X_all.item_id_movie`` are ignored. The
    vectors of the remaining films are averaged; the film in
    ``X_vectors_movies`` most similar to that average determines the
    cluster. Books of ``X_vectors_good_books`` with that cluster label are
    ranked by cosine similarity to the average vector. The matched film
    titles are printed as a side effect.

    Args:
        user_movies: film ids to look up.

    Returns:
        pd.Series of at most five ``title_book`` values, most similar first;
        empty when the selected cluster contains no well-rated book.

    Raises:
        ValueError: if none of the supplied ids is a known film id.
    """
    # Build the id lookup once instead of once per candidate id.
    known_movie_ids = set(X_all.item_id_movie.tolist())
    verified_movies = [movie_id for movie_id in user_movies if movie_id in known_movie_ids]
    if not verified_movies:
        # Without any film there is no vector to average; fail with a clear message.
        raise ValueError("None of the supplied film ids were found in X_all.item_id_movie")

    ## Collect vectors of all inputted films and calculate average vector
    movies_id = pd.DataFrame(verified_movies,columns=["item_id_movie"])
    movies_vectors = pd.merge(movies_id,
                              X_vectors_movies,
                              how="left",
                              on="item_id_movie").set_index("item_id_movie")
    avg_movie_vector = pd.DataFrame([movies_vectors.mean(numeric_only=True)]).drop(columns=["clustering_label_bert"])
    all_movies_vectors = X_vectors_movies.drop(columns=["clustering_label_bert"])

    ## Find cluster of nearest item (film)
    sim_movies = cosine_similarity(all_movies_vectors,avg_movie_vector)
    sim_movies_detail = pd.DataFrame(sim_movies,
                                     index=all_movies_vectors.index,
                                     columns=["similarity"]).sort_values("similarity",ascending=False).reset_index()
    closest_movie_id = sim_movies_detail.loc[0].item_id_movie
    closest_cluster = X_vectors_movies[X_vectors_movies.index == closest_movie_id].clustering_label_bert.values[0]
    books_vectors = X_vectors_good_books[X_vectors_good_books.clustering_label_bert== closest_cluster].drop(columns=["clustering_label_bert"])

    if books_vectors.empty:
        # cosine_similarity rejects an empty matrix; report no recommendations instead.
        recommendations = pd.DataFrame(columns=["similarity","title_book"])
    else:
        ## Calculate cosine similarity
        sim_books = cosine_similarity(books_vectors,avg_movie_vector)

        ## Create summary table of books with their similarity and relevant details
        sim_books_detail = pd.DataFrame(sim_books,index=books_vectors.index,columns=["similarity"])
        sim_books_detail = sim_books_detail.sort_values("similarity",ascending=False)
        sim_books_detail = pd.merge(sim_books_detail,X_all[["title_book","item_id_book"]],on="item_id_book",how="left")

        ## Take top 5 books
        recommendations = sim_books_detail.head(5)

    ## Show results
    movie_titles = pd.merge(movies_id,X_all[["title_movie","item_id_movie"]],how="inner",on="item_id_movie")
    print("Inputted films")
    print(movie_titles.title_movie)
    print ("Top 5 book recommendations")
    return recommendations["title_book"]

# Illustrative results

### Local approach

In [30]:
get_local_reccs([1,2,3,4,5,6,7])

Inputted films
   item_id_movie                         title_movie
0              1                    Toy Story (1995)
1              2                      Jumanji (1995)
2              3             Grumpier Old Men (1995)
3              4            Waiting to Exhale (1995)
4              5  Father of the Bride Part II (1995)
5              6                         Heat (1995)
6              7                      Sabrina (1995)


0                                  Waiting
1     The Serpent of Venice (The Fool, #2)
2                                 Cranford
3            White Hot (Hidden Legacy, #2)
4    Road to Nowhere (Road to Nowhere, #1)
5     Field of Prey (Lucas Davenport, #24)
6                  Devoured (Devoured, #1)
Name: title_book, dtype: object

In [32]:
get_local_reccs_rating([1,2,3,4,5,6,7])

Inputted films
   item_id_movie                         title_movie
0              1                    Toy Story (1995)
1              2                      Jumanji (1995)
2              3             Grumpier Old Men (1995)
3              4            Waiting to Exhale (1995)
4              5  Father of the Bride Part II (1995)
5              6                         Heat (1995)
6              7                      Sabrina (1995)


0                                              Waiting
1    The Hunger Games Trilogy Boxset (The Hunger Ga...
2                    Feral Sins (The Phoenix Pack, #1)
3                        White Hot (Hidden Legacy, #2)
4                Road to Nowhere (Road to Nowhere, #1)
5                 Field of Prey (Lucas Davenport, #24)
6                    I Wish You Were Mine (Oxford, #2)
Name: title_book, dtype: object

In [None]:
get_local_reccs_cluster([1,2,3,4,5,6,7])

In [None]:
get_local_reccs_cluster_rating([1,2,3,4,5,6,7])

### Global approach

In [34]:
get_global_reccs([1,2,3,4,5,6,7])

Inputted films
0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
Name: title_movie, dtype: object
Top 5 book recommendations


0           Ghost World
1             Gone Girl
2              Cranford
3    The Stepford Wives
4               Flipped
Name: title_book, dtype: object

In [35]:
get_global_reccs_rating([1,2,3,4,5,6,7])

Inputted films
0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
Name: title_movie, dtype: object
Top 5 book recommendations


0                                Flipped
1                        Trust Your Eyes
2    Hollywood Dirt (Hollywood Dirt, #1)
3                             Tiger Lily
4       Sustained (The Legal Briefs, #2)
Name: title_book, dtype: object

In [36]:
get_global_reccs_cluster([1,2,3,4,5,6,7])

Inputted films
0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
Name: title_movie, dtype: object
Top 5 book recommendations


0    Straight Man
Name: title_book, dtype: object

In [37]:
get_global_reccs_cluster_rating([1,2,3,4,5,6,7])

Inputted films
0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
Name: title_movie, dtype: object
Top 5 book recommendations


0    Straight Man
Name: title_book, dtype: object

# Identification of good book recommendation

In [40]:
def get_local_reccs_movies(user_books:list):
    """Recommend the single closest movie for each book in ``user_books``.

    Book ids that do not appear in ``X_all.item_id_book`` are ignored. For
    each remaining book, the cosine similarity between its vector and every
    movie vector in ``X_vectors_movies`` is computed and the title of the
    highest-scoring movie is kept. The matched books are printed as a side
    effect.

    Args:
        user_books: book ids to look up.

    Returns:
        pd.Series named ``title_movie`` holding one title per verified book,
        in the order the books were supplied.
    """
    # Build the id lookup once instead of once per candidate id.
    known_book_ids = set(X_all.item_id_book.tolist())
    verified_books = [book_id for book_id in user_books if book_id in known_book_ids]

    books = pd.DataFrame(verified_books,columns=["item_id_book"])
    books = pd.merge(books,X_all[["title_book","item_id_book"]],on="item_id_book",how="left")

    # The candidate movies are the same for every book, so prepare them once.
    movies_vectors = X_vectors_movies.drop(columns=["clustering_label_bert"])
    movie_titles = X_all[["title_movie","item_id_movie"]]

    # Titles are accumulated in a plain list; the Series is built once at the end.
    top_titles = []
    for book_id in verified_books:

        # Obtain vectors for user-inputted book
        book_vector = X_vectors_books[X_vectors_books.index == book_id].drop(columns=["clustering_label_bert"])

        # Calculate cosine similarity
        sim_movies = cosine_similarity(movies_vectors,book_vector)

        # Create summary table of movies with their similarity and relevant details
        sim_movies_detail = pd.DataFrame(sim_movies,index=movies_vectors.index,columns=["similarity"])
        sim_movies_detail = sim_movies_detail.sort_values("similarity",ascending=False)
        sim_movies_detail = pd.merge(sim_movies_detail,movie_titles,on="item_id_movie",how="left")

        # First row after the descending sort is the closest movie
        top_titles.append(sim_movies_detail["title_movie"].iloc[0])

    # The inputs here are books, so label them as such.
    print("Inputted books")
    print(books)
    return pd.Series(top_titles,name="title_movie",dtype=object)

In [41]:
def get_global_reccs_movies(user_books:list):
    """Recommend the five movies closest to the average of the supplied books.

    Book ids that do not appear in ``X_all.item_id_book`` are ignored. The
    vectors of the remaining books are averaged, the cosine similarity
    between that average and every movie vector in ``X_vectors_movies`` is
    computed, and the five highest-scoring titles are returned. The matched
    book titles are printed as a side effect.

    Args:
        user_books: book ids to look up.

    Returns:
        pd.Series of at most five ``title_movie`` values, most similar first.

    Raises:
        ValueError: if none of the supplied ids is a known book id.
    """
    # Build the id lookup once instead of once per candidate id.
    known_book_ids = set(X_all.item_id_book.tolist())
    verified_books = [book_id for book_id in user_books if book_id in known_book_ids]
    if not verified_books:
        # Without any book there is no vector to average; fail with a clear message.
        raise ValueError("None of the supplied book ids were found in X_all.item_id_book")

    ## Collect vectors of all inputted books and calculate average vector
    books_id = pd.DataFrame(verified_books,columns=["item_id_book"])
    books_vectors = pd.merge(books_id,
                              X_vectors_books,
                              how="left",
                              on="item_id_book").set_index("item_id_book")
    avg_book_vector = pd.DataFrame([books_vectors.mean(numeric_only=True)]).drop(columns=["clustering_label_bert"])

    movies_vectors = X_vectors_movies.drop(columns=["clustering_label_bert"])

    ## Calculate cosine similarity
    sim_movies = cosine_similarity(movies_vectors,avg_book_vector)

    ## Create summary table of movies with their similarity and relevant details
    sim_movies_detail = pd.DataFrame(sim_movies,index=movies_vectors.index,columns=["similarity"])
    sim_movies_detail = sim_movies_detail.sort_values("similarity",ascending=False)
    sim_movies_detail = pd.merge(sim_movies_detail,X_all[["title_movie","item_id_movie"]],on="item_id_movie",how="left")

    ## Take top 5 movies and show results
    recommendations = sim_movies_detail.head(5)
    book_titles = pd.merge(books_id,X_all[["title_book","item_id_book"]],how="inner",on="item_id_book")
    print("Inputted books")
    print(book_titles.title_book)
    print ("Top 5 movie recommendations")
    return recommendations["title_movie"]

In [71]:
def get_books(keyword):
    """Return the book titles containing ``keyword``, in sorted order.

    Only rows of ``X_all`` with ``is_movie == 0`` are searched, and the match
    is a case-sensitive substring test on ``title_book``.
    """
    is_book = X_all.is_movie == 0
    sorted_titles = X_all.loc[is_book, "title_book"].sort_values().to_list()

    matches = []
    for title in sorted_titles:
        if keyword in title:
            matches.append(title)
    return matches

In [81]:
def get_books_ids(books):
    """Translate book titles into their ``item_id_book`` values.

    Titles that do not appear in ``X_all.title_book`` are skipped. When a
    title occurs on several rows, the id of the first matching row is used.

    Args:
        books: iterable of exact book titles.

    Returns:
        list of ids, in the order of the titles that were found.
    """
    # Build the title lookup once instead of once per requested title.
    known_titles = set(X_all.title_book.tolist())

    book_ids = []
    for book in books:
        if book in known_titles:
            book_id = X_all[X_all.title_book == book].item_id_book.values[0]
            book_ids.append(book_id)
    return book_ids

In [104]:
get_books("1984")

['1984', 'Animal Farm / 1984']

In [105]:
example_books = ["1984"]

In [106]:
example_books_ids = get_books_ids(example_books)

In [107]:
get_local_reccs_movies(example_books_ids)

Inputted films
   item_id_book title_book
0      153313.0       1984


0    Left Behind: The Movie (2000)
Name: title_movie, dtype: object

In [108]:
get_global_reccs_movies(example_books_ids)

Inputted books
0    1984
Name: title_book, dtype: object
Top 5 movie recommendations


0               Left Behind: The Movie (2000)
1    Forest of the Gods (Dievu miskas) (2005)
2                           The Chosen (2016)
3                             Timeline (2003)
4                  Midnight's Children (2012)
Name: title_movie, dtype: object