In [2]:
###############
### IMPORTS ###
###############

import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('data/dataframe_merged.csv', usecols=['id', 'title', 'genres'])

In [4]:
df.head(5)

Unnamed: 0,genres,id,title
0,"['Animation', 'Comedy', 'Family']",862,Toy Story
1,"['Adventure', 'Fantasy', 'Family']",8844,Jumanji
2,"['Romance', 'Comedy']",15602,Grumpier Old Men
3,"['Comedy', 'Drama', 'Romance']",31357,Waiting to Exhale
4,['Comedy'],11862,Father of the Bride Part II


In [6]:
cos_sim_overview = np.load('cosine_similarity/cos_overview.npy')

In [7]:
df_cos_sim_overview = pd.DataFrame(cos_sim_overview)

In [8]:
cos_sim_metadata = np.load('cosine_similarity/cos_metadata.npy')

In [10]:
df_cos_sim_metadata = pd.DataFrame(cos_sim_metadata)

In [55]:
# Indices will track the title from our dataframe
indices = pd.Series(df.index, index=df['title'])

def get_recommendations_tunable(title, alpha, beta):
    
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the cosine similarity from metadata, overview, (and collaborative)
    # Include tunable coefficients (alpha, beta, gamma) to the cosine matrix (must equal 1)
    sim_score_total = list(enumerate((alpha * cos_sim_metadata[idx])/(alpha+beta) + \
                                        (beta * cos_sim_overview[idx])/(alpha+beta)
                                    )
    #                                       + gamma * cos_sim_collaborative[idx]
                          )

    # Get the scores of the 10 most similar movies
    sim_score_total = sorted(sim_score_total, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_score_total = sim_score_total[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_score]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [56]:
get_recommendations_tunable('Toy Story', alpha=1.0, beta=0)

3024                    Toy Story 2
22126          Toy Story of Terror!
25999               Partysaurus Rex
26001    Toy Story That Time Forgot
29198               Superstar Goofy
15519                   Toy Story 3
3336              Creature Comforts
41622                       Lorenzo
10754                      Luxo Jr.
19301                       Tin Toy
Name: title, dtype: object


In [40]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations_opposites(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_overview[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

NameError: name 'cosine_sim' is not defined

In [None]:
def get_ensemble_recs(movie_id, content_embeddings, collaborative_embeddings, file_path):
    #get similar movies from content
    sim_model_cont = SimilarityPredictions(content_embeddings, similarity_metric="cosine")
    cont_output = sim_model_cont.predict_similar_items(seed_item=movie_id, n=26744)
    similar_movies = pd.DataFrame(cont_output)
    similar_movies.set_index('item_id', inplace=True)
    sim_df_cont = pd.merge(movies, similar_movies, left_index=True, right_index=True)
    sim_df_cont.sort_values('similarity_score', ascending=False, inplace=True)
    sim_df_cont = sim_df_cont.rename(index=str, columns={"similarity_score": "content_similarity_score"})

    #get similar movies from collaborative
    sim_model_coll = SimilarityPredictions(collaborative_embeddings, similarity_metric="cosine")
    coll_output = sim_model_coll.predict_similar_items(seed_item=movie_id, n=26744)
    similar_movies = pd.DataFrame(coll_output)
    similar_movies.set_index('item_id', inplace=True)
    sim_df_coll = pd.merge(movies, similar_movies, left_index=True, right_index=True)
    sim_df_coll.sort_values('similarity_score', ascending=False, inplace=True)
    sim_df_coll = sim_df_coll.rename(index=str, columns={"similarity_score": "collaborative_similarity_score"})

    #ensemble results
    sim_df_avg = pd.merge(sim_df_coll, pd.DataFrame(sim_df_cont['content_similarity_score']), left_index=True, right_index=True)
    sim_df_avg['average_similarity_score'] = (sim_df_avg['content_similarity_score'] + sim_df_avg['collaborative_similarity_score'])/2
    #sim_df_avg.drop("collaborative_similarity_score", axis=1, inplace=True)
    #sim_df_avg.drop("content_similarity_score", axis=1, inplace=True)
    sim_df_avg.sort_values('average_similarity_score', ascending=False, inplace=True)
    
    #save recs locally
    sim_df_avg.head(20).to_csv(file_path, index=False, header=True)
    return sim_df_avg.head(20)