In [2]:
import pandas as pd
from IPython.display import display

# Locations of the cleaned CSV exports, relative to this notebook.
MOVIES_CSV = '../data/cleaned_remapped_movies.csv'
RATINGS_CSV = '../data/cleaned_remapped_ratings.csv'

# Read both tables into DataFrames.
movies = pd.read_csv(MOVIES_CSV)
ratings = pd.read_csv(RATINGS_CSV)

# Preview the first five movie rows with the rich notebook renderer.
display(movies.head(5))

Unnamed: 0,movieId,title,genres,year,genre_list,combined_features
0,415.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,"['Adventure', 'Animation', 'Children', 'Comedy...",toy story (1995) Adventure|Animation|Children|...
1,191.0,jumanji (1995),Adventure|Children|Fantasy,1995,"['Adventure', 'Children', 'Fantasy']",jumanji (1995) Adventure|Children|Fantasy
2,941.0,grumpier old men (1995),Comedy|Romance,1995,"['Comedy', 'Romance']",grumpier old men (1995) Comedy|Romance
3,3313.0,waiting to exhale (1995),Comedy|Drama|Romance,1995,"['Comedy', 'Drama', 'Romance']",waiting to exhale (1995) Comedy|Drama|Romance
4,942.0,father of the bride part ii (1995),Comedy,1995,['Comedy'],father of the bride part ii (1995) Comedy


In [4]:
import h5py

# Create a Series that maps each movie title to its index label in `movies`.
movie_indices = pd.Series(movies.index, index=movies['title'])

# Series.drop_duplicates() compares the *values* (the index labels of `movies`),
# so it cannot remove a repeated title. De-duplicate on the Series index
# instead, keeping the first occurrence, so that every title lookup returns a
# single scalar rather than a Series of several rows.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]
print(movie_indices.head(3))

# # Save the movie indices mapping
# with open('../models/movie_indices.pkl', 'wb') as f:
#     pickle.dump(movie_indices, f)

title
toy story (1995)           0
jumanji (1995)             1
grumpier old men (1995)    2
dtype: int64


In [6]:
# Recommendation function

def recommend_content_based(title, num_recommendations=10, h5_file='../models/cosine_sim.h5'):
    """Return the movies most similar to `title` according to stored cosine similarities.

    Parameters
    ----------
    title : str
        Movie title to look up. It is lower-cased and stripped of surrounding
        whitespace before being matched against `movie_indices`.
    num_recommendations : int, default 10
        Maximum number of titles to return. Must not be negative.
    h5_file : str, default '../models/cosine_sim.h5'
        Path of the HDF5 file whose 'cosine_sim' dataset is read one row at a time.

    Returns
    -------
    pandas.Series
        Recommended titles in title case, highest similarity first, carrying
        the index labels they have in `movies`.

    Raises
    ------
    ValueError
        If `num_recommendations` is negative or `title` is not found in
        `movie_indices`.
    """
    # A negative count would silently turn into an arbitrary slice of the
    # ranking, so reject it explicitly.
    if num_recommendations < 0:
        raise ValueError("num_recommendations must be non-negative.")

    # Normalize the movie title
    normalized_title = title.lower().strip()

    # Get the index of the input movie
    if normalized_title not in movie_indices:
        raise ValueError(f"Movie '{title}' not found in the dataset.")
    match = movie_indices[normalized_title]
    # If the mapping still holds several rows for one title, the lookup returns
    # a Series; fall back to the first row so `idx` is always a plain integer.
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Read only the row for this movie instead of loading the whole
    # cosine-similarity matrix into memory.
    with h5py.File(h5_file, 'r') as f:
        sim_row = f['cosine_sim'][idx]

    # Cast to float64 so sorting behaves the same whatever dtype is stored on disk.
    scores = pd.Series(sim_row, dtype='float64')

    # Remove the query movie by its own index rather than assuming it is ranked
    # first: another movie with an equal (or marginally higher) score could
    # otherwise push it into the results.
    scores = scores.drop(idx)

    # Stable descending sort keeps tied scores in their original order.
    top_scores = scores.sort_values(ascending=False, kind='mergesort').head(num_recommendations)
    recommended_indices = top_scores.index.tolist()

    # Retrieve recommended movie titles and convert to title case
    recommended_titles = movies['title'].iloc[recommended_indices].str.title()

    # str.title() also capitalizes the letter after an apostrophe ("Dad'S");
    # lower-case the common possessive/contraction suffixes again.
    recommended_titles = recommended_titles.str.replace(
        r"(?<=\w)['\u2019](S|T|D|M|Ll|Re|Ve)\b",
        lambda m: m.group(0).lower(),
        regex=True,
    )
    return recommended_titles

In [12]:
# Smoke-test the recommender: one call with an explicit count, one with the default.
toy_story_recs = recommend_content_based('Toy Story (1995)', num_recommendations=5)
print(toy_story_recs)

# Blank lines to separate the two result listings.
print("\n\n")

nixon_recs = recommend_content_based('Nixon (1995)')
print(nixon_recs)

3021                    Toy Story 2 (1999)
60616                   Toy Story 4 (2019)
14811                   Toy Story 3 (2010)
20500           Toy Story Of Terror (2013)
22640    Toy Story Toons: Small Fry (2011)
Name: title, dtype: object



22222       Nixon By Nixon: In His Own Words (2014)
20067                              Our Nixon (2013)
12877                            Frost/Nixon (2008)
38917                          Elvis & Nixon (2016)
75425                    Searching For Nixon (2006)
37706             Dad'S In Heaven With Nixon (2007)
63350             American Experience: Nixon (1990)
1783                                  Go Now (1995)
23335                      Elvis Meets Nixon (1997)
9449     Assassination Of Richard Nixon, The (2004)
Name: title, dtype: object
