In [2]:
import pandas as pd
from IPython.display import display

# Read the cleaned movie and rating tables (movies first, then ratings).
# NOTE: the movies CSV carries a saved index column, which shows up as
# 'Unnamed: 0' in the preview below.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/cleaned_movies.csv', '../data/cleaned_ratings.csv')
)

# Preview the first ten movie rows with the rich notebook renderer.
display(movies.head(10))

Unnamed: 0,movieId,title,genres,year,genre_list,combined_features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,"['Adventure', 'Animation', 'Children', 'Comedy...",Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,"['Adventure', 'Children', 'Fantasy']",Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,"['Comedy', 'Romance']",Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,"['Comedy', 'Drama', 'Romance']",Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,1995,['Comedy'],Father of the Bride Part II (1995) Comedy
5,6,Heat (1995),Action|Crime|Thriller,1995,"['Action', 'Crime', 'Thriller']",Heat (1995) Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance,1995,"['Comedy', 'Romance']",Sabrina (1995) Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children,1995,"['Adventure', 'Children']",Tom and Huck (1995) Adventure|Children
8,9,Sudden Death (1995),Action,1995,['Action'],Sudden Death (1995) Action
9,10,GoldenEye (1995),Action|Adventure|Thriller,1995,"['Action', 'Adventure', 'Thriller']",GoldenEye (1995) Action|Adventure|Thriller


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer that ignores English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the per-movie 'combined_features' text and encode
# every movie as one row of the resulting matrix.
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['combined_features'])

# Report the matrix dimensions as (rows, columns).
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (87585, 42529)


In [6]:
from scipy.sparse import issparse

# Report whether the TF-IDF matrix is stored in a SciPy sparse format. For any
# other (dense) container, fall back to measuring the share of zero entries.
if issparse(tfidf_matrix):
    print("The matrix is sparse (Scipy format).")
else:
    # Calculate sparsity as zero entries / total entries.
    # (This branch previously read an undefined name `matrix`, which would
    # raise NameError as soon as a dense matrix reached it.)
    total_elements = tfidf_matrix.size
    zero_elements = (tfidf_matrix == 0).sum()
    sparsity = zero_elements / total_elements

    print(f"Sparsity: {sparsity:.2f}")
    if sparsity > 0.5:
        print("The matrix is sparse.")
    else:
        print("The matrix is dense.")

The matrix is sparse (Scipy format).


In [8]:
# Title -> DataFrame index lookup: the values are `movies.index`, the labels
# are the movie titles.
# NOTE: drop_duplicates() removes repeated *values* (the index entries), not
# repeated title labels, so a title that occurs more than once still maps to
# several entries and `movie_indices[title]` returns a Series for it.
movie_indices = (
    pd.Series(movies.index, index=movies['title'])
    .drop_duplicates()
)
print(movie_indices.head(10))

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Waiting to Exhale (1995)              3
Father of the Bride Part II (1995)    4
Heat (1995)                           5
Sabrina (1995)                        6
Tom and Huck (1995)                   7
Sudden Death (1995)                   8
GoldenEye (1995)                      9
dtype: int64


In [10]:
def _batch_sort_key(batch_name):
    """Sort key that orders batch names by trailing integer (batch_2 before batch_10)."""
    suffix = batch_name.rsplit('_', 1)[-1]
    if suffix.isascii() and suffix.isdigit():
        return (0, int(suffix), batch_name)
    # Names without a numeric suffix go last, in plain alphabetical order.
    return (1, 0, batch_name)


def find_similar_movies(file_name, target_index, movie_indices, top_n=10):
    """
    Finds the top N similar movies to a target movie using cosine similarity.

    The HDF5 file is read as a sequence of row batches: every dataset holds a
    block of consecutive rows of the similarity matrix, and the datasets are
    visited in ascending order of their numeric name suffix.
    NOTE(review): this assumes the batches were written as contiguous row
    slices named `<prefix>_<n>` -- confirm against the code that produced the
    file.

    Args:
        file_name (str): Path to the HDF5 file containing cosine similarities.
        target_index (int): Global row index of the target movie.
        movie_indices (pd.Series): Mapping of movie titles to indices.
        top_n (int): Number of similar movies to return.
    Returns:
        list: List of tuples containing similar movie titles and their similarity scores,
        ordered from most to least similar. The target movie itself is excluded.
    Raises:
        ValueError: If `target_index` is negative or lies beyond the rows stored in the file.
        KeyError: If a selected index has no title in `movie_indices`.
    """
    if target_index < 0:
        # A negative index would silently wrap around inside a batch.
        raise ValueError(f"Index {target_index} not found in HDF5 file.")

    with h5py.File(file_name, 'r') as f:
        # Access the full row for the target movie
        similarities = None
        row_offset = 0  # Global index of the first row of the current batch
        # h5py lists keys alphabetically ("batch_10" before "batch_2"), so
        # sort by the numeric suffix to walk the batches in row order.
        for batch_name in sorted(f.keys(), key=_batch_sort_key):
            print("Searching batch name : ", batch_name)
            dataset = f[batch_name]
            batch_rows = dataset.shape[0]
            if target_index < row_offset + batch_rows:
                # Translate the global index into a row local to this batch.
                similarities = dataset[target_index - row_offset]
                break
            row_offset += batch_rows

        if similarities is None:
            raise ValueError(f"Index {target_index} not found in HDF5 file.")

    # Get the top N similar movies (excluding itself)
    ranked = np.argsort(-similarities)  # Sort in descending order
    similar_indices = ranked[ranked != target_index][:top_n]

    # Map indices back to movie titles
    index_to_title = {v: k for k, v in movie_indices.items()}
    recommendations = [(index_to_title[idx], similarities[idx]) for idx in similar_indices]

    # Only echo the best match when there is one (e.g. top_n=0 yields none).
    if recommendations:
        print("******************************************\n")
        print(recommendations[0])
        print("\n******************************************\n")
    return recommendations


In [12]:
def search_hdf5(file_name, query, tolerance=0.01):
    """
    Scan every dataset of an HDF5 file for entries close to a given value.

    Each dataset is read fully into memory and compared element-wise with
    `query`; an entry matches when its absolute difference from `query` is at
    most `tolerance`.

    Args:
        file_name (str): Path to the HDF5 file.
        query (float): Value to search for.
        tolerance (float): Maximum allowed absolute difference for a match.
    Returns:
        list: One (batch_name, indices) tuple per dataset with at least one
        match, where `indices` is the tuple of index arrays from `np.where`
        (positions are local to that dataset). Datasets without matches are
        omitted.
    """
    hits = []
    with h5py.File(file_name, 'r') as h5_file:
        for name in h5_file.keys():
            # Load the whole dataset, then flag the entries within tolerance.
            values = h5_file[name][:]
            within_tolerance = np.abs(values - query) <= tolerance
            positions = np.where(within_tolerance)
            if positions[0].size == 0:
                continue  # Nothing close enough in this dataset
            hits.append((name, positions))
    return hits

In [14]:
# Look up the index stored for the target title (the label must match exactly).
toy_story_title = 'Toy Story (1995)'  # Replace with the exact title
toy_story_index = movie_indices[toy_story_title]
print("Toy Story Index : ", toy_story_index)

Toy Story Index :  0


In [16]:
import h5py
import numpy as np

# Ask for the ten movies most similar to 'Toy Story' from the stored
# cosine-similarity file.
recommendations = find_similar_movies(
    '../models/cosine_similarities.h5',
    toy_story_index,
    movie_indices,
    top_n=10,
)

# List each recommended title with its similarity score (4 decimal places).
print("Top 10 recommendations for 'Toy Story':")
for movie_title, similarity in recommendations:
    print(f"{movie_title}: {similarity:.4f}")

Searching batch name :  batch_0
Top 10 recommendations for 'Toy Story':
Toy Story 2 (1999): 0.8543
Toy Story 4 (2019): 0.8313
Toy Story 3 (2010): 0.7790
Toy Story of Terror (2013): 0.6802
Toy Story Toons: Small Fry (2011): 0.5669
Toy Story That Time Forgot (2014): 0.5616
Toy Story Toons: Hawaiian Vacation (2011): 0.5600
Toy, The (1982): 0.5344
The New Toy (2022): 0.4810
Boy Toy (2011): 0.4776


In [31]:
import pickle

# # Save the cosine similarity matrix
# with open('data/recommendation/cosine_sim.pkl', 'wb') as f:
#     pickle.dump(cosine_sim, f)

# Persist the title -> index mapping with pickle so it can be reloaded later.
with open('../models/movie_indices.pkl', 'wb') as mapping_file:
    pickle.dump(movie_indices, mapping_file)