In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the four input CSVs (paths are relative to the notebook's working
# directory) in a fixed order and bind each frame to its own name.
movies, genome_scores, genome_tags, train = (
    pd.read_csv(csv_path)
    for csv_path in ("movies.csv", "genome_scores.csv", "genome_tags.csv", "train.csv")
)

# Positional lookup: movieId -> 0-based row position of that movie in `movies`.
# Anything built row-by-row from `movies` can be indexed with these positions.
movie_id_to_idx = dict(zip(movies['movieId'], range(len(movies))))

# Content-Based Filtering
# Join every (movieId, tagId, relevance) score with the tag's text label.
tag_relevance = genome_scores.merge(genome_tags, on='tagId')

# Wide movie x tag relevance matrix; (movie, tag) pairs without a score become 0.
movie_tags = tag_relevance.pivot(index='movieId', columns='tag', values='relevance').fillna(0)

# Prepare movie features - missing genres become an empty string so they can
# be concatenated with the tag text below.
movies["genres"] = movies["genres"].fillna("")

# A tag contributes to a movie's text only when its relevance exceeds this value.
TAG_RELEVANCE_THRESHOLD = 0.1

# Build one space-joined tag string per movie in a single vectorised pass
# instead of looking up each movie's row of the wide matrix inside a Python
# loop. Rows are sorted by tag first so the words appear in the same
# alphabetical order as the pivoted columns.
relevant_tag_rows = tag_relevance.loc[
    tag_relevance['relevance'] > TAG_RELEVANCE_THRESHOLD, ['movieId', 'tag']
]
tag_text_by_movie = (
    relevant_tag_rows
    .sort_values('tag')
    .groupby('movieId')['tag']
    .agg(' '.join)
)

# Align the tag text to the row order of `movies`; movies without any
# relevant tag get an empty string. Each feature is "<genres> <tags>" with
# surrounding whitespace removed, and element i describes movies.iloc[i].
movie_tag_text = movies['movieId'].map(tag_text_by_movie).fillna('')
movie_features = (movies['genres'] + ' ' + movie_tag_text).str.strip().tolist()

# Compute TF-IDF: row i of `movie_tfidf` is the vector for movies.iloc[i].
vectorizer = TfidfVectorizer()
movie_tfidf = vectorizer.fit_transform(movie_features)

def recommend_movies_content(movie_title, top_n=10):
    """Recommend movies whose content features are most similar to a given title.

    The title is matched case-insensitively against ``movies['title']``. An
    exact title match is preferred; when there is none, the first title that
    contains the query as a plain (non-regex) substring is used.

    Parameters
    ----------
    movie_title : str
        Full or partial movie title. Leading/trailing whitespace is ignored.
    top_n : int, default 10
        Maximum number of recommendations to return. Values <= 0 give an
        empty list.

    Returns
    -------
    list of str
        Titles of the most similar movies, most similar first. The query
        movie itself is never included.
    str
        A "not found" message when the query is blank or matches no title.
    """
    # Normalize input (strip spaces and lowercase)
    movie_title = movie_title.strip().lower()

    # A blank query would be a substring of every title and silently select
    # the first movie in the dataset, so treat it as "not found".
    if not movie_title:
        return f"Movie '{movie_title}' not found in dataset."

    # Missing titles become "" so the boolean masks below never contain NaN.
    titles_lower = movies['title'].fillna("").str.lower()

    # Prefer an exact title; otherwise fall back to a literal substring search.
    exact_mask = titles_lower == movie_title
    if exact_mask.any():
        matching_movies = movies[exact_mask]
    else:
        matching_movies = movies[titles_lower.str.contains(movie_title, regex=False)]

    if matching_movies.empty:
        return f"Movie '{movie_title}' not found in dataset."

    # Get first matching movie's id and translate it to its TF-IDF row.
    movie_id = matching_movies.iloc[0]['movieId']
    movie_idx = movie_id_to_idx[movie_id]

    # Compute similarity ONLY for the selected movie (the slice keeps it 2D).
    movie_vector = movie_tfidf[movie_idx:movie_idx+1]
    movie_sim = cosine_similarity(movie_vector, movie_tfidf).flatten()

    # Rank by descending similarity; the stable sort keeps dataset order
    # among ties so results are deterministic.
    ranked = np.argsort(-movie_sim, kind="stable")

    # Drop the query movie by position rather than assuming it sorts first:
    # other movies with identical features tie with it at the top.
    ranked = ranked[ranked != movie_idx][:max(top_n, 0)]

    return movies.iloc[ranked]['title'].tolist()


In [15]:
# Demo: look up content-based recommendations for one title and print them.
example_recommendations = recommend_movies_content("Cars 3 (2017)")
print(example_recommendations)

['Cars 2 (2011)', 'Monsters University (2013)', 'Despicable Me 2 (2013)', 'Shrek Forever After (a.k.a. Shrek: The Final Chapter) (2010)', 'Finding Dory (2016)', 'Ice Age: Dawn of the Dinosaurs (2009)', 'Shrek the Third (2007)', 'Planes (2013)', 'Cars (2006)', 'Shrek 2 (2004)']
