In [None]:
import pandas as pd
import numpy as np

# Read the MovieLens movie catalogue into a DataFrame
movies = pd.read_csv('../data/ml-latest-small/movies.csv')

# Preview the first five rows
movies.head(5)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# =========================
# NEW MODULAR VERSION
# =========================
import sys
sys.path.append('..')

from src.data_loader import load_movies, load_ratings
from src.content_model import build_content_model
from src.collaborative_model import build_collaborative_model
from src.hybrid_model import hybrid_recommend

# Load data through the project loaders.
# NOTE(review): `movies` was already read with pd.read_csv in the first cell;
# this rebinds the name to whatever load_movies returns.
movies = load_movies('../data/ml-latest-small/movies.csv')
ratings = load_ratings('../data/ml-latest-small/ratings.csv')

# Build models
# NOTE(review): `cosine_sim` and `indices` are assigned again by later cells of
# this notebook, so on a top-to-bottom run the values produced here are only
# consumed by the call at the end of this cell.
cosine_sim, indices = build_content_model(movies)
collab_sim = build_collaborative_model(ratings, movies)

# Test hybrid recommendation
# This is the five-argument hybrid_recommend imported from src.hybrid_model.
# A later cell defines a notebook-local hybrid_recommend(movie_title, n, alpha)
# under the same name, which replaces this one once that cell has run.
hybrid_recommend("Toy Story (1995)", movies, cosine_sim, indices, collab_sim)


In [None]:
# Row count of the movies table
print("Total movies:", len(movies))

# Column labels (displayed as the cell's result)
movies.columns


In [None]:
# Column labels of the movies table (same expression that ends the previous cell).
movies.columns


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# A missing genre list becomes an empty string so string concatenation works
genres_filled = movies['genres'].fillna('')
movies['genres'] = genres_filled

# One text field per movie: title, a single space, then the genre string
movies['combined'] = movies['title'] + " " + genres_filled

# Preview the three text columns side by side
movies[['title', 'genres', 'combined']].head()


In [None]:
# Vectorizer configured to discard English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and vectorize the combined text in a single call
combined_text = movies['combined']
tfidf_matrix = tfidf.fit_transform(combined_text)

# Dimensions of the resulting matrix
print(tfidf_matrix.shape)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity: every row of the TF-IDF matrix against every row
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

# Dimensions of the similarity matrix
print(cosine_sim.shape)


In [None]:
# Reset to a fresh 0..N-1 index so each label equals the row's position
movies = movies.reset_index(drop=True)

# Title -> row position lookup.
# Series.drop_duplicates() compares *values*; the values here are the row
# positions, which are already unique, so it never removed anything and a
# repeated title still mapped to several positions. Deduplicate on the index
# (the titles) instead, keeping the first row of each title, so a lookup by
# title always yields a single integer.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Test: get index of a movie
print(indices['Toy Story (1995)'])


In [None]:
def recommend(movie_title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up in the notebook-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix, indexed by row position. The default is
        captured once, when this cell runs: re-run this cell after
        recomputing ``cosine_sim`` or pass the new matrix explicitly.
        (assumes rows are in the same order as ``movies`` -- TODO confirm)

    Returns
    -------
    pandas.Series
        Up to 10 entries of ``movies['title']``, best match first.

    Raises
    ------
    KeyError
        If ``movie_title`` is not a label of ``indices``.
    """
    # Get index of the movie. A title that labels several rows makes this
    # lookup return a Series; use the first matching row so the similarity
    # lookup below stays one-dimensional.
    idx = indices[movie_title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* movie with its score. The query row is excluded by
    # position rather than by assuming it sorts first: a row that ties with it
    # and sits earlier in the table would otherwise be dropped in its place
    # while the query movie itself was returned.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first (the sort is stable for ties), keep the top 10
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Return top 10 movie titles
    return movies['title'].iloc[movie_indices]


In [None]:
# Try recommend() on one title, using its default similarity matrix.
recommend("Toy Story (1995)")


In [None]:
# Read the ratings table straight from CSV (rebinds `ratings`)
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')

# Preview the first five rows
ratings.head(5)


In [None]:
# Attach the movie columns to every rating row, matching on movieId
movie_ratings = ratings.merge(movies, on='movieId')

# Preview the first five rows
movie_ratings.head(5)


In [None]:
# One row per userId, one column per title, cells hold the rating
# (pivot_table's default aggregation, the mean, is applied to repeated pairs)
user_movie_matrix = pd.pivot_table(
    movie_ratings,
    values='rating',
    index='userId',
    columns='title',
)
user_movie_matrix.head()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Zero-fill the missing ratings and put titles on the rows, so the similarity
# is computed between movies rather than between users
movie_user_matrix = user_movie_matrix.fillna(0).transpose()
movie_similarity = cosine_similarity(movie_user_matrix)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this cell performs the same computation as the preceding cell
# and rebinds `movie_similarity` to an equal result; one of the two can go.
ratings_by_movie = user_movie_matrix.fillna(0).T
movie_similarity = cosine_similarity(X=ratings_by_movie)


In [None]:
# Label both axes with the pivot table's column labels (the titles) so that
# similarity scores can be looked up by movie title
title_labels = user_movie_matrix.columns
movie_sim_df = pd.DataFrame(movie_similarity, index=title_labels, columns=title_labels)


In [None]:
def recommend_by_ratings(movie_title, n=10):
    """Return the ``n`` movies whose rating pattern is closest to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Column label to look up in the notebook-level ``movie_sim_df``.
    n : int, default 10
        Maximum number of results.

    Returns
    -------
    pandas.Series or str
        Similarity scores labelled by ``movie_sim_df``'s index, highest first,
        never including ``movie_title`` itself; or the string
        ``"Movie not found"`` when the title is not a column of
        ``movie_sim_df``.
    """
    if movie_title not in movie_sim_df:
        return "Movie not found"

    # Remove the query movie by label. Skipping "the first row after sorting"
    # is not reliable: any movie that ties with it at the top score can sort
    # ahead of it, in which case the query movie was returned as its own
    # recommendation and a genuine match was discarded.
    similar_scores = movie_sim_df[movie_title].drop(labels=movie_title, errors="ignore")
    return similar_scores.sort_values(ascending=False).iloc[:n]


In [None]:
# Try recommend_by_ratings() on one title with the default n=10.
recommend_by_ratings("Toy Story (1995)")


In [None]:
def hybrid_recommend(movie_title, n=10, alpha=0.5):
    """Blend content-based and rating-based similarity for one movie.

    The score of each candidate is
    ``alpha * content_score + (1 - alpha) * collaborative_score``.

    Note: this notebook-local definition rebinds the name ``hybrid_recommend``
    that an earlier cell imports from ``src.hybrid_model``.

    Parameters
    ----------
    movie_title : str
        Title to look up in the notebook-level ``indices`` and ``movie_sim_df``.
    n : int, default 10
        Maximum number of results.
    alpha : float, default 0.5
        Weight between content-based and collaborative scores;
        0.5 = balanced.

    Returns
    -------
    pandas.Series or str
        Blended scores labelled by title, highest first, never including
        ``movie_title`` itself and never containing NaN; or the string
        ``"Movie not found"`` when the title is missing from ``indices`` or
        from the columns of ``movie_sim_df``.
    """
    if movie_title not in indices or movie_title not in movie_sim_df:
        return "Movie not found"

    # A title that labels several rows makes this lookup return a Series;
    # use the first matching row so cosine_sim[idx] stays one-dimensional.
    idx = indices[movie_title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Content-based scores, one per title. Rows sharing a title are collapsed
    # to their best score; left as duplicates they would be multiplied when
    # the two Series are aligned below.
    content_scores = pd.Series(
        cosine_sim[idx],
        index=movies['title']
    ).groupby(level=0).max()

    # Collaborative scores
    collab_scores = movie_sim_df[movie_title]

    # Combine (aligned on title; a title missing from either side becomes NaN)
    final_scores = (alpha * content_scores) + ((1 - alpha) * collab_scores)

    # Drop the query movie by label instead of assuming it sorts first (a tie
    # at the top score would otherwise return it and lose a real match), and
    # drop titles that have no score on one side rather than emitting NaN.
    final_scores = final_scores.drop(labels=movie_title, errors="ignore").dropna()

    return final_scores.sort_values(ascending=False).iloc[:n]


In [None]:
# Try the notebook-local hybrid_recommend() with its defaults (n=10, alpha=0.5).
hybrid_recommend("Toy Story (1995)")
