## 1. Imports

In [6]:
import sys
import os
sys.path.append(os.path.abspath("../"))

import pandas as pd
import numpy as np

# For scaling + similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

## 2. Load Data & Preprocess

In [7]:
from src.data_loader import load_data

df = load_data("../data/TMDB_movies.csv")
df = df.reset_index(drop=True)

# Preprocess columns
df['overview'] = df['overview'].fillna('')
df['genre'] = df['genre'].fillna('')
df['content'] = df['overview'] + " " + df['genre']

## 3. Content-Based Setup

In [8]:
# TF-IDF
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf.fit_transform(df['content'])

# Cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

## 4. Collaborative Filtering Setup

In [9]:
from sklearn.preprocessing import MinMaxScaler

features = df[['vote_average', 'vote_count', 'popularity']].fillna(0)

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(scaled_features)


## 5. Hybrid Recommendation Function

In [12]:
# Index mapping
indices = pd.Series(df.index, index=df['title'].str.lower()).drop_duplicates()

def hybrid_recommend(title, top_n=5):
    title = title.lower()
    if title not in indices:
        return f"❌ Movie '{title}' not found in dataset."

    idx = indices[title]

    ## Content-Based Similarity
    content_sim_scores = list(enumerate(cosine_sim[idx]))
    content_sim_scores = sorted(content_sim_scores, key=lambda x: x[1], reverse=True)

    content_indices = [i[0] for i in content_sim_scores if i[0] != idx][:top_n]

    ## Collaborative-Based Similarity
    movie_vector = scaled_features[idx].reshape(1, -1)
    _, knn_indices = model_knn.kneighbors(movie_vector, n_neighbors=min(top_n + 1, len(df)))

    collaborative_indices = knn_indices.flatten()
    collaborative_indices = [i for i in collaborative_indices if i != idx]

    ## Combine
    combined_indices = list(set(content_indices) & set(collaborative_indices))

    if len(combined_indices) < top_n:
        combined_indices += list(set(content_indices + collaborative_indices) - set(combined_indices))
        combined_indices = combined_indices[:top_n]

    # ✅ Clip indices within valid DataFrame range
    combined_indices = [i for i in combined_indices if i < len(df)]

    return df[['title', 'genre', 'vote_average', 'popularity']].iloc[combined_indices]

## 6. Testing the Hybrid Recommender

In [13]:
hybrid_recommend("Interstellar", top_n=5)

Unnamed: 0,title,genre,vote_average,popularity
3329,Captain America: The First Avenger,"action,adventure,science fiction",7.0,118.488
3429,Stargate,"action,adventure,science fiction",7.0,19.66
5544,Prometheus,"science fiction,adventure,mystery",6.5,119.827
969,Iron Man,"action,science fiction,adventure",7.6,135.952
1162,Gattaca,"thriller,science fiction,mystery,romance",7.6,17.789
