## 1. Import & Load Dataset

In [2]:
import sys
import os
sys.path.append(os.path.abspath("../"))

import pandas as pd
from src.data_loader import load_data

# Load the TMDB movie table and replace whatever index it arrives with by a
# fresh 0..n-1 one (the old index is discarded, not kept as a column).
# Later cells use df.index values as row positions, so this matters.
df = load_data("../data/TMDB_movies.csv").reset_index(drop=True)

In [13]:
# Data Handling
import pandas as pd
import numpy as np

# Text Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity Calculation
from sklearn.metrics.pairwise import cosine_similarity

## 2. Preprocess Text Columns

In [14]:
# Replace missing text with empty strings so NaN does not propagate into the
# combined field. The cleaned genre text goes into a new 'genres' column; the
# original 'genre' column is left as it was.
df['genres'] = df['genre'].fillna('')
df['overview'] = df['overview'].fillna('')

# One text field per movie: overview, a single space, then the genre string.
# Adding genres gives the vectorizer extra signal beyond the plot summary.
df['content'] = df['overview'].add(' ').add(df['genres'])

## 3. TF-IDF Vectorization

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 10,000 terms to keep memory use bounded, and drop
# scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)

# Learn the vocabulary and IDF weights from the combined text and vectorize
# it in one call; the result has one row per entry of df['content'].
tfidf_matrix = tfidf.fit_transform(df['content'])

In [19]:
# Sanity check: one row per entry of df['content'], one column per
# vocabulary term kept by the vectorizer.
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (9985, 10000)


In [20]:
# Peek at the learned vocabulary.
# NOTE(review): the slice takes up to 1000 names, not 20 as the old comment
# claimed; shrink it to [:20] if only a short preview is wanted.
print(tfidf.get_feature_names_out()[:1000])  # First 1000 feature names

['000' '007' '10' '100' '11' '117' '11th' '12' '12th' '13' '13th' '14'
 '14th' '15' '150' '15th' '16' '16th' '17' '17th' '18' '1890s' '18th' '19'
 '1900' '1905' '1920' '1920s' '1927' '1930s' '1933' '1936' '1937' '1939'
 '1940' '1940s' '1941' '1942' '1943' '1944' '1945' '1947' '1949' '1950'
 '1950s' '1955' '1956' '1957' '1958' '1959' '1960' '1960s' '1962' '1963'
 '1964' '1965' '1966' '1967' '1968' '1969' '1970' '1970s' '1971' '1972'
 '1973' '1974' '1976' '1977' '1978' '1979' '1980' '1980s' '1981' '1982'
 '1983' '1984' '1985' '1986' '1987' '1988' '1989' '1990' '1990s' '1991'
 '1992' '1993' '1995' '1996' '1999' '19th' '20' '200' '2001' '2003' '2006'
 '2008' '2009' '2010' '2011' '2013' '2015' '2019' '2029' '20s' '20th' '21'
 '21st' '22' '22nd' '23' '24' '25' '26' '27' '28' '29' '30' '300' '30s'
 '30th' '34' '35' '39' '3d' '40' '400' '47' '48' '50' '500' '50s' '51'
 '60' '60s' '70' '70s' '72' '7th' '80' '80s' '90' '90s' 'aaron' 'abahachi'
 'abandon' 'abandoned' 'abandoning' 'abandons' 'abbe

## 4. Compute Cosine Similarity

In [21]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix.
# TfidfVectorizer L2-normalises every row by default (norm='l2'), and on
# L2-normalised data the plain dot product equals the cosine similarity.
# linear_kernel (imported at the top of this cell) therefore yields the same
# matrix while skipping the redundant re-normalisation cosine_similarity does.
# NOTE: if the vectorizer is ever built with norm=None or norm='l1', switch
# back to cosine_similarity, because the rows would no longer be unit length.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

## 5. Recommendation Function

In [24]:
# Build reverse index: lower-cased title -> row label in df (which doubles as
# the row position in cosine_sim, because df carries a fresh 0..n-1 index).
# Series.drop_duplicates() compares the *values* (row labels, always unique),
# so it never removed repeated titles. De-duplicate on the index labels
# instead, keeping the first occurrence, so a lookup always yields one scalar.
indices = pd.Series(df.index, index=df['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]

def recommend_content_based(title, top_n=5):
    """Return the movies whose text content is most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title; matched case-insensitively against ``df['title']``.
        When several rows share a title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations. Values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``genre``, ``vote_average`` and ``popularity`` columns
        of the recommended rows, ordered by decreasing similarity. If the
        title is not in the index, a "not found" message string is returned
        instead (kept for backward compatibility with existing callers).
    """
    title = title.lower()

    if title not in indices:
        return f"Movie '{title}' not found in dataset."

    # Row position of the query movie in the similarity matrix
    idx = indices[title]

    # Pair every row position with its similarity to the query movie. The
    # query itself is dropped explicitly: relying on it sorting first fails
    # when another movie ties with it (e.g. identical content at a lower row).
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    # max() guards against a negative top_n producing a nonsensical slice.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:max(top_n, 0)]

    # Return top N similar movies with key details
    recommended_indices = [pos for pos, _ in sim_scores]
    return df[['title', 'genre', 'vote_average', 'popularity']].iloc[recommended_indices]

## 6. Test the Recommender

In [25]:
# Smoke test: ask for the five movies most similar to "Inception"; the title
# lookup is case-insensitive because the function lower-cases its input.
recommend_content_based("Inception", top_n=5)

Unnamed: 0,title,genre,vote_average,popularity
1729,New World,"thriller,crime,drama",7.4,10.33
2730,Suicide Squad: Hell to Pay,"science fiction,action,animation",7.1,40.675
361,What Ever Happened to Baby Jane?,"drama,horror,thriller",8.0,10.735
8274,Stowaway,"science fiction,drama,thriller,adventure",5.9,23.376
4634,Cypher,"thriller,science fiction",6.7,7.298
