# 🧠 Content-Based Recommendation System - TF-IDF Version

## ⚙️ 1. Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 2. Load Data

In [2]:
# Location of the processed movie table, relative to the kernel's working directory.
MOVIES_CSV = "../data/processed/enriched_movies_clean.csv"

# Read the table and preview the first rows. Later cells use the `title` and
# `overview` columns; the preview also shows an `Unnamed: 0` column read from the file.
df = pd.read_csv(MOVIES_CSV)
df.head()

Unnamed: 0,movieId,genres,overview,popularity,poster_path,release_date,title,tmdb_id,vote_average,release_year,n_genres,overview_length,release_decade
0,6,"['Action', 'Crime', 'Thriller']",The Public Enemy battle The Gangstas in a Stee...,0.2205,/4miPIzrKBaSznoTBv1bS0sZwoMG.jpg,1995-07-15,Heat (1995),706330,0.0,1995.0,3,290,1990.0
1,34,"[""Children's"", 'Comedy', 'Drama']",Babe is a little pig who doesn't quite know hi...,4.5272,/zKuQMtnbVTz9DsOnOJmlW71v4qH.jpg,1995-07-18,Babe (1995),9598,6.244,1995.0,3,383,1990.0
2,50,"['Crime', 'Thriller']","Held in an L.A. interrogation room, Verbal Kin...",7.9936,/rWbsxdwF9qQzpTPCLmDfVnVqTK1.jpg,1995-07-19,"Usual Suspects, The (1995)",629,8.175,1995.0,2,409,1990.0
3,1,"['Animation', ""Children's"", 'Comedy']","Led by Woody, Andy's toys live happily in his ...",21.8546,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,1995-11-22,Toy Story (1995),862,7.968,1995.0,3,303,1990.0
4,2,"['Adventure', ""Children's"", 'Fantasy']",When siblings Judy and Peter discover an encha...,3.1183,/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg,1995-12-15,Jumanji (1995),8844,7.237,1995.0,3,395,1990.0


## 3. Preprocess Overview Text

In [3]:
# Replace missing overviews with empty strings so every row has text to
# vectorize, then lowercase the result into a new column.
overview_filled = df["overview"].fillna("")
df["overview_clean"] = overview_filled.str.lower()

## 4. TF-IDF Vectorization

In [4]:
# One document per movie: the cleaned overview text.
overview_corpus = df["overview_clean"]

# Learn the vocabulary (English stop words removed) and build the TF-IDF matrix
# in a single pass; row i corresponds to row i of `df`.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(overview_corpus)

## 5. Cosine Similarity

In [5]:
# Pairwise similarity of every movie against every other movie. With a single
# argument the matrix is compared against itself, so entry [i, j] is the
# similarity between the overviews in rows i and j of `df`.
cosine_sim = cosine_similarity(tfidf_matrix)

## 6. Recommend Function

In [6]:
def get_recommendations(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the notebook-level ``cosine_sim`` matrix, whose rows
    and columns are aligned positionally with the rows of the notebook-level
    ``df``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df["title"]`` (e.g. ``"Toy Story (1995)"``).
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty list.

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered from most to least similar. The queried
        row itself is never included.

    Raises
    ------
    IndexError
        If ``title`` does not appear in ``df["title"]``.
    """
    # Work with the row *position*, not the index label: both `cosine_sim`
    # and `df.iloc` are positional, so this stays correct for any index.
    positions = np.flatnonzero((df["title"] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Title {title!r} not found in df['title']")
    idx = int(positions[0])

    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the queried movie explicitly instead of assuming it ranks first.
    # A movie with an empty overview has self-similarity 0, and duplicate
    # overviews tie with it, so "skip the first result" can drop the wrong row.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return df.iloc[ranked]["title"].tolist()

In [7]:
# Example query: the titles whose overviews are most similar to this movie's.
query_title = "Toy Story (1995)"
get_recommendations(query_title)

['Toy Story 2 (1999)',
 'Rebel Without a Cause (1955)',
 'Condorman (1981)',
 'Malice (1993)',
 'Man on the Moon (1999)']