In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Step 1: Load the dataset
df = pd.read_csv("tmdb_5000_movies.csv")
df = df[["title", "overview"]].dropna()


In [2]:
# Step 2: Preprocess and vectorize text using TF-IDF
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["overview"])

In [3]:
# Step 3: Define different similarity methods
def calculate_similarity(method="cosine", matrix=tfidf_matrix):
    if method == "cosine":
        return cosine_similarity(matrix, matrix)
    elif method == "euclidean":
        return -euclidean_distances(matrix, matrix)  # Negative for compatibility (higher = more similar)
    elif method == "jaccard":
        # Convert sparse matrix to binary for Jaccard Similarity
        bin_matrix = (matrix > 0).astype(int)
        intersection = np.dot(bin_matrix, bin_matrix.T)
        row_sums = bin_matrix.sum(axis=1)
        union = row_sums + row_sums.T - intersection
        return intersection / union  # Jaccard Similarity
    else:
        raise ValueError(f"Unknown method: {method}")

# Default: Cosine Similarity
similarity_method = "cosine"
similarity_matrix = calculate_similarity(method=similarity_method)

In [4]:
# Step 4: Define a function to recommend movies
def recommend_movies(title, similarity_matrix=similarity_matrix, df=df, top_n=5):
    if title not in df['title'].values:
        return f"Movie '{title}' not found in the dataset."
    
    idx = df[df['title'] == title].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    
    # Sort movies by similarity score (descending)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get indices of the top_n most similar movies (excluding itself)
    sim_indices = [i[0] for i in sim_scores[1:top_n + 1]]
    return df['title'].iloc[sim_indices]

In [6]:
# Step 5: Define a function to display recommendations
def display_movies(input_movie, similarity_methods=["cosine", "euclidean", "jaccard"]):
    for method in similarity_methods:
        print(f"\nUsing {method} similarity:")
        similarity_matrix = calculate_similarity(method=method)
        recommended_movies = recommend_movies(input_movie, similarity_matrix=similarity_matrix)
        if isinstance(recommended_movies, str):  # Handle movie not found
            print(recommended_movies)
        else:
            print(recommended_movies.tolist())

In [8]:
display_movies("Avatar")


Using cosine similarity:
['Apollo 18', 'The American', 'The Matrix', 'The Inhabited Island', 'Tears of the Sun']

Using euclidean similarity:
['The Helix... Loaded', 'Apollo 18', 'The American', 'The Matrix', 'The Inhabited Island']

Using jaccard similarity:
[]


In [9]:
display_movies("Star Wars")


Using cosine similarity:
['Friday the 13th Part VI: Jason Lives', 'The Ultimate Gift', 'Big Fat Liar', 'Friday the 13th Part VIII: Jason Takes Manhattan', 'Jason Goes to Hell: The Final Friday']

Using euclidean similarity:
['The Helix... Loaded', 'Friday the 13th Part VI: Jason Lives', 'The Ultimate Gift', 'Big Fat Liar', 'Friday the 13th Part VIII: Jason Takes Manhattan']

Using jaccard similarity:
[]
