# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import ast # To safely convert stringified lists/dictionaries into Python objects
from nltk.stem.porter import PorterStemmer # For text normalization
from sklearn.feature_extraction.text import TfidfVectorizer # For weighted word counts (TF-IDF)
from sklearn.metrics.pairwise import cosine_similarity # To calculate movie similarity
from fuzzywuzzy import fuzz # For handling user input typos



# Extracting the 'name' value from each dictionary in a JSON string list.

In [2]:
def convert(text):
    L = []
    try:
        for item in ast.literal_eval(text):
            L.append(item['name'])
    except:
        return []
    return L

# Extracting the name of the director from the 'crew' JSON string list

In [3]:
def fetch_director(text):
    L = []
    try:
        for item in ast.literal_eval(text):
            if item['job'] == 'Director':
                L.append(item['name'])
                break 
    except:
        pass
    return L

# Removing spaces from within individual strings in a list 

In [4]:
def collapse(L):
    L1 = []
    for i in L:
        if isinstance(i, str):
            L1.append(i.replace(" ", ""))
    return L1

# Applying stemming to a string of text, reducing words to their root form

In [5]:
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

# Generating movie recommendations using cosine similarity and fuzzy title matching

In [6]:
def recommend_best(movie_title, df, similarity_matrix):

    # 1. Pre-processing input title
    search_title = movie_title.lower().strip()

    # 2. Performing Fuzzy Matching to find the closest match
    title_matches = []
    for title in df['title']:
        score = fuzz.ratio(search_title, title.lower())
        if score > 75: # Slightly stricter threshold for confidence
            title_matches.append((title, score))

    if not title_matches:
        print(f"Error: Movie '{movie_title}' not found in the dataset or no close match was detected.")
        return

    # Selecting the best match based on the highest score
    best_match_title = max(title_matches, key=lambda x: x[1])[0]
    
    # 3. Getting the index of the best matching movie
    try:
        index = df[df['title'] == best_match_title].index[0]
    except IndexError:
        print(f"Error: Could not find index for best match title: '{best_match_title}'")
        return

    # 4. Getting the distances, sort them 
    distances = sorted(list(enumerate(similarity_matrix[index])), reverse=True, key=lambda x: x[1])

    # 5. Printing the top 5 recommendations
    print(f"Recommendations for **{best_match_title}**:")
    for i in distances[1:6]:
        print(f" - {df.iloc[i[0]]['title']}")

# System Initialization and Feature Vector Build

In [7]:
# 1. Loading Data
try:
    movies = pd.read_csv("tmdb_5000_movies.csv")
    credits = pd.read_csv("tmdb_5000_credits.csv")
except FileNotFoundError:
    print("Error: Ensure data files are in the correct directory.")

In [8]:
# 2. Merging and Selecting Columns
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [9]:
# 3. Cleaning Missing Values and Duplicates
movies.dropna(inplace=True)
movies.drop_duplicates(inplace=True)

In [10]:
# 4. Applying JSON Parsing and Formatting
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert).apply(lambda x: x[0:3]) 
movies['crew'] = movies['crew'].apply(fetch_director)
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [11]:
# 5. Collapsing Spaces
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [12]:
# 6. Creating Weighted 'tags' column 
movies['tags'] = (
    movies['overview'] + 
    movies['keywords'] + 
    movies['cast'] + 
    movies['genres'] * 3 +  # Triple weight for genre
    movies['crew'] * 3     # Triple weight for director
)

In [13]:
# 7. Final DataFrame for Vectorization
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

In [14]:
# 8. Applying Stemming to Normalize Tags
new_df['tags'] = new_df['tags'].apply(stem)

In [15]:
# 9. Vectorization (TF-IDF)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vector = tfidf.fit_transform(new_df['tags']).toarray()

In [16]:
# 10. Computing Cosine Similarity
similarity = cosine_similarity(vector)

# Testing the recommendations :

In [17]:
print("\n--- Testing Best-in-Class Recommender ---")
recommend_best('Avatr', new_df, similarity) # Testing with a typo
print("-" * 30)
recommend_best('batman begines', new_df, similarity) # Testing with a typo/case
print("-" * 30)
recommend_best('The Terminator', new_df, similarity) 
print("-" * 30)
recommend_best('The Social Network', new_df, similarity)


--- Testing Best-in-Class Recommender ---
Recommendations for **Avatar**:
 - Aliens
 - The Abyss
 - Terminator 2: Judgment Day
 - True Lies
 - The Terminator
------------------------------
Recommendations for **Batman Begins**:
 - The Dark Knight
 - The Dark Knight Rises
 - The Prestige
 - Insomnia
 - Batman
------------------------------
Recommendations for **The Terminator**:
 - The Inkwell
 - Barry Munday
 - Beginners
 - Without Men
 - Divine Secrets of the Ya-Ya Sisterhood
------------------------------
Recommendations for **The Social Network**:
 - Gone Girl
 - The Girl with the Dragon Tattoo
 - The Curious Case of Benjamin Button
 - Panic Room
 - Fight Club
