# In [None]:
# --- CELL 1: SETUP, BRAIN, AND DEBUG UI ---
# This notebook cell defines a small content-based movie recommender system
# built on top of the TMDB 5000 dataset. It contains parsing helpers for the
# dataset's stringified list columns, a "brain" class that builds
# bag-of-words vectors and ranks movies by cosine similarity, and a minimal
# ipywidgets-based UI for interactive querying. Running the cell loads the
# data, builds the model, and displays the UI.

import pandas as pd
import numpy as np
import ast
import difflib
import os
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import ipywidgets as widgets
from IPython.display import display, Markdown

# --- 1. CONFIGURATION ---
# Locations of the two TMDB 5000 CSV files (movie metadata and credits),
# given relative to the notebook's working directory.
MOVIES_FILE = 'tmdb_5000_movies.csv'
CREDITS_FILE = 'tmdb_5000_credits.csv'

# Number of recommendations returned when the caller does not ask for a
# specific count.
TOP_N_DEFAULT = 5

# scikit-learn's English stop words plus a few domain words ('movie',
# 'film', ...) that carry no signal for content-based similarity here.
# Kept as a list because it is handed to CountVectorizer(stop_words=...).
CUSTOM_STOP_WORDS = [*ENGLISH_STOP_WORDS, 'movie', 'movies', 'film', 'films', 'cinema']


# --- 2. HELPERS ---
# The following helper functions perform small, well-scoped parsing tasks on
# the dataset columns which are stored as stringified JSON-like lists.

def convert(text):
    """
    Extract the 'name' field from every dictionary in a stringified list.

    Args:
        text: A string holding a Python-literal list of dictionaries,
            e.g. "[{'id': 28, 'name': 'Action'}, ...]".

    Returns:
        The 'name' values in their original order. Parsing is best-effort:
        if the text cannot be parsed the result is an empty list, and if an
        entry is malformed the names collected before it are returned.
        Malformed data never raises.
    """
    names = []
    try:
        for entry in ast.literal_eval(text):
            names.append(entry['name'])
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        # Only the errors that bad cell contents can trigger are swallowed
        # (unparsable text, non-string input such as NaN, non-iterable or
        # non-dict values, missing 'name'). Interrupts and other unrelated
        # failures still propagate.
        pass
    return names


def convert3(text, limit=8):
    """
    Extract at most `limit` 'name' values from a stringified list of dicts.

    Used for cast lists so that very large casts do not dominate a movie's
    text representation.

    Args:
        text: A string holding a Python-literal list of dictionaries, each
            expected to carry a 'name' key.
        limit: Maximum number of names to return. Defaults to 8.

    Returns:
        Up to `limit` names, in their original order. Parsing is
        best-effort: unparsable input yields an empty list, and a malformed
        entry ends the extraction with the names gathered so far.
        Malformed data never raises.
    """
    names = []
    try:
        for position, entry in enumerate(ast.literal_eval(text)):
            if position >= limit:
                # Entries past the cap are never inspected.
                break
            names.append(entry['name'])
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        # Swallow only the failures that bad cell contents can cause;
        # interrupts and unrelated errors still propagate.
        pass
    return names


def fetch_director(text):
    """
    Extract the first director's name from a stringified crew list.

    Args:
        text: A string holding a Python-literal list of dictionaries, each
            expected to carry 'job' and 'name' keys.

    Returns:
        A single-element list with the name of the first entry whose 'job'
        is 'Director', or an empty list if there is none or the text cannot
        be parsed. A list (rather than a bare string) is returned so the
        result has the same shape as the other parsed columns.
        Malformed data never raises.
    """
    director = []
    try:
        for member in ast.literal_eval(text):
            if member['job'] == 'Director':
                director.append(member['name'])
                break  # only the first director is kept
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        # Swallow only the failures that bad cell contents can cause;
        # interrupts and unrelated errors still propagate.
        pass
    return director


def collapse(L):
    """
    Return a new list in which every string of L has its spaces removed.

    Multi-word names become single tokens ('Tom Hanks' -> 'TomHanks'), so a
    later whitespace/word tokenizer keeps each name as one vocabulary item.
    The input list itself is not modified.
    """
    squeezed = []
    for token in L:
        squeezed.append(token.replace(" ", ""))
    return squeezed


# --- 3. THE BRAIN (Level 8 with Debug Logs) ---
# The UniversalRecommender class encapsulates all logic for building the
# content-based recommendation model and for producing recommendations given
# a text query. Implementation notes:
# - Uses a simple CountVectorizer over concatenated textual 'tags'.
# - Combines cosine similarity with a normalized popularity-weighted score
#   (a Bayes-like weighted rating) for final ranking.
# - Includes a sliding-window parser and fuzzy matching to robustly map
#   user queries to vocabulary tokens.
class UniversalRecommender:
    """
    Content-based movie recommender built from the two TMDB 5000 CSV files.

    Construction reads MOVIES_FILE and CREDITS_FILE, builds one textual
    'tags' document per movie (overview words, genres, keywords, cast and
    director) and fits a CountVectorizer on those documents.

    Attributes set by __init__ (absent when the CSV files are missing):
        df: merged dataframe with the parsed, cleaned and scored columns.
        known_genres: set of lowercased genre names plus a few synonyms.
        cv: the fitted CountVectorizer.
        vectors: dense document-term count matrix, one row per row of df.
        vocab: list of the vectorizer's vocabulary tokens, used as the
            candidate pool for fuzzy matching.
    """

    def __init__(self):
        """
        Load the dataset, engineer the text features and fit the vectorizer.

        If either CSV file is missing, an error line is printed and the
        instance is left without a model; recommend() then returns an empty
        result list together with an error log.
        """
        # Initialization message to indicate model-building activity.
        print("‚öôÔ∏è Initializing AI Brain... (10-20 seconds)")
        # Validate dataset presence before doing any work.
        if not os.path.exists(MOVIES_FILE) or not os.path.exists(CREDITS_FILE):
            print("‚ùå Error: Files not found.")
            return

        movies = pd.read_csv(MOVIES_FILE)
        credits_df = pd.read_csv(CREDITS_FILE)
        # NOTE(review): the join key is the title, so rows whose titles
        # repeat in both files are paired with each other and produce extra
        # rows. Confirm whether an id-based join would be preferable.
        merged = movies.merge(credits_df, on='title')

        # Keep only the columns the pipeline needs. The explicit copy makes
        # the column assignments below operate on an independent frame
        # rather than on a slice of `merged`.
        self.df = merged[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'vote_average', 'vote_count', 'popularity']].copy()

        # Parse the stringified genre lists into lists of genre names.
        self.df['genres'] = self.df['genres'].apply(convert)

        # Lowercased genre names, used to detect genre-style queries, plus
        # a few common synonyms.
        self.known_genres = {
            genre.lower()
            for genre_list in self.df['genres']
            for genre in genre_list
        }
        self.known_genres.update(['science fiction', 'sci-fi', 'rom-com'])

        # Parse keywords, cast (capped by convert3) and director.
        self.df['keywords'] = self.df['keywords'].apply(convert)
        self.df['cast'] = self.df['cast'].apply(convert3)
        self.df['crew'] = self.df['crew'].apply(fetch_director)

        # Human-readable genre string for display in the UI.
        self.df['display_genres'] = self.df['genres'].apply(lambda x: ", ".join(x))

        # --- Weighted rating computation ---
        # C is the mean rating over all movies and m the 70th percentile of
        # vote counts. Movies with at least m votes get a blend of their own
        # rating and C, weighted by vote count; movies below m score 0.
        C = self.df['vote_average'].mean()
        m = self.df['vote_count'].quantile(0.70)

        def weighted_rating(x, m=m, C=C):
            v = x['vote_count']
            R = x['vote_average']
            if v >= m:
                return (v / (v + m) * R) + (m / (m + v) * C)
            return 0

        self.df['weighted_score'] = self.df.apply(weighted_rating, axis=1)

        # Rescale the weighted score to [0, 1] so it can be mixed with
        # cosine similarities during ranking.
        scaler = MinMaxScaler()
        self.df['normalized_weight'] = scaler.fit_transform(self.df[['weighted_score']]).ravel()

        # Remove spaces inside multi-word names ('Tom Hanks' -> 'TomHanks')
        # so each name becomes a single token.
        self.df['cast_clean'] = self.df['cast'].apply(collapse)
        self.df['crew_clean'] = self.df['crew'].apply(collapse)
        self.df['genres_clean'] = self.df['genres'].apply(collapse)
        self.df['keywords_clean'] = self.df['keywords'].apply(collapse)

        # Whitespace-tokenize the overview; missing or non-string overviews
        # become an empty list.
        self.df['overview_clean'] = self.df['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])

        # Concatenate all token lists into the 'tags' document. Genres and
        # cast are listed twice so they weigh more in the count vectors.
        self.df['tags'] = self.df['overview_clean'] + (self.df['genres_clean'] * 2) + self.df['keywords_clean'] + (self.df['cast_clean'] * 2) + self.df['crew_clean']
        self.df['tags'] = self.df['tags'].apply(lambda x: " ".join(x))

        # Fit the bag-of-words model with the extended stop-word list.
        # NOTE(review): .toarray() materializes a dense matrix; it is kept
        # because `vectors` may be read elsewhere in the notebook.
        self.cv = CountVectorizer(stop_words=CUSTOM_STOP_WORDS)
        self.vectors = self.cv.fit_transform(self.df['tags']).toarray()
        self.vocab = list(self.cv.vocabulary_.keys())
        print(f"‚úÖ Model Ready. Learned {len(self.vocab)} unique tags.")

    def _get_title_map(self):
        """
        Return a Series mapping lowercased title -> row position in df.

        The map is built on first use and cached on the instance. A title
        that occurs several times maps to several positions.
        """
        title_map = getattr(self, '_title_map', None)
        if title_map is None:
            lowered = self.df['title'].apply(lambda t: t.lower())
            title_map = pd.Series(np.arange(len(lowered)), index=lowered)
            self._title_map = title_map
        return title_map

    def _closest_token(self, text, cutoff):
        """
        Return the vocabulary token that best matches `text`, or None.

        An exact vocabulary hit is returned directly (it is what the fuzzy
        search would return anyway, with similarity 1.0); otherwise the
        best difflib match with similarity >= cutoff is used.
        """
        if text in self.cv.vocabulary_:
            return text
        found = difflib.get_close_matches(text, self.vocab, n=1, cutoff=cutoff)
        return found[0] if found else None

    def _parse_search_terms(self, parts, logs):
        """
        Map the query words to vocabulary tokens with a sliding window.

        At each position a 3-word, then a 2-word concatenation is tried
        against the vocabulary (so 'keanu reeves' can hit 'keanureeves');
        a hit is added three times to boost its weight and the window
        jumps past the consumed words. Otherwise the single word is kept
        if it is in the vocabulary, dropped if it is a stop word, or
        replaced by its closest vocabulary token.

        Args:
            parts: the lowercased query split on whitespace.
            logs: list to which debug messages are appended in place.

        Returns:
            The list of vocabulary tokens making up the search document.
        """
        vocabulary = self.cv.vocabulary_  # dict: O(1) membership tests
        terms = []
        i = 0
        while i < len(parts):
            # Trigram first, with a high cutoff to avoid false positives.
            if i + 2 < len(parts):
                token = self._closest_token(parts[i] + parts[i+1] + parts[i+2], 0.9)
                if token is not None:
                    logs.append(f"üß© Detected Actor (3-word): {token}")
                    terms.extend([token] * 3)
                    i += 3
                    continue
            # Then bigram: strict match first, looser match as typo repair.
            if i + 1 < len(parts):
                bigram = parts[i] + parts[i+1]
                token = self._closest_token(bigram, 0.9)
                if token is not None:
                    logs.append(f"üß© Detected Actor (2-word): {token}")
                    terms.extend([token] * 3)
                    i += 2
                    continue
                loose = difflib.get_close_matches(bigram, self.vocab, n=1, cutoff=0.7)
                if loose:
                    logs.append(f"üîß Fixed Bigram Typo: '{bigram}' -> '{loose[0]}'")
                    terms.extend([loose[0]] * 3)
                    i += 2
                    continue

            # Unigram fallback.
            word = parts[i]
            if word in vocabulary:
                terms.append(word)
            elif word not in CUSTOM_STOP_WORDS:
                fixed = difflib.get_close_matches(word, self.vocab, n=1, cutoff=0.6)
                if fixed:
                    logs.append(f"üîß Fixed Typo: {word} -> {fixed[0]}")
                    terms.append(fixed[0])
            i += 1
        return terms

    def recommend(self, query: str, top_n: int = TOP_N_DEFAULT):
        """
        Produce up to top_n movie recommendations for a free-text query.

        Pipeline:
        - A query naming a known genre is logged and sent to the token
          parser.
        - Otherwise an exact (case-insensitive) title match returns the
          movies most similar to that title.
        - Otherwise, unless the space-free query is itself a vocabulary
          token, a close title match (similarity >= 0.85, compared
          case-insensitively) is treated like an exact title.
        - Everything else is parsed into vocabulary tokens, vectorized and
          ranked by 0.8 * cosine similarity + 0.2 * normalized weighted
          rating, keeping only movies that share a token with the query.

        Args:
            query: the user's search text.
            top_n: maximum number of results; non-positive yields none.

        Returns:
            A tuple (results, logs). `results` is a list of dicts with keys
            'title', 'genres' and 'score'; `logs` is a list of debug
            strings describing how the query was interpreted. On any
            exception the result is ([], ["Error: ..."]).
        """
        try:
            query_lower = query.lower().strip()
            glued_query = query_lower.replace(" ", "")
            logs = []  # debug messages describing how the query was read
            vocabulary = self.cv.vocabulary_  # dict: O(1) membership tests

            # --- GENRE CHECK ---
            # A genre query is only logged here; the parser below turns it
            # into search tokens.
            clean_check = query_lower.replace("movies", "").replace("movie", "").strip()
            if query_lower in self.known_genres or clean_check in self.known_genres:
                logs.append(f"üîé Genre Detected: '{clean_check}'")
            else:
                title_map = self._get_title_map()

                # 1. Exact title: recommend by similarity to that movie.
                if query_lower in title_map:
                    idx = title_map[query_lower]
                    # Several movies can share a title; use the first.
                    if isinstance(idx, pd.Series):
                        idx = idx.iloc[0]
                    return self._fetch_results(idx, query_lower, top_n), logs

                # 2. Known entity: the space-free query is a vocabulary
                # token (e.g. 'tomhanks').
                if glued_query in vocabulary:
                    logs.append(f"üåü Known Entity: '{glued_query}'")
                else:
                    # 3. Fuzzy title. Both sides are lowercased because
                    # difflib compares characters case-sensitively, so a
                    # lowercase query would otherwise lose similarity on
                    # every capital letter of the stored title.
                    closest_titles = difflib.get_close_matches(
                        query_lower, title_map.index.unique().tolist(), n=1, cutoff=0.85)
                    if closest_titles:
                        match_lower = closest_titles[0]
                        if match_lower not in self.known_genres:
                            idx = title_map[match_lower]
                            if isinstance(idx, pd.Series):
                                idx = idx.iloc[0]
                            match = self.df['title'].iloc[idx]
                            logs.append(f"üîÑ Fuzzy Title: '{match}'")
                            return self._fetch_results(idx, match_lower, top_n), logs

            # --- SLIDING WINDOW PARSER ---
            final_search_terms = self._parse_search_terms(query_lower.split(), logs)
            combined_query = " ".join(final_search_terms)

            # --- SHOW THE FINAL VECTOR ---
            logs.append(f"üìù Final Vector: '{combined_query}'")

            # Nothing usable left after cleaning: no results, logs explain.
            if not combined_query.strip():
                return [], logs

            # Vectorize the query and compare it with every movie.
            search_vector = self.cv.transform([combined_query])
            sim_scores = cosine_similarity(search_vector, self.vectors).flatten()
            pop_scores = self.df['normalized_weight'].values

            # Blend similarity and rating; the (sim_scores > 0) mask zeroes
            # out movies that share no token with the query.
            final_scores = (sim_scores * 0.8) + (pop_scores * 0.2)
            final_scores = final_scores * (sim_scores > 0)

            # Best first, restricted to strictly positive scores.
            ranked = final_scores.argsort()[::-1]
            ranked = ranked[final_scores[ranked] > 0]

            titles = self.df['title'].values
            genres = self.df['display_genres'].values
            results = []
            for idx in ranked:
                if len(results) >= top_n:
                    break
                results.append({
                    'title': titles[idx],
                    'genres': genres[idx],
                    'score': float(final_scores[idx])
                })
            return results, logs

        except Exception as e:
            # UI boundary: report the failure through the log channel
            # instead of raising into the widget callback.
            return [], [f"Error: {e}"]

    def _fetch_results(self, idx, exclude_title, top_n):
        """
        Return the movies most similar to the movie at row position idx.

        Args:
            idx: row position of the reference movie in df / vectors.
            exclude_title: lowercased title to leave out of the results, so
                the reference movie (and any same-titled row) is not
                recommended back.
            top_n: maximum number of results; non-positive yields none.

        Returns:
            A list of dicts with keys 'title', 'genres' and 'score' (the
            cosine similarity), ordered from most to least similar.
        """
        search_vector = self.vectors[idx].reshape(1, -1)
        sim_scores = cosine_similarity(search_vector, self.vectors).flatten()
        titles = self.df['title'].values
        genres = self.df['display_genres'].values
        results = []
        for i in sim_scores.argsort()[::-1]:
            if len(results) >= top_n:
                break
            if titles[i].lower() == exclude_title:
                continue
            results.append({
                'title': titles[i],
                'genres': genres[i],
                'score': float(sim_scores[i])
            })
        return results


# --- 4. INITIALIZE ---
# Build the single recommender instance used by the UI below. The constructor
# reads both CSV files and fits the vectorizer, so this line does the slow
# work. It prints a status line when it starts and another when the model is
# ready, or an error line if the CSV files cannot be found.
bot = UniversalRecommender()


# --- 5. JUPYTER UI ---
# The interface consists of three ipywidgets: a text box for the query, a
# button that starts a search, and an output area into which the debug log
# and the recommendations are rendered by run_search.
input_box = widgets.Text(
    description='Search:',
    placeholder='Try: Keenu Reaves, Al Pachino, Sci-Fi...',
    value='',
    continuous_update=False,
    layout=widgets.Layout(width='600px'),
)
search_btn = widgets.Button(description="Recommend üçø")
output_area = widgets.Output()


def run_search(_ignore=None):
    """
    Event handler shared by the search button and the text box.

    Reads the current text of input_box and, when it is non-empty, asks the
    recommender for five suggestions, then renders the debug log followed
    by the results (or a "no matches" line) inside output_area. The single
    argument is whatever the widget passes to its callback; it is ignored.
    """
    with output_area:
        output_area.clear_output()
        query = input_box.value
        if query:
            # Transient progress line; it is wiped once results are in.
            print(f"Thinking about '{query}'...")
            results, logs = bot.recommend(query, top_n=5)

            output_area.clear_output()

            # --- DISPLAY DEBUG LOGS ---
            if logs:
                display(Markdown("**ü§ñ AI Debug Log:**"))
                for entry in logs:
                    print(entry)  # interpreted tokens and typo fixes
                print("-" * 60)

            # --- DISPLAY RESULTS ---
            if results:
                display(Markdown(f"### Top Recommendations for **{query}**:"))
                for rec in results:
                    # 20-cell text bar: filled cells are proportional to
                    # the score, the remainder is padded with empty cells.
                    filled = int(rec['score']*20)
                    score_bar = "‚ñì" * filled + "‚ñë" * (20 - filled)
                    display(Markdown(f"**üé¨ {rec['title']}**"))
                    print(f"Genres: {rec['genres']}")
                    print(f"Match:  {int(rec['score']*100)}% | {score_bar}")
                    print("\n")
            else:
                print(f"‚ùå No matches found for '{query}'.")

# Attach the same handler to both controls: a change of the text box's value
# and a click on the button each start a search.
input_box.observe(run_search, names='value')
search_btn.on_click(run_search)

# Show the text box and button side by side, with the output area below.
display(widgets.HBox(children=[input_box, search_btn]))
display(output_area)

# End of cell 1. After it runs, the recommender instance `bot` and the search
# UI (input_box, search_btn, output_area) are available in the notebook.
