In [1]:
# ──────────────────────────────────────────────────────────────
# 1. Imports & NLTK stopwords
# ──────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import ast
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords', quiet=True)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from fuzzywuzzy import process, fuzz

stop_words = set(stopwords.words('english'))

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# ──────────────────────────────────────────────────────────────
# 2. Load datasets
# ──────────────────────────────────────────────────────────────
tmdb_movies  = pd.read_csv('tmdb_5000_movies.csv')
tmdb_credits = pd.read_csv('tmdb_5000_credits.csv')
ml_movies    = pd.read_csv('movies.csv')      # title, genres, movieId
ml_ratings   = pd.read_csv('ratings.csv')    # userId, movieId, rating

print("Datasets loaded.")

Datasets loaded.


In [11]:
# ──────────────────────────────────────────────────────────────
# BLOCK 3 – AVERAGE RATING + COUNT (NO MORE KeyError)
# ──────────────────────────────────────────────────────────────
# Compute BOTH mean AND count per movie
avg_stats = (
    ml_ratings.groupby('movieId')['rating']
              .agg(['mean', 'count'])  # <-- BOTH
              .reset_index()
)

# Rename mean → avg_rating
avg_stats = avg_stats.rename(columns={'mean': 'avg_rating'})

# Round for readability
avg_stats['avg_rating'] = avg_stats['avg_rating'].round(2)

# Merge with ml_movies
ml_movies = ml_movies.merge(avg_stats, on='movieId', how='left')

# Fill missing ratings (cold-start movies)
ml_movies['avg_rating'] = ml_movies['avg_rating'].fillna(3.0)
ml_movies['count'] = ml_movies['count'].fillna(0).astype(int)

print("MovieLens avg ratings sample:")
print(ml_movies[['title', 'avg_rating', 'count']].head(5))

MovieLens avg ratings sample:
                                title  avg_rating  count
0                    Toy Story (1995)        3.92  49695
1                      Jumanji (1995)        3.21  22243
2             Grumpier Old Men (1995)        3.15  12735
3            Waiting to Exhale (1995)        2.86   2756
4  Father of the Bride Part II (1995)        3.06  12161


In [12]:
# ──────────────────────────────────────────────────────────────
# OPTIMIZED BLOCK 4 – FUZZY MERGE WITH CACHING
# ──────────────────────────────────────────────────────────────
import os
import pickle

# File to cache the merged dataframe
CACHE_FILE = 'merged_movies_cache.pkl'

def clean_title(t):
    t = re.sub(r'\s*\(\d{4}\)', '', t)
    t = re.sub(r'^(the|a|an)\s+', '', t, flags=re.IGNORECASE)
    return t.strip().lower()

# Check if cached version exists
if os.path.exists(CACHE_FILE):
    print(f"Loading cached merged data from {CACHE_FILE}...")
    with open(CACHE_FILE, 'rb') as f:
        merged = pickle.load(f)
    print(f"Loaded {len(merged)} movies from cache!")
else:
    print("No cache found. Running fuzzy merge (this will take time)...")
    
    tmdb_movies['title_clean'] = tmdb_movies['title'].apply(clean_title)
    ml_movies['title_clean'] = ml_movies['title'].apply(clean_title)
    
    # OPTIMIZATION 1: Build a dict for faster lookup
    ml_titles_dict = dict(zip(ml_movies['title_clean'], ml_movies.index))
    
    matches = []
    total = len(tmdb_movies)
    
    # OPTIMIZATION 2: Use tqdm for progress bar
    try:
        from tqdm import tqdm
        iterator = tqdm(tmdb_movies['title_clean'], desc="Matching titles")
    except ImportError:
        print("Install tqdm for progress bar: pip install tqdm")
        iterator = tmdb_movies['title_clean']
    
    for i, tmdb_title in enumerate(iterator):
        # Print progress every 100 movies if no tqdm
        if i % 100 == 0 and 'tqdm' not in str(type(iterator)):
            print(f"Progress: {i}/{total} ({i/total*100:.1f}%)")
        
        # OPTIMIZATION 3: Limit comparison to 100 best candidates
        best = process.extractOne(
            tmdb_title, 
            ml_movies['title_clean'].tolist(),
            scorer=fuzz.ratio,
            score_cutoff=85  # Skip if score < 85
        )
        
        if best:
            ml_idx = ml_movies[ml_movies['title_clean'] == best[0]].index[0]
            tmdb_idx = tmdb_movies[tmdb_movies['title_clean'] == tmdb_title].index[0]
            matches.append((tmdb_idx, ml_idx))
    
    print(f"\nFound {len(matches)} matches!")
    
    if matches:
        # Extract TMDB and MovieLens parts
        tmdb_part = tmdb_movies.iloc[[m[0] for m in matches]].reset_index(drop=True)
        ml_part = ml_movies.iloc[[m[1] for m in matches]][['genres', 'avg_rating', 'count']].reset_index(drop=True)
        
        # Rename MovieLens columns to avoid conflict
        ml_part = ml_part.rename(columns={
            'genres': 'genres_ml',
            'avg_rating': 'avg_rating_ml',
            'count': 'rating_count'
        })
        
        # Concatenate
        merged = pd.concat([tmdb_part, ml_part], axis=1)
        print(f"Merged {len(merged)} movies with REAL ratings!")
    else:
        merged = tmdb_movies.copy()
        merged['avg_rating_ml'] = 3.5
        merged['rating_count'] = 0
        merged['genres_ml'] = ''
    
    # SAVE TO CACHE
    print(f"Saving merged data to {CACHE_FILE}...")
    with open(CACHE_FILE, 'wb') as f:
        pickle.dump(merged, f)
    print("Cache saved! Next time this will load instantly.")

# Verify the data
print(f"\nFinal merged shape: {merged.shape}")
print(f"Columns: {list(merged.columns)}")
print(f"\nSample data:")
print(merged[['title', 'avg_rating_ml', 'rating_count']].head())

Loading cached merged data from merged_movies_cache.pkl...
Loaded 3736 movies from cache!

Final merged shape: (3736, 24)
Columns: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'title_clean', 'genres_ml', 'avg_rating_ml', 'rating_count']

Sample data:
                                      title  avg_rating_ml  rating_count
0                                    Avatar           3.78          9753
1  Pirates of the Caribbean: At World's End           3.39          5428
2                     The Dark Knight Rises           4.00          4770
3                               John Carter           3.28           625
4                              Spider-Man 3           3.01          4256


In [13]:
# ──────────────────────────────────────────────────────────────
# 5. Extract genres, keywords, top-3 cast, director
# ──────────────────────────────────────────────────────────────
def safe_list(col, key='name'):
    try:
        return ' '.join([i[key].lower().replace(' ','') for i in ast.literal_eval(col)])
    except:
        return ''

def get_director(crew):
    try:
        for p in ast.literal_eval(crew):
            if p['job']=='Director':
                return p['name'].lower().replace(' ','')
        return ''
    except:
        return ''

merged['genres_tmdb'] = merged['genres'].apply(lambda x: safe_list(x,'name'))
merged['keywords']    = merged['keywords'].apply(lambda x: safe_list(x,'name'))
merged['genres_ml']   = merged['genres_ml'].apply(lambda x: x.lower().replace('|',' '))

In [14]:
# ──────────────────────────────────────────────────────────────
# 6. Remove stop-words from overview
# ──────────────────────────────────────────────────────────────
def clean_overview(txt):
    if pd.isna(txt): return ''
    return ' '.join([w.lower() for w in str(txt).split() if w.lower() not in stop_words])

merged['overview'] = merged['overview'].apply(clean_overview)

In [15]:
print("merged shape:", merged.shape)
print("Any overview?", merged['overview'].notna().sum())
print("Sample overview:", merged['overview'].iloc[0] if len(merged) > 0 else "EMPTY")

merged shape: (3736, 25)
Any overview? 3736
Sample overview: 22nd century, paraplegic marine dispatched moon pandora unique mission, becomes torn following orders protecting alien civilization.


In [16]:
# ──────────────────────────────────────────────────────────────
# BLOCK 7 – FEATURE MATRIX (USE avg_rating_ml)
# ──────────────────────────────────────────────────────────────
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from scipy.sparse import hstack, csr_matrix

# 1. Overview
overview_text = merged['overview'].fillna('').replace('', 'plot unknown')
tfidf_overview = TfidfVectorizer(max_features=1000, stop_words='english', min_df=1)
overview_matrix = tfidf_overview.fit_transform(overview_text)

# 2. Keywords
keywords_text = merged['keywords'].fillna('').replace('', 'none')
tfidf_keywords = TfidfVectorizer(max_features=500)
keywords_matrix = tfidf_keywords.fit_transform(keywords_text)

# 3. Genres (TMDB + ML combined)
genre_list = (
    merged['genres_tmdb'].fillna('').str.split() + 
    merged['genres_ml'].fillna('').str.split()
)
mlb_genre = MultiLabelBinarizer()
genres_matrix = mlb_genre.fit_transform(genre_list)




# 6. Rating boost (USE avg_rating_ml)
scaler = MinMaxScaler()
rating_boost = scaler.fit_transform(merged[['avg_rating_ml']])  # FIXED
rating_boost = rating_boost * 2.0  # Strong boost

# 7. Combine
feature_matrix = hstack([
    overview_matrix * 1.0,
    keywords_matrix * 0.8,
    csr_matrix(genres_matrix) * 1.2,
    csr_matrix(rating_boost) * 1.5
])

print(f"Feature matrix ready: {feature_matrix.shape}")

Feature matrix ready: (3736, 1529)


In [17]:
# ──────────────────────────────────────────────────────────────
# BLOCK 8 – SIMILARITY + CORRECT INDICES
# ──────────────────────────────────────────────────────────────
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

# CORRECT: Map lowercase title → index
indices = pd.Series(
    merged.index,
    index=merged['title'].str.lower().str.strip()
).drop_duplicates()

print(f"Indices built: {len(indices)} movies")

Indices built: 3736 movies


In [18]:
# ──────────────────────────────────────────────────────────────
# BLOCK 9 – RECOMMEND (FIXED)
# ──────────────────────────────────────────────────────────────
def get_recommendations(title, top_n=5):
    title = title.strip().lower()
    
    if title not in indices:
        # Fuzzy fallback
        matches = [idx for idx, t in indices.items() if title in t]
        if not matches:
            return f"Movie '{title}' not found. Try 'avatar', 'titanic', or 'inception'."
        idx = matches[0]
    else:
        idx = indices[title]
    
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)[1:top_n+1]
    
    recs = []
    for i, score in sim_scores:
        movie_title = merged.iloc[i]['title']
        rating = merged.iloc[i].get('avg_rating', 3.0)
        recs.append(f"• **{movie_title}** (Rating: {rating:.1f})")
    
    return f"Recommendations for **{merged.iloc[idx]['title']}**:\n" + "\n".join(recs)

In [19]:
# ──────────────────────────────────────────────────────────────
# ENHANCED CHATBOT: NATURAL LANGUAGE + FUZZY
# ──────────────────────────────────────────────────────────────
import re
from fuzzywuzzy import fuzz

def parse_query(message):
    msg = message.lower().strip()
    
    # 1. Extract movie title (after "like", "similar to", etc.)
    patterns = [
        r'(?:like|similar to|recommend|show me)\s+([^\.!,?]+)',
        r'^(?!.*\b(action|comedy|drama|sci-fi|horror)\b)(.+?)(?:\s+movie|\s+film)?$'
    ]
    title = None
    for p in patterns:
        match = re.search(p, msg)
        if match:
            title = match.group(1).strip()
            break
    
    # 2. Detect genre
    genres = ['action', 'comedy', 'drama', 'sci-fi', 'horror', 'romance', 'thriller']
    detected_genre = next((g for g in genres if g in msg), None)
    
    return title, detected_genre

def fuzzy_find_title(partial_title, threshold=70):
    partial = partial_title.lower()
    for idx, full_title in indices.items():
        if fuzz.partial_ratio(partial, full_title.lower()) >= threshold:
            return idx
    return None

def chatbot(message, history):
    title_query, genre_query = parse_query(message)
    
    # CASE 1: Genre-only request
    if genre_query and not title_query:
        genre_movies = merged[merged['genres_tmdb'].str.contains(genre_query, case=False, na=False)]
        if genre_movies.empty:
            return f"No {genre_query} movies found."
        sample_title = genre_movies.iloc[0]['title']
        return f"Top {genre_query.title()} pick: **{sample_title}**\n\n" + get_recommendations(sample_title)
    
    # CASE 2: Movie title (fuzzy)
    if title_query:
        idx = fuzzy_find_title(title_query)
        if not idx:
            return f"Couldn't find '{title_query}'. Try 'Inception', 'Titanic', or 'Avatar'."
        movie_title = merged.iloc[idx]['title']
        return get_recommendations(movie_title)
    
    # CASE 3: Fallback
    return "Try: 'Movies like Inception', 'Recommend action', or 'Sci-fi like Matrix'"

# LAUNCH
gr.ChatInterface(
    chatbot,
    title="Smart Movie Recommender",
    description="Say: 'Like Inception', 'Action movies', 'Sci-fi like Matrix'",
    examples=[
        "Movies like Inception",
        "Recommend action movies",
        "Sci-fi like The Matrix",
        "Titanik",
        "Incept"
    ],
    theme=gr.themes.Soft()
).launch(share=True)

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://546c59aba1ad0ea081.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "C:\Users\one\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\gradio\queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\one\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\gradio\route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\one\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\gradio\blocks.py", line 2116, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\one\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCach

In [15]:
print(get_recommendations("Titanic"))

Recommendations for **Titanic**:
• **My Summer of Love** (Rating: 3.0)
• **Cruel Intentions** (Rating: 3.0)
• **Fatal Attraction** (Rating: 3.0)
• **O** (Rating: 3.0)
• **Angel Eyes** (Rating: 3.0)
