In [45]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
# pandas: Used for data manipulation and reading the CSV.

# ast: Used to safely parse stringified lists (like genre lists).

# TfidfVectorizer: Converts text to a matrix of TF-IDF features.

# cosine_similarity: Measures similarity between two text vectors.

In [46]:
# ✅ STEP 1: Read the movie dataset from disk
# Point read_csv at your own file name/path if it differs
df = pd.read_csv("movies.csv")
print("✅ STEP 1: Data loaded. Number of rows:", df.shape[0])
# The CSV is loaded into a pandas DataFrame named df; every later step builds on it.



✅ STEP 1: Data loaded. Number of rows: 4803


In [47]:
# ✅ STEP 2: Keep only the rows where title, overview and genres are all present
df = df.dropna(axis=0, how="any", subset=["title", "overview", "genres"])

print("✅ STEP 2: Null entries dropped. Remaining rows:", df.shape[0])
# A row is discarded as soon as any one of the three essential fields is missing.

# Later steps (genre parsing, metadata building) rely on these fields being non-null.

✅ STEP 2: Null entries dropped. Remaining rows: 4800


In [48]:
# ✅ STEP 3: Convert 'genres' column from string to a readable list
# This function turns '[{"id": 28, "name": "Action"}]' → "Action"
def parse_genres(genre_str):
    """Turn a stringified list of genre dicts into a space-separated string of names.

    Args:
        genre_str: Text such as '[{"id": 28, "name": "Action"}]'.

    Returns:
        The genre names joined by single spaces (e.g. "Action Adventure"), or ""
        when the value cannot be parsed into a list of dicts with a "name" key.
    """
    try:
        # literal_eval only evaluates Python literals; it never executes code
        genres = ast.literal_eval(genre_str)
        return " ".join(genre["name"] for genre in genres)  # Keep only genre names
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError, MemoryError):
        # Malformed text, a non-list value or an entry without "name" all mean
        # "no usable genres". A bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide genuine bugs, so only the
        # expected parse/shape errors are caught here.
        return ""

# Run parse_genres over each row's raw genres string and store the result
df["genres_parsed"] = df["genres"].map(parse_genres)
print("✅ STEP 3: Sample parsed genres:\n", df["genres_parsed"].head(5))

# Converts the genres field from string to a readable format (just genre names).

# E.g., "[{"id": 28, "name": "Action"}]" → "Action"

✅ STEP 3: Sample parsed genres:
 0    Action Adventure Fantasy Science Fiction
1                    Adventure Fantasy Action
2                      Action Adventure Crime
3                 Action Crime Drama Thriller
4            Action Adventure Science Fiction
Name: genres_parsed, dtype: object


In [49]:
# ✅ STEP 4: Build one text field per movie: the overview followed by its genre names
# The extra genre words give the recommendation engine more context than the overview alone
df["metadata"] = df["overview"].str.cat(df["genres_parsed"], sep=" ")
print("✅ STEP 4: Sample metadata:\n", df["metadata"].head(5))

# Merges the movie description and genre into one column called metadata.

# This combined column is what your model will use to recommend similar movies.

✅ STEP 4: Sample metadata:
 0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: metadata, dtype: object


In [50]:
# ✅ STEP 5: Use TF-IDF to convert each movie's metadata text into a numeric vector
# TF-IDF weights a word by how often it occurs in one movie's text versus across all movies
tfidf = TfidfVectorizer(stop_words="english")  # Drop common English words ("the", "and", ...) before weighting
tfidf_matrix = tfidf.fit_transform(df["metadata"])  # Learn the vocabulary, then build one row per movie


print("✅ STEP 5: TF-IDF matrix shape:", tfidf_matrix.shape)
# TfidfVectorizer: Converts text into numbers using the TF-IDF (Term Frequency–Inverse Document Frequency) technique.

# stop_words='english': Removes common words like “the”, “and”, “is” etc., to focus on meaningful words.

# fit_transform(...): Fits the vectorizer to your metadata column and transforms each movie’s text into a vector of numbers.

# 🔍 Why TF-IDF?

# It gives more weight to unique words and reduces weight for frequent/common ones.

# For example, "alien" in a sci-fi movie is more meaningful than "the".


# Illustrative example (made-up weights, not real output): the text "action space alien" becomes
# [action: 0.7, space: 0.6, alien: 0.8, ...rest: 0.0]
# tfidf_matrix is a sparse matrix of shape (num_movies, num_unique_words).

# Each row is a numeric vector representation of a movie's combined metadata.

✅ STEP 5: TF-IDF matrix shape: (4800, 20978)


In [51]:
# ✅ STEP 6: Score every pair of movies by the cosine similarity of their TF-IDF rows
# Called with a single matrix, cosine_similarity compares that matrix against itself
cosine_sim = cosine_similarity(tfidf_matrix)

# Calculates pairwise similarity between all movie vectors using cosine similarity.

# It creates a big square matrix (n x n), where each cell [i][j] gives similarity between movie i and movie j.

# 📌 Cosine similarity = 1 means the two vectors point in exactly the same direction (same mix of terms); 0 means they share no terms at all.

In [52]:
# ✅ STEP 7: Create a way to look up a movie's row position by its title
# We'll use this to fetch recommendations later

# Re-number the rows 0..n-1 so that row labels equal row positions in cosine_sim.
# On the first run the previous labels are kept in an "index" column (as before);
# on a re-run that column already exists, so the labels are dropped instead of
# raising "cannot insert index, already exists".
df = df.reset_index(drop=("index" in df.columns))

# Map title -> row position. Series.drop_duplicates() compares the VALUES (the row
# numbers, which are always unique), so it never removed a repeated title.
# De-duplicate on the index (the titles) instead, keeping the first occurrence,
# so that every lookup returns a single integer.
indices = pd.Series(df.index, index=df["title"])
indices = indices[~indices.index.duplicated(keep="first")]

# indices: a lookup map from title (exact spelling and case, as stored in the dataset) to row position.

# indices['Avatar'] → 0
# indices['Spectre'] → 2


In [53]:
# ✅ STEP 8: Define function to recommend movies
from rapidfuzz import process  # Fast fuzzy matching

# ✅ Function to find closest movie title with typo tolerance
def get_closest_title(input_title, all_titles):
    """Return the entry of ``all_titles`` that best fuzzy-matches ``input_title``.

    Uses rapidfuzz's ``process.extractOne`` with a score cutoff of 60 and logs
    the outcome with print(). Returns None when nothing reaches the cutoff.
    """
    print(f"\n🧠 Matching input title: '{input_title}' with fuzzy logic...")
    best = process.extractOne(input_title, all_titles, score_cutoff=60)

    # Guard clause: extractOne yields nothing when no candidate reaches the cutoff
    if not best:
        print("❌ No close match found above threshold (60).")
        return None

    matched_title, score = best[0], best[1]
    print(f"✅ Closest match found: '{matched_title}' with similarity score: {score}")
    return matched_title

# ✅ Recommendation function with logs
def recommend_movies(input_title, num_recommendations=5):
    """Return the titles of the movies most similar to ``input_title``.

    The input is first resolved to a dataset title with get_closest_title, then
    the matching row of the notebook-level ``cosine_sim`` matrix is ranked.
    Reads the notebook-level ``df``, ``indices`` and ``cosine_sim``; progress is
    logged with print().

    Args:
        input_title: Free-text movie name typed by the user.
        num_recommendations: Maximum number of titles to return (default 5).
            Zero or a negative value yields an empty list.

    Returns:
        A list of recommended titles, most similar first. If no title matches,
        a one-element list holding a "Movie not found" message is returned
        instead (unchanged, so existing callers keep working).
    """
    print("\n🎬 Starting movie recommendation...")

    # Step 1: Get closest title using fuzzy matching
    actual_title = get_closest_title(input_title, df["title"])

    if not actual_title:
        return [f"❌ Movie not found: '{input_title}'. Try a different name."]

    # Step 2: Get index of the matched movie
    idx = indices[actual_title]
    if isinstance(idx, pd.Series):
        # A title that occurs more than once yields one entry per row;
        # use the first occurrence so cosine_sim[idx] is a single row.
        idx = idx.iloc[0]
    idx = int(idx)
    print(f"🔢 Index of matched movie '{actual_title}' is: {idx}")

    # Step 3: Calculate similarity scores with all other movies
    sim_scores = list(enumerate(cosine_sim[idx]))
    print(f"📊 Calculated cosine similarity for {len(sim_scores)} movies.")

    # Step 4: Remove the matched movie by position, then sort in descending order.
    # Dropping "whatever sorts first" is not reliable: another movie can tie with
    # the query at the top, and a movie whose vector is all zeros scores 0
    # against itself.
    others = [pair for pair in sim_scores if pair[0] != idx]
    top_n = max(num_recommendations, 0)  # a negative count must not slice from the end
    sim_scores = sorted(others, key=lambda pair: pair[1], reverse=True)[:top_n]

    print("📈 Top similar movie scores (excluding input movie):")
    for i, score in sim_scores:
        print(f"    - {df['title'].iloc[i]} (score: {score:.4f})")

    # Step 5: Extract recommended movie indices
    movie_indices = [pair[0] for pair in sim_scores]

    # Step 6: Return top recommended movie titles
    recommendations = df["title"].iloc[movie_indices].tolist()

    print(f"\n✅ Recommendations for '{actual_title}': {recommendations}")
    return recommendations



# Takes a movie title and finds its row position (via fuzzy matching).

# Gets its cosine similarity row (its similarity with every movie).

# Removes the matched movie itself, sorts the rest and keeps the top N (5 by default).

# Returns their titles.

# 🛡️ Note: the matched movie is excluded by its row position, not by assuming it sorts first.

In [54]:
# ✅ STEP 9: Run the recommender on a sample title
movie_title = "spectre"  # Any title (or close misspelling) from the dataset works here
print(f"\n🎬 Top recommendations similar to '{movie_title}':")
recommendations = recommend_movies(movie_title)
# Print the results as a numbered list, starting at 1
for rank, title in enumerate(recommendations, start=1):
    print(f"{rank}. {title}")


🎬 Top recommendations similar to 'spectre':

🎬 Starting movie recommendation...

🧠 Matching input title: 'spectre' with fuzzy logic...
✅ Closest match found: 'Spectre' with similarity score: 85.71428571428572
🔢 Index of matched movie 'Spectre' is: 2
📊 Calculated cosine similarity for 4800 movies.
📈 Top similar movie scores (excluding input movie):
    - Never Say Never Again (score: 0.2621)
    - From Russia with Love (score: 0.2108)
    - Thunderball (score: 0.1989)
    - Quantum of Solace (score: 0.1784)
    - Safe Haven (score: 0.1653)

✅ Recommendations for 'Spectre': ['Never Say Never Again', 'From Russia with Love', 'Thunderball', 'Quantum of Solace', 'Safe Haven']
1. Never Say Never Again
2. From Russia with Love
3. Thunderball
4. Quantum of Solace
5. Safe Haven


In [55]:
# Persist the DataFrame, the similarity matrix and the title lookup together as one tuple
with open("recommender.pkl", "wb") as model_file:
    pickle.dump((df, cosine_sim, indices), model_file)

print("✅ Pickled model saved as recommender.pkl")

✅ Pickled model saved as recommender.pkl


In [44]:
# Cosine Similarity is a way to measure how similar two things are, based on the angle between them — not their length.

# Imagine:

# Each movie is a point in space.

# The textual metadata (like “action alien war”) is converted into a vector (a long arrow).

# The smaller the angle between two movie vectors, the more similar they are.

#     Let’s say:

# You and your friend describe two movies using words like: "action", "romance", "alien", "comedy", etc.

# You count how many times each word is used and put it in a table (a vector).
# | Word    | Avatar (A) | Titanic (B) |
# | ------- | ---------- | ----------- |
# | action  | 2          | 0           |
# | alien   | 1          | 0           |
# | romance | 1          | 2           |
# | ship    | 0          | 1           |

#      Represent as vectors:

# Avatar = [2, 1, 1, 0]

# Titanic = [0, 0, 2, 1]

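# Now what?
# Cosine similarity will:

# Ignore the length of the vectors (how many words were counted in total)

# Focus on direction (i.e., the relative mix of words, not the raw counts)

# Return a number between 0 and 1 (for non-negative vectors such as word counts or TF-IDF weights):

# 1 → same words used in the same proportions (most similar)

# 0 → no words in common (totally different)

# For the table above: cos(A, B) = (2·0 + 1·0 + 1·2 + 0·1) / (√6 · √5) = 2 / √30 ≈ 0.37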
# You have a TF-IDF vector for each movie → like [0.1, 0.4, 0, 0.6, 0, ...]

# Cosine similarity is computed between every pair of movies

# The result is a matrix of size N x N (e.g., 1000x1000 if 1000 movies)

# cosine_sim[i][j] = similarity score between movie i and j