### 📝 Step 1: **Import Necessary Libraries**

In [None]:
# Importing all required libraries
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

### 📥 Step 2: **Load the Datasets**

In [None]:
# Read the two TMDB 5000 CSV exports from the working directory
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Peek at a single row of the movie metadata to check its columns
movies.head(1)

### 🔗 Step 3: **Merge Datasets on Title**

In [None]:
# Merge credits and movies datasets on 'title'.
# Titles are not guaranteed to be unique, so a title-only join pairs every
# same-titled movie with every same-titled credits row. Keep only the pairs
# whose ids agree so that each movie carries its own cast and crew.
# NOTE(review): assumes the movies CSV stores the TMDB id in an 'id' column
# and the credits CSV in 'movie_id' — confirm against the files.
merged = movies.merge(credits, on='title')
# reset_index keeps row labels equal to row positions, which the
# recommendation lookup further down relies on.
movies = merged[merged['id'] == merged['movie_id']].reset_index(drop=True)
movies.head(1)

### 🧹 Step 4: **Select Relevant Features**

In [None]:
# Select only useful columns.
# .copy() makes this an independent frame: the later cells assign new values
# to its columns, and doing that on a bare column subset makes pandas emit
# SettingWithCopyWarning.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()
movies.head(1)

### 🔄 Step 5: **Define Helper Function to Extract Names**

In [None]:
# Function to extract 'name' from a stringified list of dicts
def convert(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text holding a Python-literal list of dicts that each have a 'name'
        key, e.g. '[{"id": 28, "name": "Action"}]'. ``None`` and NaN (the
        value pandas uses for an empty CSV cell) are treated as an empty list.

    Returns
    -------
    list
        The 'name' values, in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is a string that ``ast.literal_eval`` cannot parse.
    KeyError
        If an entry has no 'name' key.
    """
    # NaN is the only value that compares unequal to itself.
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []
    return [entry['name'] for entry in ast.literal_eval(obj)]

### 🧬 Step 6: **Extract Genres, Keywords, Cast, Crew**

In [None]:
# Parse the stringified list columns into plain Python lists of names
def _leading_cast(raw, limit=3):
    """Names of the first `limit` entries of a stringified cast list."""
    return [member['name'] for member in ast.literal_eval(raw)[:limit]]


def _directors(raw):
    """Names of the crew entries whose 'job' is 'Director'."""
    return [member['name'] for member in ast.literal_eval(raw) if member['job'] == 'Director']


# Genres and keywords keep every name
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# Cast is limited to its first three entries, crew to directors only
movies['cast'] = movies['cast'].apply(_leading_cast)
movies['crew'] = movies['crew'].apply(_directors)

# Show the first row after the transformation
movies.head(1)

### 🏷️ Step 7: **Create Tags Column by Combining Genres, Keywords, Cast, and Director**

In [None]:
# Concatenate each movie's genre, keyword, cast and crew name lists, then
# flatten them into one lowercase, space-separated 'tags' string
combined_names = movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies['tags'] = combined_names.apply(lambda names: " ".join(names).lower())

# Keep only the columns carried forward from here
movies = movies[['movie_id', 'title', 'overview', 'tags']]
movies.head()

### 📚 Step 8: **Vectorize Tags Using TF-IDF**

In [None]:
# Vectorize using TF-IDF
# stop_words='english' removes scikit-learn's built-in English stop-word list
# from the vocabulary before the weights are computed.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the 'tags' text and encode it in one pass;
# the result has one row per row of `movies`.
tfidf_matrix = tfidf.fit_transform(movies['tags'])

### 🧠 Step 9: **Compute Cosine Similarity Matrix**

In [None]:
# Pairwise cosine similarity between all TF-IDF rows; called with a single
# argument, cosine_similarity compares the matrix against itself
cosine_sim = cosine_similarity(tfidf_matrix)

### 🎯 Step 10: **Define Recommendation Function**

In [None]:
# Recommend the most similar movies based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    cosine_sim : 2-D array-like
        Square similarity matrix whose rows and columns follow the row order
        of ``movies``. Defaults to the module-level matrix.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        The recommended titles ordered from most to least similar, or the
        string "Movie not found in database." when `title` is absent.
    """
    title_hits = (movies['title'] == title).to_numpy().nonzero()[0]
    if len(title_hits) == 0:
        return "Movie not found in database."

    # Use the row *position* throughout: the similarity matrix and .iloc are
    # both positional, so an index label must not be mixed in.
    pos = int(title_hits[0])

    # Exclude the query movie by position rather than assuming it sorts first.
    # Another row can tie with it at the top score, and a movie with an
    # all-zero vector scores 0 even against itself.
    candidates = [(i, score) for i, score in enumerate(cosine_sim[pos]) if i != pos]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [i for i, _ in candidates[:top_n]]
    return movies['title'].iloc[movie_indices]

### 🔍 Step 11: **Test Recommendation Function**

In [None]:
# Try the recommender on a known title and print what it returns
recommendations = get_recommendations('The Dark Knight Rises')
print(recommendations)

### 💾 Step 12: **Save Data and Similarity Matrix with Pickle**

In [None]:
# Persist the movie table together with the similarity matrix as one tuple
payload = (movies, cosine_sim)
with open('movie_data.pkl', 'wb') as out_fh:
    pickle.dump(payload, out_fh)