In [3]:
# Notebook: preprocess -> build movie_list.pkl and similarity.pkl
import os, ast, pickle
# Show the working directory: the CSV inputs and the `model/` output directory
# used later in this notebook are given as relative paths and resolve against it.
print(os.getcwd())

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. load
# Both CSV paths are relative, so they resolve against the working directory.
movies_csv = 'tmdb_5000_movies.csv'
credits_csv = 'tmdb_5000_credits.csv'
movies = pd.read_csv(movies_csv)
credits = pd.read_csv(credits_csv)

# 2. merge on title
movies = movies.merge(credits, on='title')

# Titles are not unique keys: when several movies share a title, the merge
# above pairs each of them with every same-titled credits row, which attaches
# the wrong cast/crew/movie_id to some rows. Keep only the pairs whose ids agree.
# NOTE(review): assumes the movies CSV stores the same identifier as the
# credits `movie_id` in a column named `id` -- confirm against the data. The
# guard leaves the plain title merge untouched when no such column exists.
if 'id' in movies.columns:
    movies = movies[movies['id'] == movies['movie_id']]

# 3. keep only useful columns
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

# Drop rows with NaNs in these fields, then renumber the index from 0.
# Without the renumbering the surviving rows keep their pre-drop labels, so a
# row's index label stops matching its position, while the similarity matrix
# computed from this frame later in the notebook is ordered by row position.
# Rebinding (instead of inplace=True) also avoids mutating a column slice.
movies = movies.dropna().reset_index(drop=True)

# helper for the columns stored as stringified lists of dicts
def parse_names(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `text` is parsed with ast.literal_eval; entries that have no 'name' key
    are skipped. The order of the remaining names follows the input.
    """
    entries = ast.literal_eval(text)
    return [entry['name'] for entry in entries if 'name' in entry]

# convert columns: replace each raw string with the list of names it contains
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(parse_names)

# cast: keep the names found among the leading cast entries (first 3 by default)
def parse_cast(text, limit=3):
    """Return the names found among the first `limit` entries of a cast list.

    Args:
        text: String holding a Python-literal list of dicts; it is parsed
            with ast.literal_eval.
        limit: Number of leading entries to inspect. Defaults to 3; None
            inspects every entry. Expected to be non-negative.

    Returns:
        The 'name' values of the inspected entries, in input order. An
        inspected entry without a 'name' key is skipped rather than replaced
        by a later one, so the result may hold fewer than `limit` names.

    Any exception ast.literal_eval raises for malformed `text` propagates.
    """
    # Slice first so the remainder of a long cast list is never visited.
    leading = ast.literal_eval(text)[:limit]
    return [member['name'] for member in leading if 'name' in member]
# Replace each raw cast string with the list of names parse_cast extracts.
movies['cast'] = movies['cast'].map(parse_cast)

# crew: keep only the people whose job is 'Director'
def fetch_director(text):
    """Return the names of all crew entries whose 'job' equals 'Director'.

    `text` is parsed with ast.literal_eval into a list of dicts. An entry is
    kept only when its 'job' is exactly 'Director' and it has a 'name' key;
    every matching entry is returned, in input order.
    """
    crew_members = ast.literal_eval(text)
    return [
        member['name']
        for member in crew_members
        if member.get('job') == 'Director' and 'name' in member
    ]
# Replace each raw crew string with the list of director names it contains.
movies['crew'] = movies['crew'].map(fetch_director)

# turn multi-word entries into single tokens by deleting their spaces
def collapse(L):
    """Return a new list with every space character removed from each string.

    Only the plain space ' ' is removed; other whitespace is left in place.
    The input list itself is not modified.
    """
    # Splitting on ' ' and re-joining with '' deletes exactly the spaces.
    return ["".join(token.split(" ")) for token in L]

# Strip spaces from the entries of every list-valued feature column.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(collapse)

# overview -> list of whitespace-separated words
movies['overview'] = movies['overview'].apply(str.split)

# create tags: concatenate the five per-movie lists into one list per row
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# Work on a copy so the tag strings below do not overwrite the lists in `movies`.
movies_final = movies[['movie_id','title','tags']].copy()
# Flatten each tag list into one space-separated string.
movies_final['tags'] = movies_final['tags'].apply(" ".join)

# 4. vectorize and compute similarity
# Count-based vectors limited to 5000 features, with English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(movies_final['tags'])
vectors = tag_counts.toarray()
# Pairwise cosine similarity between the rows of `vectors`
# (one row per row of movies_final, in the same order).
similarity = cosine_similarity(vectors)

# Create `model/` directory and save files
os.makedirs('model', exist_ok=True)

# Write through context managers so each file is flushed and closed as soon as
# its dump finishes, instead of whenever the file object is garbage-collected.
with open('model/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(movies_final, movie_list_file)
with open('model/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

print("Saved model/movie_list.pkl and model/similarity.pkl")



c:\Users\dell\Desktop\movie_recommendation
Saved model/movie_list.pkl and model/similarity.pkl
