# In [7]:
# Notebook: preprocess -> build movie_list.pkl + gzip-compressed similarity_compressed.pkl.gz

import os, ast, pickle, gzip
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Working directory:", os.getcwd())

# 1. load raw files (paths are resolved relative to the working directory
#    printed above)
movies_csv = 'tmdb_5000_movies.csv'
credits_csv = 'tmdb_5000_credits.csv'

movies = pd.read_csv(movies_csv)
credits = pd.read_csv(credits_csv)

# 2. merge movie metadata with its credits
# Titles are not guaranteed to be unique, and an equality join on a
# non-unique key pairs every movie row with every credits row that shares
# the title -- yielding extra rows whose overview belongs to one film and
# whose cast/crew belong to another.  When both frames expose a numeric
# identifier ('id' in movies, 'movie_id' in credits) join on that instead;
# otherwise fall back to the original title join.
if ('id' in movies.columns
        and 'movie_id' in credits.columns
        and 'movie_id' not in movies.columns):
    # Drop the credits copy of 'title' so the merged frame keeps a single,
    # unsuffixed 'title' column (the one from the movies file).
    movies = movies.merge(
        credits.drop(columns=['title'], errors='ignore'),
        left_on='id',
        right_on='movie_id',
    )
else:
    movies = movies.merge(credits, on='title')

# 3. keep useful columns and discard rows with any missing value in them.
# The non-inplace dropna() rebinds `movies` to a fresh frame instead of
# mutating a column selection of the merged frame in place.
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']].dropna()

# ---- parsing helpers ----
import ast

def parse_names(text):
    """Return the ``'name'`` value of each entry in a stringified list of dicts.

    ``text`` is parsed with ``ast.literal_eval``.  Entries without a
    ``'name'`` key are skipped; the order of the remaining names is kept.
    """
    entries = ast.literal_eval(text)
    return [entry['name'] for entry in entries if 'name' in entry]

def parse_cast(text):
    """Return the names found in the first three entries of a stringified list.

    ``text`` is parsed with ``ast.literal_eval``.  Only positions 0, 1 and 2
    are considered; an entry in those positions that has no ``'name'`` key is
    skipped without being replaced by a later entry, so fewer than three
    names may be returned.
    """
    entries = ast.literal_eval(text)
    return [
        entry['name']
        for position, entry in enumerate(entries)
        if position < 3 and 'name' in entry
    ]

def fetch_director(text):
    """Return the names of all crew entries whose ``'job'`` is ``'Director'``.

    ``text`` is parsed with ``ast.literal_eval`` and is expected to evaluate
    to an iterable of dicts.  Entries with no ``'job'`` key, or with a
    different job, are ignored.  A director entry that lacks a ``'name'`` key
    is skipped rather than raising ``KeyError``, matching how the other
    parsing helpers treat nameless entries.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member.get('job') == 'Director' and 'name' in member
    ]

def collapse(L):
    """Return a new list with every space character removed from each string.

    Used so that multi-word names become a single token.
    """
    squeezed = []
    for item in L:
        # Splitting on a single space and re-joining drops exactly the
        # space characters, nothing else.
        squeezed.append("".join(item.split(" ")))
    return squeezed

# Turn each stringified metadata column into a list of space-free tokens.
movies['genres'] = movies['genres'].apply(lambda raw: collapse(parse_names(raw)))
movies['keywords'] = movies['keywords'].apply(lambda raw: collapse(parse_names(raw)))
movies['cast'] = movies['cast'].apply(lambda raw: collapse(parse_cast(raw)))
movies['crew'] = movies['crew'].apply(lambda raw: collapse(fetch_director(raw)))

# Split the overview text on whitespace so it is a list like the others.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Concatenate the five per-movie lists into one list of tags ...
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# ... then keep only id, title and tags, with the tags joined into a single
# space-separated string.  The copy leaves `movies` holding the list form.
movies_final = movies[['movie_id','title','tags']].copy()
movies_final['tags'] = movies_final['tags'].apply(" ".join)

# 4. vectorize + similarity
# Token counts over the tag strings, limited to 5000 features with English
# stop words removed, converted to a dense array.
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(movies_final['tags']).toarray()

# MAIN FIX -> reduce the pairwise cosine-similarity matrix to float16
# before it is saved.
similarity = cosine_similarity(vectors).astype(np.float16)

# 5. save inside model/ (created on first run; no error if it already exists)
os.makedirs("model", exist_ok=True)

# movie_list: pickle the id/title/tags frame.  The context manager makes
# sure the file handle is flushed and closed as soon as the dump finishes,
# instead of leaving an anonymous open() handle to be closed whenever it
# happens to be garbage-collected.
with open("model/movie_list.pkl", "wb") as f:
    pickle.dump(movies_final, f)

# Gzip-compress the pickled similarity matrix (the original note gives
# "for GitHub" as the reason for compressing).
with gzip.open("model/similarity_compressed.pkl.gz", "wb") as f:
    pickle.dump(similarity, f, protocol=pickle.HIGHEST_PROTOCOL)

print("Saved model/movie_list.pkl and model/similarity_compressed.pkl.gz")


# Output:
# Working directory: c:\Users\dell\Desktop\movie_recommendation
# Saved model/movie_list.pkl and model/similarity_compressed.pkl.gz
