In [22]:
# Imports
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Load and merge datasets
def load_and_merge_data(movies_path, credits_path):
    """Read the movies and credits CSV files and join them on the movie id.

    Parameters
    ----------
    movies_path : path or file-like
        CSV source passed to ``pd.read_csv``; must contain an ``id`` column.
    credits_path : path or file-like
        CSV source passed to ``pd.read_csv``; its ``movie_id`` column (if
        present) is renamed to ``id`` before the join.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two tables on ``id``. When the join produced a
        suffixed title column, it is renamed to ``title`` (``title_x`` takes
        precedence over ``title_y``).
    """
    movies_frame = pd.read_csv(movies_path)
    credits_frame = pd.read_csv(credits_path).rename(columns={'movie_id': 'id'})
    combined = pd.merge(movies_frame, credits_frame, on='id')

    # Both inputs may carry a title column, in which case pandas suffixes
    # them; promote the first suffixed name found back to plain 'title'.
    for suffixed in ('title_x', 'title_y'):
        if suffixed in combined.columns:
            return combined.rename(columns={suffixed: 'title'})

    return combined


In [30]:
# Parse features
def parse_features(df):
    """Build a lower-cased ``tags`` text column from the raw metadata columns.

    The ``genres``, ``keywords``, ``cast`` and ``crew`` columns are expected to
    hold Python-literal strings describing a list of dicts. From them this
    function extracts all genre names, all keyword names, the first three cast
    names and the first crew member whose ``job`` is ``'Director'``, and joins
    them (space separated) after the ``overview`` text.

    The input frame is NOT modified, so the function can be called repeatedly
    on the same frame (e.g. when a notebook cell is re-run) with the same
    result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``title``, ``overview``, ``genres``,
        ``keywords``, ``cast`` and ``crew``.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``id``, ``title`` and ``tags`` sharing the
        index of ``df``. Cells that cannot be parsed contribute no names.
    """
    # Errors a malformed / missing cell can raise while being parsed or while
    # its items are indexed. Listed explicitly so that KeyboardInterrupt,
    # SystemExit and genuine programming errors are no longer swallowed.
    parse_errors = (ValueError, SyntaxError, TypeError, KeyError, RecursionError)

    def convert(obj):
        # All 'name' values of the list-of-dicts literal; [] if unparsable.
        try:
            return [item['name'] for item in ast.literal_eval(obj)]
        except parse_errors:
            return []

    def get_top_3_cast(obj):
        # Same extraction as convert(), limited to the first three entries.
        return convert(obj)[:3]

    def get_director(obj):
        # Single-element list with the first director's name, else [].
        try:
            for member in ast.literal_eval(obj):
                if member['job'] == 'Director':
                    return [member['name']]
        except parse_errors:
            pass
        return []

    def joined(column, extractor):
        # Space-joined names for every row of one column, as a new Series.
        return df[column].apply(lambda raw: ' '.join(extractor(raw)))

    overview = df['overview'].fillna('')
    tags = (
        overview + ' '
        + joined('genres', convert) + ' '
        + joined('keywords', convert) + ' '
        + joined('cast', get_top_3_cast) + ' '
        + joined('crew', get_director)
    ).str.lower()

    # Assemble a fresh frame instead of writing columns back into `df`.
    return pd.DataFrame({'id': df['id'], 'title': df['title'], 'tags': tags})

In [31]:
# Compute similarity matrix
def compute_similarity(df, max_features=5000):
    """Return the pairwise cosine similarity of the rows' ``tags`` text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tags`` column of strings.
    max_features : int, optional
        Vocabulary size cap passed to ``CountVectorizer`` (default 5000).

    Returns
    -------
    numpy.ndarray
        Square matrix whose entry ``[i, j]`` is the cosine similarity between
        the bag-of-words vectors of row ``i`` and row ``j`` (by position).
    """
    vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
    # Keep the term counts sparse: cosine_similarity accepts sparse input and
    # still returns a dense array, so materialising an
    # (n_rows x max_features) dense matrix first only costs memory.
    term_counts = vectorizer.fit_transform(df['tags'])
    return cosine_similarity(term_counts)

In [32]:
# Main pipeline
def main(movies_path='data/tmdb_5000_movies.csv',
         credits_path='data/tmdb_5000_credits.csv',
         n_movies=500):
    """Run the whole pipeline and pickle its two artefacts.

    Loads and merges the two CSV files, builds the ``tags`` column, keeps the
    first ``n_movies`` rows, computes their similarity matrix and writes
    ``movie_list.pkl`` (the reduced frame) and ``similarity.pkl`` (the matrix)
    to the current working directory.

    Parameters
    ----------
    movies_path : str, optional
        Location of the movies CSV.
    credits_path : str, optional
        Location of the credits CSV.
    n_movies : int or None, optional
        Number of leading rows to keep, which bounds the size of the
        n_movies x n_movies similarity matrix. ``None`` keeps every row.
    """
    print("Loading and merging data...")
    df = load_and_merge_data(movies_path, credits_path)

    print("Parsing features...")
    df = parse_features(df)

    # Reduce number of movies for optimized memory usage
    df_small = df if n_movies is None else df.head(n_movies)

    print("Computing similarity matrix...")
    similarity = compute_similarity(df_small)

    print("Saving files...")
    # The two pickles belong together: row i of the matrix corresponds to the
    # i-th row of the saved frame.
    with open('movie_list.pkl', 'wb') as f:
        pickle.dump(df_small, f)

    with open('similarity.pkl', 'wb') as f:
        pickle.dump(similarity, f)

    print("Done. Files saved: movie_list.pkl and similarity.pkl.")

# Run the full pipeline when this cell executes.
main()

Loading and merging data...
Parsing features...
Computing similarity matrix...
Saving files...
Done. Files saved: movie_list.pkl and similarity.pkl.
