In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import gc


In [None]:
# Read the four source CSV files (movies, ratings, tags, links - in that order)
movies, ratings, tags, links = map(
    pd.read_csv,
    (
        'movies_datasets/movies.csv',
        'movies_datasets/ratings.csv',
        'movies_datasets/tags.csv',
        'movies_datasets/links.csv',
    ),
)

# Print the first rows of every frame under its own heading
for heading, frame in (
    ("Movies:\n", movies),
    ("\nRatings:\n", ratings),
    ("\nTags:\n", tags),
    ("\nLinks:\n", links),
):
    print(heading, frame.head())
del heading, frame  # keep the loop helpers out of the notebook namespace

In [None]:
# Data Preprocessing
def merge_in_chunks(ratings, movies, tags, chunk_size=5000):
    # First merge movies (usually smaller) with tags
    movies_with_tags = movies.merge(tags, on='movieId', how='left')
    
    # Process ratings in chunks and merge with movies_with_tags
    merged_chunks = []
    
    # Process in smaller chunks
    for start in range(0, len(ratings), chunk_size):
        # Get a chunk of ratings
        ratings_chunk = ratings.iloc[start:start + chunk_size]
        
        # Merge this chunk with movies_with_tags
        chunk_result = ratings_chunk.merge(
            movies_with_tags[['movieId', 'title', 'genres', 'tag']], 
            on='movieId', 
            how='left'
        )
        
        # Append to results
        merged_chunks.append(chunk_result)
        
        # Clear memory
        del chunk_result
        gc.collect()
    
    # Combine all chunks
    return pd.concat(merged_chunks, ignore_index=True)

# Optimize memory usage before processing
def optimize_df(df):
    """Shrink 64-bit numeric columns of ``df`` to 32-bit to reduce memory use.

    float64 columns are cast to float32 (this reduces precision). int64
    columns are cast to int32 only when every value fits into the int32
    range; columns holding larger values are left as int64 so that nothing
    overflows silently. Columns of any other dtype are not touched.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    int32_limits = np.iinfo(np.int32)

    for col in df.columns:
        column = df[col]
        if column.dtype == 'float64':
            df[col] = column.astype('float32')
        elif column.dtype == 'int64':
            # A plain astype('int32') wraps around on out-of-range values,
            # so check the bounds before narrowing. Empty columns have no
            # min/max and are always safe to cast.
            fits_int32 = column.empty or (
                column.min() >= int32_limits.min and column.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = column.astype('int32')
    return df

# Optimize all dataframes
ratings = optimize_df(ratings)
movies = optimize_df(movies)
tags = optimize_df(tags)

# Keep only necessary columns
ratings = ratings[['userId', 'movieId', 'rating']]
movies = movies[['movieId', 'title', 'genres']]
tags = tags[['movieId', 'userId', 'tag']]

# Clear memory
gc.collect()

try:
    # Perform the chunked merge with smaller chunk size
    merged = merge_in_chunks(ratings, movies, tags, chunk_size=5000)

    # Post-processing steps
    merged['tag'] = merged['tag'].fillna('')  # Replace NaN in tags with empty strings
    # The merge is a left join on ratings, so a rating without a matching movie
    # has NaN genres. NaN + str stays NaN, which would put non-string values
    # into 'content' and break text vectorisation later - treat it as empty.
    merged['content'] = merged['genres'].fillna('') + ' ' + merged['tag']

    # Preview merged data
    print("\nMerged data preview:")
    print(merged.head())
    print("\nMerged shape:", merged.shape)

except MemoryError:
    print("Still experiencing memory issues. Consider these options:")
    print("1. Reduce the dataset size by sampling")
    print("2. Process the data in even smaller chunks")
    print("3. Use a machine with more RAM")
    # Bare raise re-raises the active exception with its original traceback
    raise

# Final cleanup
gc.collect()

In [None]:
# Content-Based Filtering (Recommendation Based on Movie Metadata)
def content_based_recommendations(movie_title, n=10):
    """Return the titles of the movies whose genres/tags are most similar to ``movie_title``.

    The module-level ``merged`` frame holds one row per rating (times tag), so
    it is first collapsed to a single text document per movie: the movie's
    genres followed by its distinct tags. Those documents are turned into
    word-count vectors and ranked by cosine similarity to the query movie.

    Args:
        movie_title: Exact value of the 'title' column to look up. If several
            movies share the title, the first one encountered is used.
        n: Maximum number of recommendations; values <= 0 give an empty list.

    Returns:
        List of up to ``n`` titles of other movies, most similar first. Ties
        keep the order in which the movies first appear in ``merged``.

    Raises:
        IndexError: If ``movie_title`` does not occur in ``merged``.
    """
    # One row per (movie, distinct tag). Vectorising the rating-level rows
    # directly would compare a movie mostly with copies of itself and would
    # need a similarity matrix with one row per rating.
    movie_tags = (
        merged[['movieId', 'title', 'genres', 'tag']]
        .dropna(subset=['title'])
        .fillna({'genres': '', 'tag': ''})
        .astype({'genres': str, 'tag': str})
        .drop_duplicates()
    )

    # Collapse to one document per movie: genres once, then all its tags
    movie_docs = (
        movie_tags
        .groupby(['movieId', 'title', 'genres'], sort=False)['tag']
        .agg(' '.join)
        .reset_index()
    )
    movie_docs['content'] = (movie_docs['genres'] + ' ' + movie_docs['tag']).str.strip()

    # Get index of the movie (checked before the expensive vectorisation)
    matches = np.flatnonzero((movie_docs['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    movie_idx = int(matches[0])

    # Vectorize the content
    count_vectorizer = CountVectorizer(stop_words='english')
    count_matrix = count_vectorizer.fit_transform(movie_docs['content'])

    # Only the query movie's similarity row is needed, not the full N x N matrix
    scores = cosine_similarity(count_matrix[movie_idx], count_matrix).ravel()

    # Highest score first; the stable sort keeps catalogue order among ties.
    # The query movie is removed by position rather than by assuming it ranks first.
    ranking = np.argsort(-scores, kind='stable')
    ranking = ranking[ranking != movie_idx][:max(n, 0)]

    return movie_docs['title'].iloc[ranking].tolist()

# Demo: the five titles closest in content to Toy Story
print(content_based_recommendations(movie_title="Toy Story (1995)", n=5))


In [None]:
# Collaborative Filtering (User-Based Recommendations)
def collaborative_filtering(user_id, n=10):
    """Recommend movies that the users most similar to ``user_id`` have rated.

    Users are compared by the cosine similarity of their rating vectors, built
    from the module-level ``ratings`` frame as a sparse user x movie matrix.
    Neighbours are visited from most to least similar; each contributes the
    movies the target user has not rated, highest-rated first, until ``n``
    distinct movies have been collected. Titles come from the module-level
    ``movies`` frame; movie ids without an entry there are skipped.

    Args:
        user_id: Value of 'userId' to recommend for.
        n: Maximum number of recommendations; values <= 0 give an empty list.

    Returns:
        List of up to ``n`` distinct movie titles.

    Raises:
        KeyError: If ``user_id`` does not occur in ``ratings``.
    """
    # A (user, movie) pair must map to a single cell; keep the last rating if
    # the data contains repeats.
    user_ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')

    # Map raw ids to consecutive row / column positions
    user_codes, user_index = pd.factorize(user_ratings['userId'], sort=True)
    movie_codes, movie_index = pd.factorize(user_ratings['movieId'], sort=True)
    target_pos = user_index.get_loc(user_id)  # KeyError for an unknown user

    if n <= 0:
        return []

    # Create a sparse user-item matrix: unrated movies take no memory
    rating_values = user_ratings['rating'].fillna(0).to_numpy(dtype='float64')
    user_item_matrix = csr_matrix(
        (rating_values, (user_codes, movie_codes)),
        shape=(len(user_index), len(movie_index)),
    )
    # Explicit zeros mean "not rated", so drop them from the stored entries
    user_item_matrix.eliminate_zeros()

    # Similarity of the target user to every user (a single row, not users x users)
    similarity = cosine_similarity(user_item_matrix[target_pos], user_item_matrix).ravel()

    # Most similar first; the target is removed by position because another
    # user with identical ratings could otherwise be ranked ahead of it.
    neighbours = np.argsort(-similarity, kind='stable')
    neighbours = neighbours[neighbours != target_pos]

    # First title per movie id, looked up by label
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']

    # Movie positions that must not be recommended: already rated or already picked
    excluded = set(user_item_matrix[target_pos].indices.tolist())
    recommended_movies = []

    for neighbour_pos in neighbours:
        neighbour_row = user_item_matrix[neighbour_pos]
        # Order this neighbour's movies by rating (descending), then by column
        # position, so the output is deterministic.
        order = np.lexsort((neighbour_row.indices, -neighbour_row.data))
        for movie_pos in neighbour_row.indices[order].tolist():
            if movie_pos in excluded:
                continue
            excluded.add(movie_pos)
            title = title_by_id.get(movie_index[movie_pos])
            if title is None:
                continue  # rated movie that has no row in ``movies``
            recommended_movies.append(title)
            if len(recommended_movies) >= n:
                return recommended_movies

    return recommended_movies

# Demo: five suggestions for the user with id 1
print(collaborative_filtering(user_id=1, n=5))


In [None]:
# Write both recommender callables to pickle files in the working directory
import pickle

# NOTE(review): pickle serialises a plain function as a reference to its
# defining module and name, not its code or the data it reads. Loading these
# files therefore needs the same functions (and the frames they use) to be
# defined in the loading process - confirm this is the intended artefact.
with open('content_based_model.pkl', mode='wb') as cb_file:
    cb_file.write(pickle.dumps(content_based_recommendations))

with open('collaborative_filtering_model.pkl', mode='wb') as cf_file:
    cf_file.write(pickle.dumps(collaborative_filtering))

print("Models saved successfully.")
