**Movie Recommendation System with SVD (Singular Value Decomposition)**

**Overview**

This project demonstrates a movie recommendation system built with the SVD matrix-factorization algorithm from the Surprise library. The model is trained on user ratings from the MovieLens dataset (`ratings_small.csv`), joined with movie metadata (`movies_metadata.csv`) for exploration and title lookup. The goal is to provide personalized movie recommendations to each user based on their past ratings.

**Steps and Code Implementation**

# 1. Import Libraries

In [None]:
import ast
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Shared colour list: handed to seaborn as the default palette below and
# reused directly by the individual charts.
palette = [
    '#8abf87', '#ffb6b9', '#ffe156', '#6a74b9',
    '#ff9a8b', '#d8e2dc', '#f0a6ca',
]
sns.set_theme(style='white', context='notebook', palette=palette)

# Ignore every FutureWarning raised from this point on.
warnings.simplefilter('ignore', category=FutureWarning)


# 2. Load and Merge Datasets

In [None]:
movies_metadata_path = 'movies_metadata.csv'
ratings_small_path = 'ratings_small.csv'

movies = pd.read_csv(movies_metadata_path, low_memory=False)
ratings = pd.read_csv(ratings_small_path)

# Clean the join key without mutating in place:
#   * ids that cannot be parsed as numbers become NaN and those rows are dropped;
#   * only one metadata row is kept per id, otherwise every rating for a
#     repeated id would be duplicated by the inner merge below (inflating the
#     data and letting the same rating land in both the train and test split).
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .drop_duplicates(subset='id')
)

# NOTE(review): this join assumes ratings.movieId and movies.id use the same
# id scheme. If the ratings carry MovieLens ids while the metadata carries
# ids from another catalogue, a mapping table is needed here - confirm
# against the dataset documentation.
# validate='many_to_one' makes pandas raise if the right-hand key is ever
# non-unique again, instead of silently multiplying rating rows.
merged_data = pd.merge(
    ratings,
    movies,
    left_on='movieId',
    right_on='id',
    how='inner',
    validate='many_to_one',
)

# Keep only the columns used by the EDA and modelling cells.
merged_data = merged_data[['userId', 'movieId', 'rating', 'genres', 'timestamp', 'budget', 'imdb_id',
                           'original_language', 'popularity', 'revenue', 'runtime', 'title',
                           'vote_average', 'vote_count', 'belongs_to_collection', 'release_date']]

# 3. Exploratory Data Analysis (EDA)

**Distribution of Movie Genres**

In [None]:
# `genres` holds the string form of a list of dicts; parse it and keep only
# each dict's 'name'. Missing values are treated as an empty list.
genre_names = (
    merged_data['genres']
    .fillna('[]')
    .apply(ast.literal_eval)
    .apply(lambda genre_list: [genre['name'] for genre in genre_list])
)

# One 0/1 indicator column per genre, appended in place of the raw column.
genre_dummies = genre_names.str.join('|').str.get_dummies()
merged_data = pd.concat([merged_data.drop(columns=['genres']), genre_dummies], axis=1)

# merged_data has one row per *rating*, so summing every row would count
# ratings per genre. The chart is labelled "Number of Movies", so count each
# movieId once by keeping only its first row.
first_row_per_movie = ~merged_data['movieId'].duplicated()
genre_counts = genre_dummies.loc[first_row_per_movie].sum().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette=palette)
plt.title('Distribution of Movie Genres')
plt.xlabel('Number of Movies')
plt.ylabel('Genres')
plt.show()

**Distribution of User Ratings**

In [None]:
# Histogram of every rating row, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=merged_data['rating'], bins=10, kde=False, color=palette[0], ax=ax)
ax.set_title('Distribution of User Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

**Correlation Heatmap of Numeric Features**

In [None]:
numeric_features = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']

# These columns come straight from the metadata CSV and are never cast after
# loading (the `id` column of the same file needed explicit coercion), so
# some may still be stored as text. Coerce them explicitly - unparseable
# values become NaN and are skipped pairwise by `.corr()` - instead of
# depending on how the installed pandas version treats non-numeric columns.
numeric_frame = merged_data[numeric_features].apply(pd.to_numeric, errors='coerce')
correlation_matrix = numeric_frame.corr()

plt.figure(figsize=(10, 8))
# Fix the colour range to the full [-1, 1] correlation interval so the
# colours are comparable between runs.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Numeric Features')
plt.show()

**User Rating Behavior**

In [None]:
# Per-user summary: average rating given ('mean') and number of ratings ('count').
ratings_by_user = merged_data.groupby('userId')['rating']
user_ratings = ratings_by_user.agg(['mean', 'count'])

# One point per user: how many ratings they gave vs. how high they rate on average.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=user_ratings['count'], y=user_ratings['mean'], alpha=0.7, ax=ax)
ax.set_title('User Rating Behavior')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Average Rating')
ax.grid(True)
plt.show()

# 4. Data Preparation for SVD Model

In [None]:
# Surprise's load_from_df expects exactly three columns in this order:
# user id, item id, rating.
svd_data = merged_data[['userId', 'movieId', 'rating']]
assert not svd_data.empty, "no ratings left after merging - check the join keys"

# Take the rating scale from the data rather than hard-coding (0, 5).
# Surprise clips every prediction to this interval, so a lower bound below
# the smallest real rating would allow estimates no user could actually give.
rating_min = float(svd_data['rating'].min())
rating_max = float(svd_data['rating'].max())
reader = Reader(rating_scale=(rating_min, rating_max))

data = Dataset.load_from_df(svd_data, reader)

# Fixed random_state so the 80/20 split is the same on every run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 5. Build the SVD Model

In [None]:
# Seed the model: SVD initialises its factors randomly, so without a
# random_state the reported RMSE and the recommendations change on every
# Restart & Run All.
svd = SVD(random_state=42)

svd.fit(trainset)

# Score the held-out 20% of ratings.
predictions = svd.test(testset)

# verbose=False: accuracy.rmse prints the score itself by default, which
# would show the same number twice next to the labelled print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Test RMSE: {rmse}")

# 6. Generate Top-N Recommendations

In [None]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    Args:
        predictions: iterable of 5-item records
            ``(uid, iid, true_rating, estimate, details)``; only ``uid``,
            ``iid`` and ``estimate`` are used.
        n: maximum number of items kept per user.

    Returns:
        dict mapping each ``uid`` to a list of ``(iid, estimate)`` tuples,
        sorted by estimate from highest to lowest and cut to ``n`` entries.
        Items with equal estimates keep their input order.
    """
    grouped = {}
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        grouped.setdefault(uid, []).append((iid, est))

    return {
        uid: sorted(scored_items, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, scored_items in grouped.items()
    }

user_id = 3

# Recommend from movies the user has NOT rated. Ranking the held-out test
# predictions instead would only re-order movies this user already rated,
# and would raise KeyError for any user with no rows in the test split.
rated_movie_ids = set(svd_data.loc[svd_data['userId'] == user_id, 'movieId'])
candidate_movie_ids = [
    movie_id
    for movie_id in svd_data['movieId'].unique().tolist()
    if movie_id not in rated_movie_ids
]
candidate_predictions = [svd.predict(user_id, movie_id) for movie_id in candidate_movie_ids]

# Same {uid: [(iid, est), ...]} shape as before; it now holds the requested user only.
top_n = get_top_n(candidate_predictions, n=10)

# .get(): a user who has already rated every movie has no candidates at all.
user_top_n = top_n.get(user_id, [])
recommended_movie_ids = [iid for (iid, _) in user_top_n]

# Look titles up in rank order. An `isin` filter would return them in the
# metadata file's order and once per metadata row sharing an id.
unique_movies = movies.drop_duplicates(subset='id')
titles_by_id = pd.Series(
    unique_movies['title'].to_numpy(),
    index=unique_movies['id'].astype('int64').to_numpy(),
)
recommended_movies = pd.DataFrame({
    'movieId': recommended_movie_ids,
    'title': titles_by_id.reindex(recommended_movie_ids).to_numpy(),
    'estimated_rating': [est for (_, est) in user_top_n],
})

print(f"Top recommended movies for user {user_id}:")
print(recommended_movies)