In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the four input tables from CSV files in the working directory,
# in the same order as the names on the left-hand side.
ratings, tags, movies, links = map(
    pd.read_csv,
    ('ratings.csv', 'tags.csv', 'movies.csv', 'links.csv'),
)

Preprocessing

In [16]:

# Turn the pipe-delimited `genres` string of every movie into a list of genre
# names, then add one 0/1 indicator column per genre.
def _to_genre_list(value):
    """Return the genres of a single movie as a list of non-empty strings.

    Accepts the raw pipe-delimited string, a missing value, or an already
    parsed collection, so re-running this cell does not wipe the column.
    """
    if isinstance(value, str):
        # ''.split('|') returns [''], which would otherwise create a bogus
        # genre (and a column) named '' - drop empty tokens.
        return [token for token in value.split('|') if token]
    if isinstance(value, (list, tuple, set)):
        # Already parsed by a previous run of this cell: keep the genres.
        return [token for token in value if token]
    # NaN / None / anything else: the movie has no genre information.
    return []


movies['genres'] = movies['genres'].apply(_to_genre_list)

# Sorted so the indicator columns are created in a deterministic order.
genres_list = sorted({genre for genre_list in movies['genres'] for genre in genre_list})
for genre in genres_list:
    # Bind `genre` as a default argument so the lambda does not depend on the loop variable.
    movies[genre] = movies['genres'].apply(lambda x, genre=genre: int(genre in x))

Aggregating features 

In [17]:
# Group the `rating` column once per key; both statistics are derived from
# the same grouping.
ratings_by_user = ratings.groupby('userId')['rating']
ratings_by_movie = ratings.groupby('movieId')['rating']

# {id: mean rating} for every user and every movie present in `ratings`
user_avg_rating = ratings_by_user.mean().to_dict()
movie_avg_rating = ratings_by_movie.mean().to_dict()

# {id: number of non-null ratings} for every user and every movie
user_rating_count = ratings_by_user.count().to_dict()
movie_rating_count = ratings_by_movie.count().to_dict()


Clustering  

In [18]:
# Create the user-item matrix (rows = users, columns = movies) and its
# movie-user counterpart. Pairs without a rating are filled with 0.
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
movie_item_matrix = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)

# `n_init` is passed explicitly: relying on the library default emits a
# warning and makes the resulting clusters depend on the installed
# scikit-learn version, because that default is changing.

# Cluster users by their rating vectors
kmeans_user = KMeans(n_clusters=10, n_init=10, random_state=42).fit(user_item_matrix)
user_clusters = kmeans_user.predict(user_item_matrix)
# userId -> cluster label
user_cluster_map = dict(zip(user_item_matrix.index, user_clusters))

# Cluster movies by the ratings they received
kmeans_movie = KMeans(n_clusters=10, n_init=10, random_state=42).fit(movie_item_matrix)
movie_clusters = kmeans_movie.predict(movie_item_matrix)
# movieId -> cluster label
movie_cluster_map = dict(zip(movie_item_matrix.index, movie_clusters))


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Rule-based preference 

In [19]:
# Simple rule-based recommendations
def get_similar_movies(movie_id, top_n=5):
    """Return ids of up to `top_n` other movies sharing a genre with `movie_id`.

    "Similar" means the two genre lists have at least one genre in common.
    Candidates are not ranked; the first `top_n` matches in the row order of
    `movies` are returned.

    Parameters
    ----------
    movie_id : value compared against ``movies['movieId']``.
    top_n : int, default 5
        Maximum number of ids to return.

    Returns
    -------
    list
        Matching movie ids. Empty when `movie_id` is not present in `movies`,
        when it has no genres, or when nothing else shares a genre.
    """
    target_genres = movies.loc[movies['movieId'] == movie_id, 'genres']
    if target_genres.empty:
        # Unknown movie: previously this raised IndexError on `.values[0]`.
        return []

    movie_genres = set(target_genres.iloc[0])
    if not movie_genres:
        # Nothing to match on.
        return []

    # isdisjoint() answers "any genre in common?" without building a new set per row.
    shares_genre = movies['genres'].apply(lambda genres: not movie_genres.isdisjoint(genres))
    is_other_movie = movies['movieId'] != movie_id
    return movies.loc[shares_genre & is_other_movie, 'movieId'].head(top_n).tolist()

def get_user_preferences(user_id, top_n=5):
    """Return the ids of the movies a user rated highest.

    Parameters
    ----------
    user_id : value compared against ``ratings['userId']``.
    top_n : int, default 5
        Maximum number of movie ids to return (mirrors `get_similar_movies`).

    Returns
    -------
    list
        Movie ids ordered from highest to lowest rating. Movies with equal
        ratings keep their row order from `ratings`. Empty when the user has
        no ratings.
    """
    user_ratings = ratings.loc[ratings['userId'] == user_id]
    # A stable sort makes the order of tied ratings deterministic.
    ranked = user_ratings.sort_values(by='rating', ascending=False, kind='mergesort')
    return ranked['movieId'].head(top_n).tolist()

Prediction

In [20]:
def predict_rating(user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id`.

    The prediction is:
      * the global mean rating, when the user or the movie is missing from
        the cluster maps (no prior data on one of them);
      * otherwise the average of the user's own mean rating and the mean
        rating of the movies returned by `get_similar_movies(movie_id)`;
      * just the user's mean rating when those similar movies have no ratings.

    The cluster maps are only used as a membership check; the cluster labels
    themselves do not influence the prediction.

    Parameters
    ----------
    user_id : key looked up in `user_cluster_map` / `user_avg_rating`.
    movie_id : key looked up in `movie_cluster_map`.

    Returns
    -------
    float
        The predicted rating.
    """
    # Cold start: no prior data on the user or the movie.
    if user_id not in user_cluster_map or movie_id not in movie_cluster_map:
        return ratings['rating'].mean()

    # 2.5 is the fallback when the user has no entry in `user_avg_rating`.
    user_baseline = user_avg_rating.get(user_id, 2.5)

    similar_movies = get_similar_movies(movie_id)
    # Drop missing ratings so an all-NaN selection falls back to the user
    # baseline instead of producing a NaN prediction.
    similar_ratings = ratings.loc[ratings['movieId'].isin(similar_movies), 'rating'].dropna()
    if similar_ratings.empty:
        return user_baseline

    # Equal-weight blend of the user's tendency and the similar movies' mean.
    return (user_baseline + similar_ratings.mean()) / 2

Train/test split and evaluation

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the ratings for evaluation.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# NOTE(review): `train` is never used. predict_rating and the lookup dicts it
# reads are built from the full `ratings` frame, which still contains the
# held-out rows, so the MSE below is optimistic. Rebuild those inputs from
# `train` before trusting this number.

# Predict on test set. Iterating over the two id columns avoids building a
# Series per row, which would also upcast the integer ids to float.
predictions = [
    predict_rating(user_id, movie_id)
    for user_id, movie_id in zip(test['userId'], test['movieId'])
]
# assign() returns a new frame, so nothing is written into the slice returned
# by train_test_split (which can trigger SettingWithCopyWarning).
test = test.assign(predicted_rating=predictions)

mse = mean_squared_error(test['rating'], test['predicted_rating'])
print(f'MSE: {mse}')


MSE: 0.9419880932232476
