In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [5]:
# Read the four CSV files from the current working directory into DataFrames.
# The file names are listed in the same order as the names they are bound to.
ratings, movies, tags, links = map(
    pd.read_csv,
    ("ratings.csv", "movies.csv", "tags.csv", "links.csv"),
)

In [6]:
# Preview the first five rows of every raw table, one table after another,
# to see which columns each one provides.
print(
    *(frame.head() for frame in (ratings, movies, tags, links)),
    sep="\n",
)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferre

In [7]:
# Attach title and genres to every rating by joining on the shared movieId
# column (default inner join: ratings without a matching movie are dropped).
merged_data = ratings.merge(movies, on='movieId')

# Preview the joined table
print(merged_data.head())

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [8]:
# Randomly partition the merged ratings: 90% go to edx and the remaining 10%
# to final_holdout_test. random_state is fixed so the split is reproducible.
edx, final_holdout_test = train_test_split(
    merged_data,
    random_state=1,
    test_size=0.1,
)

# Report the (rows, columns) shape of both partitions
print(
    f"edx size: {edx.shape}",
    f"final_holdout_test size: {final_holdout_test.shape}",
    sep="\n",
)

edx size: (90752, 6)
final_holdout_test size: (10084, 6)


In [9]:
# Keep only the holdout rows whose movieId AND userId both occur somewhere in
# edx, so every holdout rating refers to a movie and a user seen in edx.
# NOTE(review): rows removed here are discarded, not appended back to edx —
# confirm that losing them is intended.
final_holdout_test = final_holdout_test.loc[
    lambda held: held['movieId'].isin(edx['movieId'])
    & held['userId'].isin(edx['userId'])
]

In [10]:
# Baseline model: predict each holdout rating as the mean rating that the same
# movie received in edx, then score those predictions with RMSE.

# Mean edx rating per movie (Series indexed by movieId, named 'rating')
movie_avg_rating = edx.groupby('movieId')['rating'].mean()

# Attach the per-movie mean to the holdout set as a 'rating_avg' column.
# The Series is renamed up front so the merge needs no suffixes, and any
# 'rating_avg' column left over from an earlier run of this cell is dropped
# first. Without that, re-running the cell would make the merge try to create
# a second 'rating_avg' column.
final_holdout_test = pd.merge(
    final_holdout_test.drop(columns='rating_avg', errors='ignore'),
    movie_avg_rating.rename('rating_avg'),
    on='movieId',
)

# Root Mean Square Error between the actual ratings and the per-movie means
# (mean_squared_error is imported in the notebook's import cell)
rmse = np.sqrt(
    mean_squared_error(
        final_holdout_test['rating'],
        final_holdout_test['rating_avg'],
    )
)
print(f'Baseline RMSE: {rmse}')

Baseline RMSE: 0.9665500546659129
