In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load datasets
movies_df = pd.read_excel('data/movies_df.xlsx')
movies_csv = pd.read_csv('data/movies.csv')
train_ratings = pd.read_csv('data/train_ratings.csv')
valid_data = pd.read_csv('data/valid_data.csv')

# Align movies_df with movies_csv based on movieId
aligned_movies_df = movies_csv[['movieId']].merge(movies_df, on='movieId', how='left')

# One-hot encode genres
genres_matrix = movies_csv['genres'].str.get_dummies(sep='|')

# Normalize numerical features from aligned_movies_df (excluding 'overview' column)
scaler = MinMaxScaler()
numeric_columns = aligned_movies_df.select_dtypes(include=[np.number]).columns
numerical_features = scaler.fit_transform(aligned_movies_df[numeric_columns])

# Combine features
movie_features = np.hstack([genres_matrix.values, numerical_features])

# Map movieId to the index in movie_features
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movies_csv['movieId'])}

# Function to create a feature array for a given user-item pair
def get_features(row):
    movie_idx = movie_id_to_index.get(row['movieId'])
    if movie_idx is not None:
        movie_ftrs = movie_features[movie_idx]
        return np.hstack([row['rating'], movie_ftrs])
    return np.array([np.nan] * (1 + movie_features.shape[1]))

# Prepare training data
train_data = np.array([get_features(row) for _, row in train_ratings.iterrows()])
X_train = train_data[:, 1:]  # Exclude the rating column
y_train = train_data[:, 0]   # Only the rating column

# Prepare validation data
valid_data_features = np.array([get_features(row) for _, row in valid_data.iterrows()])
X_valid = valid_data_features[:, 1:]  # Exclude the rating column
y_valid = valid_data_features[:, 0]   # Only the rating column

from sklearn.impute import SimpleImputer

# Impute missing values in the training data
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_imputed, y_train)

# Impute missing values in the validation data
X_valid_imputed = imputer.transform(X_valid)

# Making predictions
y_pred = model.predict(X_valid_imputed)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f'RMSE: {rmse}')

RMSE: 0.9674339401543441


In [2]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import numpy as np

def create_user_item_matrix(df):
    """ Create a user-item matrix for collaborative filtering """
    user_ids = df['userId'].unique()
    movie_ids = df['movieId'].unique()
    
    user_id_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
    movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
    
    rows = df['userId'].map(user_id_to_idx)
    cols = df['movieId'].map(movie_id_to_idx)
    values = df['rating']
    
    return csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(movie_ids))), user_id_to_idx, movie_id_to_idx

# Create user-item matrices for train and validation data
train_matrix, train_user_id_to_idx, train_movie_id_to_idx = create_user_item_matrix(train_ratings)
valid_matrix, valid_user_id_to_idx, valid_movie_id_to_idx = create_user_item_matrix(valid_data)

In [3]:
from scipy.sparse import coo_matrix
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_biases(ratings_df):
    global_mean = ratings_df['rating'].mean()

    user_bias = ratings_df.groupby('userId')['rating'].mean() - global_mean
    item_bias = ratings_df.groupby('movieId')['rating'].mean() - global_mean

    return global_mean, user_bias, item_bias

def predict_with_biases(U, sigma, Vt, user_id_to_idx, movie_id_to_idx, global_mean, user_bias, item_bias, userId, movieId):
    user_idx = user_id_to_idx.get(userId)
    movie_idx = movie_id_to_idx.get(movieId)

    if user_idx is not None and movie_idx is not None:
        pred_rating = np.dot(np.dot(U[user_idx, :], sigma), Vt[:, movie_idx])
        pred_rating += global_mean
        pred_rating += user_bias.get(userId, 0)
        pred_rating += item_bias.get(movieId, 0)
        return pred_rating
    else:
        return global_mean

# Calculate biases
global_mean, user_bias, item_bias = calculate_biases(train_ratings)

# Perform SVD
U, sigma, Vt = svds(train_matrix, k=500)
sigma = np.diag(sigma)

# Make predictions and calculate RMSE
predicted_ratings = []
for _, row in valid_data.iterrows():
    predicted_rating = predict_with_biases(U, sigma, Vt, train_user_id_to_idx, train_movie_id_to_idx, 
                                           global_mean, user_bias, item_bias, row.userId, row.movieId)
    predicted_ratings.append(predicted_rating)

rmse = np.sqrt(mean_squared_error(valid_data['rating'], predicted_ratings))
rmse


0.9296953834375242