In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load datasets
movies_df = pd.read_excel('data/movies_df.xlsx')
movies_csv = pd.read_csv('data/movies.csv')
train_ratings = pd.read_csv('data/train_ratings.csv')
# split train data into train and validation data
valid_data = train_ratings.iloc[int(len(train_ratings)*0.8):]
train_ratings = train_ratings.iloc[:int(len(train_ratings)*0.8)]

# Align movies_df with movies_csv based on movieId
aligned_movies_df = movies_csv[['movieId']].merge(movies_df, on='movieId', how='left')

# One-hot encode genres
genres_matrix = movies_csv['genres'].str.get_dummies(sep='|')

# Normalize numerical features from aligned_movies_df (excluding 'overview' column)
scaler = MinMaxScaler()
numeric_columns = aligned_movies_df.select_dtypes(include=[np.number]).columns
numerical_features = scaler.fit_transform(aligned_movies_df[numeric_columns])

# Combine features
movie_features = np.hstack([genres_matrix.values, numerical_features])

# Map movieId to the index in movie_features
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movies_csv['movieId'])}

# Function to create a feature array for a given user-item pair
def get_features(row):
    movie_idx = movie_id_to_index.get(row['movieId'])
    if movie_idx is not None:
        movie_ftrs = movie_features[movie_idx]
        return np.hstack([row['rating'], movie_ftrs])
    return np.array([np.nan] * (1 + movie_features.shape[1]))

# Prepare training data
train_data = np.array([get_features(row) for _, row in train_ratings.iterrows()])
X_train = train_data[:, 1:]  # Exclude the rating column
y_train = train_data[:, 0]   # Only the rating column

# Prepare validation data
valid_data_features = np.array([get_features(row) for _, row in valid_data.iterrows()])
X_valid = valid_data_features[:, 1:]  # Exclude the rating column
y_valid = valid_data_features[:, 0]   # Only the rating column

from sklearn.impute import SimpleImputer

# Impute missing values in the training data
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_imputed, y_train)

# Impute missing values in the validation data
X_valid_imputed = imputer.transform(X_valid)

# Making predictions
y_pred = model.predict(X_valid_imputed)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f'RMSE: {rmse}')

RMSE: 0.973999806759077
