In [1]:
# Download and extract dataset
!curl https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

import zipfile
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   885k      0  0:00:01  0:00:01 --:--:--  895k


In [2]:
import pandas as pd
import sys
import os

# Add project root directory to Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [9]:
# Normal imports for functions
from src.data.preprocessing import preprocess_data
from src.models.neural_cf import MovieRecommender
from src.utils.evaluation import calculate_metrics

# Import the modules for reloading
import src.data.preprocessing as preprocess_data_module
import src.models.neural_cf as MovieRecommender_module
import src.utils.evaluation as calculate_metrics_module

# Reload the modules after making changes to them
import importlib
importlib.reload(preprocess_data_module)
importlib.reload(MovieRecommender_module)
importlib.reload(calculate_metrics_module)



<module 'src.utils.evaluation' from '/Users/abowkis/Documents/movie_recommender/movie-recs/src/utils/evaluation.py'>

In [4]:
# Load and preprocess data
processed_data = preprocess_data('data/ml-latest-small/ratings.csv', 
                                         'data/ml-latest-small/movies.csv', 1)
# Get the number of rows and columns
print(f"Rows: {processed_data['train_data'].shape[0]}, Columns: {processed_data['train_data'].shape[1]}")

Rows: 116896, Columns: 7


In [5]:
# Extract user IDs, movie IDs and ratings for training and testing data split
print(processed_data['train_data'].head(4))
user_ids_train = processed_data['train_data']['userId'].values
movie_ids_train = processed_data['train_data']['movieId'].values
ratings_train = processed_data['train_data']['scaled_rating'].values
user_ids_test = processed_data['test_data']['userId'].values
movie_ids_test = processed_data['test_data']['movieId'].values
ratings_test = processed_data['test_data']['scaled_rating'].values

# Make training input and target
X_train = (user_ids_train, movie_ids_train)  # Tuple of user and movie IDs to pass to model
y_train = ratings_train  # Target ratings (scaled)

       userId  movieId  rating   timestamp  scaled_rating  \
57429     578     1208     5.0   958882188       1.000000   
19549      61     6058     4.0  1521490334       0.777778   
38805     602     2642     5.0   963176106       1.000000   
27956      83      891     4.0   860397394       0.777778   

                                                   title    genres  
57429                              Apocalypse Now (1979)       War  
19549                         Final Destination 2 (2003)    Horror  
38805                                Superman III (1983)    Action  
27956  Halloween: The Curse of Michael Myers (Hallowe...  Thriller  


In [6]:
# Initialize model
recommender = MovieRecommender(processed_data['num_users'], processed_data['num_movies'])
# Compile model
recommender.compile_model(learning_rate=0.001)
# Train model
history = recommender.train(X_train=X_train, y_train=y_train, validation_split=0.2, batch_size=64, epochs=100, patience=3)

Epoch 1/100
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - loss: 0.0443 - mae: 0.1640 - mse: 0.0443 - val_loss: 0.0341 - val_mae: 0.1418 - val_mse: 0.0341
Epoch 2/100
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - loss: 0.0319 - mae: 0.1363 - mse: 0.0319 - val_loss: 0.0321 - val_mae: 0.1366 - val_mse: 0.0321
Epoch 3/100
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0291 - mae: 0.1293 - mse: 0.0291 - val_loss: 0.0308 - val_mae: 0.1329 - val_mse: 0.0308
Epoch 4/100
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 0.0258 - mae: 0.1216 - mse: 0.0258 - val_loss: 0.0292 - val_mae: 0.1289 - val_mse: 0.0292
Epoch 5/100
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - loss: 0.0229 - mae: 0.1140 - mse: 0.0229 - val_loss: 0.0278 - val_mae: 0.1248 - val_mse: 0.0278
Epoch 6/100
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [7]:
# Evaluate model
brief_metrics = recommender.evaluate(X_train, y_train)
print("Test metrics:", brief_metrics)
# Make predictions on test set
predicted_ratings = recommender.predict(user_ids_test, movie_ids_test)
print("Predicted ratings:", predicted_ratings[:5])

Test metrics: {'loss': 0.005815394222736359, 'compile_metrics': 0.04898960143327713}
Predicted ratings: [[0.54572994]
 [0.9623438 ]
 [0.9214805 ]
 [0.81133443]
 [0.85805297]]


In [10]:
# Create dataframe with actual and predicted ratings for test set to pass to calculate_metrics function
df_predictions = pd.DataFrame({
    'userId': user_ids_test,        
    'movieId': movie_ids_test,      
    'actual_rating': ratings_test,  
    'predicted_rating': predicted_ratings.flatten()
})
full_metrics = calculate_metrics(df_predictions)
print("Full metrics:", full_metrics)

Full metrics: {'rmse': 0.12806119118224119, 'mae': 0.08189337235500312, 'precision@k': 0.7886885245901639, 'recall@k': 0.6162817537335565}
