# MMD 2024, Collaborative Filtering on Google Colab
This notebook sets up the environment and runs collaborative filtering (CF) experiments on Google Colab.





In [1]:
# Choose which repository will be cloned into the local runtime.
# Set `private` to True to use the private repository instead of the public one.

private = False
if not private:
    # Public repository: no access token is needed
    pat = ''
    project = '24WS-mmd-code-public'
else:
    # Private repository: authentication required; the token is fetched
    # through google.colab's userdata under the key 'github_pat'
    from google.colab import userdata
    pat = userdata.get('github_pat')
    project = '24WS-mmd-code-priv'

In [2]:
!git clone https://{pat}@github.com/aip-hd-tea/{project}.git

fatal: destination path '24WS-mmd-code-public' already exists and is not an empty directory.


In [3]:
# Import the repository code
import sys
# Put the cloned repository directory first on the module search path so the
# modules imported below can be found there.
sys.path.insert(0,f"/content/{project}")

import data_util as cfd

# After edits of cf_algorithms_to_complete.py:
# 1. Rename the file rec_sys.cf_algorithms_to_complete.py to rec_sys.cf_algorithms.py
# 2. Restart the runtime (Runtime -> Restart the session); possibly not needed
# 3. Swap the comments in the next two lines, so that cf_algorithms is imported as cfa
#import rec_sys.cf_algorithms_to_complete as cfa
import cf_algorithms as cfa
# 4. Re-run all cells
# 5. If your changes are correct, you will see a long
#    printout of recommendations for MovieLens dataset (last cell)
# NOTE(review): steps 1 and 3 mention a `rec_sys.` prefix, but the active import
# above is the top-level module `cf_algorithms` -- confirm the repository layout
# and align these instructions with it.
# NOTE(review): the saved output of this cell contains "Test 1/2/3" lines, so
# importing cf_algorithms appears to run module-level test code -- confirm.

Test 1 - Centered Cosine Similarity: -1.0000000000000002
Test 2 - Centered Cosine Similarity with NaNs: -0.8525776669857377
Test 3 - Fast Centered Cosine Similarity: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [4]:
# Load or set the configuration
#from rec_sys.cf_config import config

import dataclasses


@dataclasses.dataclass
class config:
    """Settings handed to the data loader (`cfd.get_um_by_name`).

    The path fields are computed from the default values while the class body
    is evaluated; they are plain strings and do not follow later changes to
    `download_dir`.
    """
    # Presumably caps the number of rating rows that are read -- TODO confirm
    # against data_util (the saved output reports 100000 rows loaded).
    max_rows: int = 100_000
    # NOTE(review): the field name is misspelled ("dowload"); it is kept as-is
    # because other code may access it under this exact name.
    dowload_url: str = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    # Base directory on the Colab runtime
    download_dir: str = "/content/"
    # Directory of the extracted archive: "/content/ml-25m/"
    unzipped_dir: str = download_dir + "ml-25m/"
    # Ratings file inside the extracted archive: "/content/ml-25m/ratings.csv"
    file_path: str = unzipped_dir + "ratings.csv"


In [5]:
# Build the utility matrices: MovieLens first, then the lecture toy example
um_movielens = cfd.get_um_by_name(config, "movielens")
um_lecture = cfd.get_um_by_name(config, "lecture_1")

# The two arguments after the matrix are echoed by the saved output as
# `user_index` and `neighborhood_size`.
NEIGHBORHOOD_SIZE = 2
LECTURE_USER_INDEX = 4
MOVIELENS_USER_INDEX = 0

# Ratings of all items for the chosen user of the lecture toy dataset
all_ratings = cfa.rate_all_items(um_lecture, LECTURE_USER_INDEX, NEIGHBORHOOD_SIZE)
print("all_ratings lecture toy dataset:", all_ratings)

# Ratings of all items for the chosen user of the MovieLens data
all_ratings_movielens = cfa.rate_all_items(um_movielens, MOVIELENS_USER_INDEX, NEIGHBORHOOD_SIZE)
print("all_ratings_movielens:", all_ratings_movielens)

Dir '/content/ml-25m/' already exists, skipping download

### Start reading data from '/content/ml-25m/ratings.csv'
Loaded data from '/content/ml-25m/ratings.csv', df shape: (100000, 3), size in MB: 1.1444091796875 
Pivoting the data
Utility matrix, df shape: (9786, 757), size in MB: 29.142929077148438 
Final utility matrix (numpy array as np.float32), df shape: (9786, 757), size in MB: 28.25928497314453 

>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

item_idx: 0, neighbors: [5 2], rating: -2.6778371217101276
item_idx: 1, neighbors: [2 3], rating: 4.27917451131852
all_ratings lecture toy dataset: [-2.6778371217101276, 4.27917451131852, 2.0, 5.0, 4.0, 3.0]

>>> CF computation for UM w/ shape: (9786, 757), user_index: 0, neighborhood_size: 2

item_idx: 70, neighbors: [645 420], rating: 2.934051275253296
item_idx: 71, neighbors: [439 755], rating: 2.4837892055511475
item_idx: 72, neighbors: [ 37 678], rating: 3.000000238418579
item_idx: 73, neighbors: [2

  um_normalized = utility_matrix / norms


item_idx: 372, neighbors: [547   2], rating: 3.5
item_idx: 373, neighbors: [496   2], rating: 3.005983352661133
item_idx: 374, neighbors: [ 69 216], rating: 3.754026174545288
item_idx: 375, neighbors: [284  37], rating: 4.0
item_idx: 376, neighbors: [424 284], rating: 4.576547622680664
item_idx: 377, neighbors: [755  37], rating: 3.1974542140960693
item_idx: 378, neighbors: [476  37], rating: 2.7352054119110107
item_idx: 379, neighbors: [11 37], rating: 3.7891178131103516
item_idx: 380, neighbors: [755 678], rating: 3.271562337875366
item_idx: 381, neighbors: [225  37], rating: 2.999999761581421
item_idx: 382, neighbors: [ 75 439], rating: 3.724138021469116
item_idx: 383, neighbors: [225 551], rating: 3.4622249603271484
item_idx: 384, neighbors: [430 225], rating: 1.0
item_idx: 385, neighbors: [  2 476], rating: 4.065699577331543
item_idx: 386, neighbors: [ 11 612], rating: 4.245109558105469
item_idx: 387, neighbors: [755  37], rating: 3.1974542140960693
item_idx: 388, neighbors: [225 

  rating_of_item = np.sum(similarities[best_among_who_rated] * orig_utility_matrix[item_index, best_among_who_rated]) / np.sum(similarities[best_among_who_rated])


item_idx: 3867, neighbors: [395 106], rating: 3.7249326705932617
item_idx: 3868, neighbors: [349 546], rating: 3.525580406188965
item_idx: 3869, neighbors: [106], rating: 3.500000238418579
item_idx: 3870, neighbors: [106 755], rating: 3.5
item_idx: 3871, neighbors: [170 755], rating: 4.500000476837158
item_idx: 3872, neighbors: [227 425], rating: 3.319439172744751
item_idx: 3873, neighbors: [106], rating: 3.0
item_idx: 3874, neighbors: [546 119], rating: 5.0
item_idx: 3875, neighbors: [106 543], rating: 4.218432426452637
item_idx: 3876, neighbors: [445 106], rating: 2.793471336364746
item_idx: 3877, neighbors: [106 170], rating: 3.7544171810150146
item_idx: 3878, neighbors: [225 342], rating: 3.781912326812744
item_idx: 3879, neighbors: [106], rating: 3.500000238418579
item_idx: 3880, neighbors: [540 106], rating: 3.6455535888671875
item_idx: 3881, neighbors: [174 106], rating: 4.0
item_idx: 3882, neighbors: [106 464], rating: 3.2611312866210938
item_idx: 3883, neighbors: [279 106], ra

In [6]:
# Task 4: derive the two lookup structures from the MovieLens utility matrix
# NOTE(review): the saved output of this cell is only
# "IndexError: Index dimension must be 1 or 2" and shows none of the prints
# below, which suggests the failure happens inside process_movielens_data --
# confirm with the full traceback.
rated_by, user_col = cfa.process_movielens_data(um_movielens)

print("Rated by structure:", rated_by)
print("User column structure:", user_col)

# Task 5: estimate one rating per (user, item) pair; the lists are matched by position
# NOTE(review): several of these ids exceed the utility-matrix shape reported
# in the earlier saved output, (9786, 757) -- verify whether estimate_rating
# expects raw MovieLens ids or matrix indices.
user_ids = [828, 2400, 3765, 4299, 5526, 6063, 7045, 8160, 9682, 10277]
item_ids = [11, 4725, 1270, 4020, 2432, 4525, 4100, 6300, 1212, 7355]
# The neighborhood size is set to the number of listed users (10)
neighborhood_size = len(user_ids)

user_item_pairs = list(zip(user_ids, item_ids))
for idx, (user_id, item_id) in enumerate(user_item_pairs, start=1):
    estimated_rating = cfa.estimate_rating(
        user_id, item_id, um_movielens, neighborhood_size
    )
    print(f"Pair {idx}: User {user_id}, Item {item_id}, Estimated Rating: {estimated_rating}")

IndexError: Index dimension must be 1 or 2