In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the u.data dataset
u_data = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Perform label encoding on user_id and item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

u_data['user_id'] = user_encoder.fit_transform(u_data['user_id'])
u_data['item_id'] = item_encoder.fit_transform(u_data['item_id'])

# Create the rating matrix
n_users = u_data['user_id'].nunique()
n_items = u_data['item_id'].nunique()



MovieLens 100K Dataset
https://grouplens.org/datasets/movielens/

In [3]:
# Define the threshold
THRESHOLD = 3  # Ratings above this are "Lovers", below are "Haters"

# Create a user-item matrix with users as rows and items as columns, and fill missing values with 0
rating_matrix = u_data.pivot(index='user_id', columns='item_id', values='rating')

# Apply threshold classification to the rating matrix (after thresholding)
rating_matrix_thresholded = np.where(rating_matrix > THRESHOLD, 1, np.where(rating_matrix > 0, -1, 0))

# Convert numpy array to pandas DataFrame for easier inspection
rating_matrix_thresholded_df = pd.DataFrame(rating_matrix_thresholded)

# Display the first few rows of the original and thresholded training rating matrices
print("Original Rating Matrix:")
print(rating_matrix.head())

print("\nThresholded Rating Matrix:")
print(rating_matrix_thresholded_df.head())


Original Rating Matrix:
item_id  0     1     2     3     4     5     6     7     8     9     ...  \
user_id                                                              ...   
0         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
1         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item_id  1672  1673  1674  1675  1676  1677  1678  1679  1680  1681  
user_id                                                              
0         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   N

In [4]:
# Create rI and rU indexes for training set (R)
# rI: item to user ratings (dictionary of items with lists of user ratings)
rI_train = {}
for _, row in u_data.iterrows():
    item_id = row['item_id']
    user_id = row['user_id']
    rating = row['rating']
    if item_id not in rI_train:
        rI_train[item_id] = []
    rI_train[item_id].append((user_id, rating))

# rU: user to item ratings (dictionary of users with lists of item ratings)
rU_train = {}
for _, row in u_data.iterrows():
    user_id = row['user_id']
    item_id = row['item_id']
    rating = row['rating']
    if user_id not in rU_train:
        rU_train[user_id] = []
    rU_train[user_id].append((item_id, rating))

In [5]:
# Optionally, print rI and rU to check the data structure for training and testing
print("\nSample rI (item to user ratings) for Training Set:")
for item in list(rI_train.keys())[:2]:  # Display sample item indices
    print(f"Item {item}: {rI_train[item]}")

print("\nSample rU (user to item ratings) for Training Set:")
for user in list(rU_train.keys())[:2]:  # Display sample user indices
    print(f"User {user}: {rU_train[user]}")


Sample rI (item to user ratings) for Training Set:
Item 241: [(np.int64(195), np.int64(3)), (np.int64(62), np.int64(3)), (np.int64(225), np.int64(5)), (np.int64(153), np.int64(3)), (np.int64(305), np.int64(5)), (np.int64(295), np.int64(4)), (np.int64(33), np.int64(5)), (np.int64(270), np.int64(4)), (np.int64(200), np.int64(4)), (np.int64(208), np.int64(4)), (np.int64(34), np.int64(2)), (np.int64(353), np.int64(5)), (np.int64(198), np.int64(5)), (np.int64(112), np.int64(2)), (np.int64(0), np.int64(5)), (np.int64(172), np.int64(5)), (np.int64(359), np.int64(4)), (np.int64(233), np.int64(4)), (np.int64(13), np.int64(4)), (np.int64(308), np.int64(4)), (np.int64(330), np.int64(4)), (np.int64(20), np.int64(3)), (np.int64(110), np.int64(4)), (np.int64(438), np.int64(5)), (np.int64(354), np.int64(4)), (np.int64(203), np.int64(5)), (np.int64(144), np.int64(5)), (np.int64(29), np.int64(5)), (np.int64(462), np.int64(2)), (np.int64(143), np.int64(4)), (np.int64(416), np.int64(3)), (np.int64(1), n