In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the u.data dataset
u_data = pd.read_csv('C:/Users/13447/Desktop/Stats thesis/ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Perform label encoding on user_id and item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

u_data['user_id'] = user_encoder.fit_transform(u_data['user_id'])
u_data['item_id'] = item_encoder.fit_transform(u_data['item_id'])

# Create the rating matrix
n_users = u_data['user_id'].nunique()
n_items = u_data['item_id'].nunique()



In [8]:
# Define the threshold
THRESHOLD = 3  # Ratings above this are "Lovers", below are "Haters"

# Create a user-item matrix with users as rows and items as columns, and fill missing values with 0
rating_matrix = u_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Keep a copy of the original rating matrix (before thresholding)
original_rating_matrix = rating_matrix.copy()

# Apply threshold classification to the rating matrix (after thresholding)
rating_matrix_thresholded = rating_matrix.apply(lambda x: x.map(lambda y: 1 if y > THRESHOLD else (-1 if y > 0 else 0)))

# Split the data into training (R) and testing (T) sets (80% train, 20% test)
train_data, test_data = train_test_split(u_data, test_size=0.2, random_state=42)

# Create the rating matrix for training data (R) and fill missing values with 0
rating_matrix_train = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Keep a copy of the original training rating matrix
original_rating_matrix_train = rating_matrix_train.copy()

# Apply threshold classification to the training rating matrix (after thresholding)
rating_matrix_train_thresholded = rating_matrix_train.apply(lambda x: x.map(lambda y: 1 if y > THRESHOLD else (-1 if y > 0 else 0)))

# Create the rating matrix for testing data (T) and fill missing values with 0
rating_matrix_test = test_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Keep a copy of the original testing rating matrix
original_rating_matrix_test = rating_matrix_test.copy()

# Apply threshold classification to the testing rating matrix (after thresholding)
rating_matrix_test_thresholded = rating_matrix_test.apply(lambda x: x.map(lambda y: 1 if y > THRESHOLD else (-1 if y > 0 else 0)))

# Display the first few rows of the original and thresholded training rating matrices
print("Original Training Rating Matrix:")
print(original_rating_matrix_train.head())

print("\nThresholded Training Rating Matrix:")
print(rating_matrix_train_thresholded.head())


Original Training Rating Matrix:
item_id  0     1     2     3     4     5     6     7     8     9     ...  \
user_id                                                              ...   
0         0.0   3.0   4.0   0.0   3.0   0.0   4.0   0.0   5.0   3.0  ...   
1         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

item_id  1667  1669  1670  1671  1672  1675  1677  1678  1679  1680  
user_id                                                              
0         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4         0.0 

In [4]:
# Create rI and rU indexes for training set (R)
# rI: item to user ratings (dictionary of items with lists of user ratings)
rI_train = {}
for _, row in train_data.iterrows():
    item_id = row['item_id']
    user_id = row['user_id']
    rating = row['rating']
    if item_id not in rI_train:
        rI_train[item_id] = []
    rI_train[item_id].append((user_id, rating))

# rU: user to item ratings (dictionary of users with lists of item ratings)
rU_train = {}
for _, row in train_data.iterrows():
    user_id = row['user_id']
    item_id = row['item_id']
    rating = row['rating']
    if user_id not in rU_train:
        rU_train[user_id] = []
    rU_train[user_id].append((item_id, rating))

In [5]:
# Optionally, print rI and rU to check the data structure for training and testing
print("\nSample rI (item to user ratings) for Training Set:")
for item in list(rI_train.keys())[:2]:  # Display sample item indices
    print(f"Item {item}: {rI_train[item]}")

print("\nSample rU (user to item ratings) for Training Set:")
for user in list(rU_train.keys())[:2]:  # Display sample user indices
    print(f"User {user}: {rU_train[user]}")


Sample rI (item to user ratings) for Training Set:
Item 1410: [(806, 1), (324, 4), (659, 2), (193, 1), (933, 4), (384, 3), (245, 2), (931, 4), (845, 4), (842, 3), (268, 3), (307, 4), (621, 4), (726, 2), (304, 3), (797, 1), (434, 1), (199, 3), (587, 1), (278, 3), (270, 2), (880, 2), (803, 3), (306, 4), (302, 2)]
Item 658: [(473, 5), (773, 3), (920, 5), (12, 3), (658, 3), (804, 3), (814, 5), (415, 5), (124, 4), (84, 4), (58, 3), (17, 4), (869, 4), (715, 4), (369, 4), (748, 5), (642, 5), (526, 4), (282, 5), (312, 4), (693, 4), (235, 3), (263, 5), (296, 4), (502, 5), (217, 4), (835, 5), (406, 5), (888, 4), (386, 4), (513, 3), (765, 3), (697, 3), (325, 4), (882, 3), (408, 5), (567, 3), (495, 3), (252, 5), (392, 4), (238, 3), (320, 4), (891, 4), (451, 4), (378, 5), (434, 4), (59, 4), (425, 4), (766, 5), (795, 3), (384, 4), (797, 4), (400, 3), (449, 5), (353, 4), (638, 3), (193, 4), (566, 4), (397, 3), (917, 4), (453, 2), (941, 5), (911, 5), (663, 5), (652, 1), (540, 5), (931, 5), (737, 4), 