In [2]:
import pandas as pd
import numpy as np

# Parser options shared by all three BX-*.csv files: semicolon-delimited,
# latin-1 encoded, parsed with the Python engine. Lines the parser cannot
# split into the expected fields are skipped rather than raising.
csv_options = dict(
    sep=';',
    encoding='latin-1',
    engine='python',
    on_bad_lines='skip',
)

books = pd.read_csv('BX-Books.csv', **csv_options)
users = pd.read_csv('BX-Users.csv', **csv_options)
ratings = pd.read_csv('BX-Book-Ratings.csv', **csv_options)


In [3]:
# Remove the small/medium cover-image URL columns and give the remaining
# columns short lowercase names.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the drop succeed when the columns are already gone, so re-running this cell
# leaves `books` unchanged instead of raising KeyError.
books = (
    books
    .drop(columns=['Image-URL-S', 'Image-URL-M'], errors='ignore')
    .rename(columns={
        'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
        'Image-URL-L': 'image-url',
        'ISBN': 'isbn',
    })
)



In [4]:
# Give the user columns short lowercase names.
user_column_names = {
    'User-ID': 'user_id',
    'Location': 'location',
    'Age': 'age',
}
users = users.rename(columns=user_column_names)

In [5]:
# Give the rating columns short lowercase names; 'isbn' matches the key
# column name used for the books frame.
rating_column_names = {
    'User-ID': 'user_id',
    'ISBN': 'isbn',
    'Book-Rating': 'rating',
}
ratings = ratings.rename(columns=rating_column_names)

In [6]:
# Preview the first rows to confirm the renamed columns (user_id, isbn, rating).
ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# Keep only rows whose rating is strictly greater than 0, then preview them.
has_positive_rating = ratings['rating'] > 0
ratings = ratings.loc[has_positive_rating]
ratings.head()


Unnamed: 0,user_id,isbn,rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6


In [8]:
# Step 1: keep only users that have at least 5 rating rows.
user_counts = ratings['user_id'].value_counts()
valid_users = user_counts.loc[user_counts >= 5].index
keep_user_rows = ratings['user_id'].isin(valid_users)
ratings = ratings.loc[keep_user_rows]

# Step 2: keep only books that have at least 5 rating rows among the
# remaining users.
# NOTE(review): this is a single pass -- removing books here can push some
# users back below 5 rows, and the user filter is not re-applied afterwards.
item_counts = ratings['isbn'].value_counts()
valid_items = item_counts.loc[item_counts >= 5].index
keep_item_rows = ratings['isbn'].isin(valid_items)
ratings = ratings.loc[keep_item_rows]

# Show how many rows and columns are left after both filters.
ratings.shape


(141081, 3)

In [10]:
from sklearn.preprocessing import LabelEncoder

# One encoder per id space; each maps the distinct raw ids (user_id / isbn)
# to integer codes, and is kept so codes can be mapped back later with
# inverse_transform.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# `ratings` is the result of boolean-mask filtering, so adding columns with
# ratings['col'] = ... can emit SettingWithCopyWarning. .assign() returns a
# new frame with the two code columns appended (user first, then item).
ratings = ratings.assign(
    user=user_encoder.fit_transform(ratings['user_id']),
    item=item_encoder.fit_transform(ratings['isbn']),
)

# Number of distinct encoded users and items.
num_users = ratings['user'].nunique()
num_items = ratings['item'].nunique()

num_users, num_items


(13030, 11234)

In [12]:
# Order rows by user code, then by rating (both ascending), so that within
# each user's rows the highest rating comes last.
ratings = ratings.sort_values(by=['user', 'rating'], ascending=True)


In [13]:
# Hold out one row per user for testing: the final row of each user's group.
# NOTE(review): `ratings` was sorted by ['user', 'rating'] and no timestamp
# column is used, so this row is the user's highest-rated one, not a
# chronologically "last" interaction.
# NOTE(review): a user with a single row ends up in the test set only.
per_user = ratings.groupby('user')
test_df = per_user.tail(1)

# Everything that was not held out is used for training.
held_out_index = test_df.index
train_df = ratings.drop(index=held_out_index)

train_df.shape, test_df.shape


((128051, 5), (13030, 5))

In [None]:
import torch

# Convert the training user and item code columns to 1-D int64 tensors.
# torch.tensor with an explicit dtype replaces the legacy torch.LongTensor
# constructor: it always copies the data and yields int64 whatever integer
# width the column has; .to_numpy() is the recommended form of .values.
user_tensor = torch.tensor(train_df['user'].to_numpy(), dtype=torch.long)
item_tensor = torch.tensor(train_df['item'].to_numpy(), dtype=torch.long)


In [None]:
from collections import defaultdict

# Map each user code to the set of item codes that user has in train_df.
# A defaultdict is used so looking up a user without training rows yields
# an empty set instead of raising KeyError.
train_interactions = defaultdict(set)

user_item_pairs = train_df[['user', 'item']].itertuples(index=False, name=None)
for user_code, item_code in user_item_pairs:
    train_interactions[user_code].add(item_code)
