In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import sparse
from sklearn.model_selection import train_test_split

In [5]:
# Input/output locations, relative to the notebook's working directory.
RAW_DIR = Path("data/raw/ml-1m")  # directory the ratings.dat / movies.dat files are read from
PROCESSED_DIR = Path("data/processed")  # destination for every artifact written below

# Create the output directory up front so the later save_npz / to_csv calls
# do not fail on a fresh checkout where it does not exist yet.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [7]:
# Load the "::"-delimited ratings file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
ratings_columns = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv(
    RAW_DIR / 'ratings.dat',
    sep='::',
    engine='python',  # multi-character separators require the Python parser
    names=ratings_columns,
)

In [8]:
# Load the "::"-delimited movies file, decoding it as latin-1. Column names are
# supplied explicitly, so the first line of the file is read as data.
movies_columns = ['movieId', 'title', 'genres']
movies = pd.read_csv(
    RAW_DIR / 'movies.dat',
    sep='::',
    engine='python',  # multi-character separators require the Python parser
    names=movies_columns,
    encoding='latin-1',
)

In [9]:
# Report (rows, columns) for both freshly loaded tables.
for label, frame in (("Ratings shape: ", ratings), ("Movies shape ", movies)):
    print(label, frame.shape)

Ratings shape:  (1000209, 4)
Movies shape  (3883, 3)


In [10]:
# Give every raw id a dense 0-based index, numbered in the order the id first
# appears in `ratings`.
unique_user_ids = ratings["userId"].unique()
unique_movie_ids = ratings["movieId"].unique()
userId_map = dict(zip(unique_user_ids, range(len(unique_user_ids))))
movieId_map = dict(zip(unique_movie_ids, range(len(unique_movie_ids))))

In [11]:
# Attach the dense indices to every rating row (adds two columns to `ratings`).
for raw_col, idx_col, lookup in (
    ("userId", "user_idx", userId_map),
    ("movieId", "movie_idx", movieId_map),
):
    ratings[idx_col] = ratings[raw_col].map(lookup)

In [13]:
# Matrix dimensions: one row per distinct user, one column per distinct rated movie.
num_users, num_movies = len(userId_map), len(movieId_map)

In [14]:
# Show the number of distinct users and movies found in `ratings`.
counts_summary = f"Users: {num_users}, Movies: {num_movies}"
print(counts_summary)

Users: 6040, Movies: 3706


In [15]:
# Empty users x movies matrix in LIL format; it is filled in the next cell.
matrix_shape = (num_users, num_movies)
rating_matrix = sparse.lil_matrix(matrix_shape)

In [16]:
# Write every rating into its (user_idx, movie_idx) cell with one vectorised
# fancy-index assignment instead of a Python-level loop over each rating row.
# The matrix object, its LIL format and its dtype are left as they were.
rating_matrix[
    ratings["user_idx"].to_numpy(),
    ratings["movie_idx"].to_numpy(),
] = ratings["rating"].to_numpy()

In [17]:
# Implicit-feedback view: 1.0 where the stored rating is at least 4, 0 elsewhere.
liked_mask = rating_matrix >= 4
binary_matrix = liked_mask.astype(np.float32)

In [19]:
# Persist both user-item matrices as CSR .npz files under PROCESSED_DIR.
for npz_name, matrix in (
    ("userItem_ratings.npz", rating_matrix),
    ("userItem_binary.npz", binary_matrix),
):
    sparse.save_npz(PROCESSED_DIR / npz_name, matrix.tocsr())

In [20]:
# Accumulators for the per-user split; two independent, initially empty lists.
train_data, test_data = [], []

In [21]:
# Per-user random hold-out: each user's ratings are split 80/20 into train/test;
# users with fewer than 5 ratings are kept entirely in the training set.
# A single groupby replaces a full boolean scan of `ratings` for every user.
# Groups come out sorted by user_idx with their rows in original order, so the
# frames handed to train_test_split are the same as with per-user filtering.
# NOTE(review): the matrices saved earlier are built from all of `ratings`,
# including rows held out here - confirm downstream training accounts for that.
for u, user_ratings in ratings.groupby("user_idx", sort=True):
    if len(user_ratings) < 5:
        train_data.append(user_ratings)
        continue
    # Fixed seed so repeated runs produce the same split.
    train, test = train_test_split(user_ratings, test_size=0.2, random_state=42)
    train_data.append(train)
    test_data.append(test)

In [22]:
# Stack the per-user pieces into one train frame and one test frame.
train_ratings, test_ratings = (
    pd.concat(pieces) for pieces in (train_data, test_data)
)

In [23]:
# Write both splits to CSV under PROCESSED_DIR, without the DataFrame index.
for csv_name, split_frame in (
    ("train_ratings.csv", train_ratings),
    ("test_ratings.csv", test_ratings),
):
    split_frame.to_csv(PROCESSED_DIR / csv_name, index=False)

In [24]:
# Report (rows, columns) of the two splits.
train_shape = train_ratings.shape
test_shape = test_ratings.shape
print("Train shape:", train_shape, "Test shape:", test_shape)

Train shape: (797758, 6) Test shape: (202451, 6)


In [26]:
# Lookup table from raw movieId/title to the dense movie_idx built from `ratings`.
movie_map = movies[["movieId", "title"]].copy()
# Movies absent from `movieId_map` (i.e. never present in `ratings`) map to a
# missing value. The nullable Int64 dtype keeps the remaining indices as
# integers instead of letting NaN coerce the whole column to float, which
# would write them to the CSV as e.g. "12.0".
movie_map["movie_idx"] = movie_map["movieId"].map(movieId_map).astype("Int64")
movie_map.to_csv(PROCESSED_DIR / "movie_mapping.csv", index=False)