# Packages

In [1]:
import pandas as pd
import torch

In [3]:
# Report whether this PyTorch build ships the MPS backend and whether it is usable here.
for label, probe in (
    ("MPS built:", torch.backends.mps.is_built),
    ("MPS available:", torch.backends.mps.is_available),
):
    print(label, probe())

MPS built: True
MPS available: True


# Read Data

In [4]:
# Read the train / validation / test splits from their parquet files.
train_split_path = '../artifacts/movielens_splits/df_train.parquet'
val_split_path = '../artifacts/movielens_splits/df_val.parquet'
test_split_path = '../artifacts/movielens_splits/df_test.parquet'

train_df, val_df, test_df = map(
    pd.read_parquet,
    (train_split_path, val_split_path, test_split_path),
)

In [5]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,user_id,seq_actions,seq_times,target,hist_len
0,4671,"[movie_2634, movie_110, movie_3702, movie_1961...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",movie_165,144
1,53,"[movie_902, movie_694, movie_1961, movie_1193,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",movie_2148,400
2,5377,"[movie_3480, movie_1198, movie_1097, movie_140...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",movie_1036,25
3,2565,"[movie_587, movie_2072, movie_2085, movie_19, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",movie_542,180
4,123,"[movie_2762, movie_356, movie_1320, movie_1268...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",movie_1304,65


In [6]:
from src.utils.general import flatten_list_of_lists

# Turn every row's action / time sequence into a plain Python list.
train_flows_list = [list(flow) for flow in train_df.seq_actions.tolist()]
train_times_flows_list = [list(times_flow) for times_flow in train_df.seq_times.tolist()]

# Sort the distinct tokens instead of taking them straight out of a set:
# a set of strings iterates in hash order, which changes between interpreter
# sessions unless PYTHONHASHSEED is pinned. The ids assigned from this list
# therefore used to differ on every kernel restart.
# NOTE(review): if any saved artifact (e.g. the y_*.pt targets) was encoded with
# ids from an earlier, unordered run, it must be regenerated — confirm.
distinct_new_tokens_list = sorted(set(flatten_list_of_lists(train_flows_list)))
len(train_flows_list), len(distinct_new_tokens_list)

(695951, 3599)

In [7]:
# Give each distinct token a consecutive integer id, following list order.
movies_vocab = {token: idx for idx, token in enumerate(distinct_new_tokens_list)}

len(movies_vocab)

3599

In [8]:
from src.utils.tokenizer import MultiAppBehaviorTokenizer

# Build the project tokenizer from the token -> id mapping created above.
# NOTE(review): `tokenizer` is not referenced again in the visible cells — confirm it is still needed.
tokenizer = MultiAppBehaviorTokenizer(vocab=movies_vocab)

In [9]:
train_seq_embeddings_path = '../artifacts/movielens_embeddings/x_train_embeddings.pt'
val_seq_embeddings_path = '../artifacts/movielens_embeddings/x_val_embeddings.pt'
test_seq_embeddings_path = '../artifacts/movielens_embeddings/x_test_embeddings.pt'


def _load_float32(path):
    """Load the object saved at ``path`` and return it as a float32 tensor.

    ``torch.tensor(already_a_tensor)`` emits a UserWarning and always copies
    the data. ``torch.as_tensor`` reuses the loaded storage when it is already
    a float32 tensor and still converts arrays or lists. ``detach()`` keeps the
    result out of any autograd graph, matching what ``torch.tensor`` returned.
    """
    return torch.as_tensor(torch.load(path), dtype=torch.float32).detach()


x_train = _load_float32(train_seq_embeddings_path)
x_val = _load_float32(val_seq_embeddings_path)
x_test = _load_float32(test_seq_embeddings_path)

In [10]:
# Load the saved target tensors for each split.
y_train_path = '../artifacts/movielens_embeddings/y_train.pt'
y_val_path = '../artifacts/movielens_embeddings/y_val.pt'
y_test_path = '../artifacts/movielens_embeddings/y_test.pt'

y_train, y_val, y_test = (
    torch.load(target_path)
    for target_path in (y_train_path, y_val_path, y_test_path)
)

In [11]:
# Peek at the first ten test targets.
# NOTE(review): the stored output for this cell lists far more than ten values,
# so it predates this slice — re-run the cell to refresh it.
y_test[:10]

tensor([1358, 1181, 2883, 1574, 2222,   58, 2591, 2141, 1086,  407, 3578, 2790,
         826, 2635, 2111,  547, 1130, 2111, 1499, 1976, 1598,  106, 2871,  553,
        2786, 1847, 2106, 1312, 2828, 1920, 2424, 2879,  376, 1471, 2605, 3474,
        2420,  551, 1920, 2811, 2830, 2123, 2761,  324, 2850, 1181, 1863, 1922,
        1293, 1014, 2514, 1279, 2023, 2436, 1090,  277, 2408,   52, 1810, 2998,
         273, 1863, 3289, 3278, 3580, 3498, 2605, 1428,  545,  142, 1881,  928,
        1605, 3234,  854, 1721, 2679, 2866,  620, 1764, 1922, 3002, 1043,   74,
        2677,  448, 1774, 2661,  883,  579, 3127, 2374, 2938,  883,  142, 3494,
        1863, 3109, 1661,  308, 1176, 1641, 2376, 2518,  362, 1742, 2249,  286,
         145, 1805, 2342, 1461,  207, 2818,  225, 3299, 2537, 2922,  240, 3339,
         204,  308, 2514, 2792, 1771, 2514, 3117, 2010, 3523,  114, 1976,  637,
        2084, 1669, 2937, 1219, 1863,  301,  896, 2846, 1777,  768, 2977,  981,
        1396,  955, 3574, 2957,  115, 24

In [12]:
# Show the shape of every feature and target tensor, in train / val / test order.
tuple(
    split_tensor.shape
    for split_tensor in (x_train, x_val, x_test, y_train, y_val, y_test)
)

(torch.Size([695951, 512]),
 torch.Size([301, 512]),
 torch.Size([603, 512]),
 torch.Size([695951]),
 torch.Size([301]),
 torch.Size([603]))

# Create loaders

In [13]:
# Mini-batch sizes: one for the training loader, one used by the validation and test loaders.
train_batch_size, val_batch_size = 256, 128

In [14]:
from torch.utils.data import DataLoader, TensorDataset

# Seeded generator so the training shuffle order is reproducible from run to run.
g = torch.Generator().manual_seed(42)

train_dataset = TensorDataset(x_train, y_train)
# shuffle=True: with shuffling disabled the seeded generator never influenced
# batch order, and every epoch replayed the rows in the same stored order.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, generator=g)

# Evaluation loaders keep a fixed order, so they take no generator.
val_dataset = TensorDataset(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

test_dataset = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=val_batch_size, shuffle=False)


In [15]:
from src.models.mlp_architecture import MLPNextItemPrediction
from src.utils.general import count_model_parameters, set_seed

# Seed first so the model's initial weights are reproducible.
set_seed(42)

# Output size is the vocabulary size plus one extra slot
# (NOTE(review): presumably a reserved/unknown id — confirm against the model).
n_recs = len(movies_vocab) + 1
rec_model = MLPNextItemPrediction(recs_size=n_recs)

# Display total / trainable / non-trainable parameter counts.
count_model_parameters(rec_model)


    Total model parameters: 456655
    Total trainable parameters: 456655
    Total non-trainable parameters: 0
    


(456655, 456655, 0)

# Train

In [1]:
from src.utils.training_functions import train_model

learning_rate = 3e-4
# NOTE(review): `batch_size` is not passed to train_model; the loaders built
# earlier already fix the batch size via train_batch_size / val_batch_size.
batch_size = 256
epochs = 60

# Re-seed and rebuild the model so training starts from freshly initialised weights.
set_seed(42)
rec_model = MLPNextItemPrediction(recs_size=len(movies_vocab)+1)

# Use the MPS backend when it is available; otherwise fall back to CPU so the
# cell still runs on machines without Apple-silicon GPU support.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# NOTE(review): checkpoints_dir_path is a hard-coded absolute path; with
# save_checkpoints=False it is presumably unused — confirm in train_model.
best_val = train_model(model=rec_model,
                       train_loader=train_loader,
                       val_loader=val_loader,
                       learning_rate=learning_rate,
                       epochs=epochs, checkpoints_dir_path='/content', 
                       save_checkpoints=False, apply_scheduler=True, device=device)