In [1]:
import os
import json
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
# import kagglehub
import zipfile
import matplotlib.pyplot as plt
from IPython.display import FileLink, display
%matplotlib inline

# --- Reproducibility ---
# Seed the RNGs this notebook relies on (Python's `random` and PyTorch) so that
# shuffling, weight initialisation and batch order repeat across
# "Restart Kernel -> Run All" executions.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# --- Device ---
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# --- Data Loading ---
# Limits applied when reading the dataset
num_slices = 1000      # number of slice files to load
num_playlists = 1000   # number of playlists kept from each slice



  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [9]:

# Specifica un percorso personalizzato (modifica secondo le tue esigenze)
data_dir = "data/"
# Scarica il file ZIP
!gdown "1jj_ApW79I2n2n4skXmXrNWVWroOlaEuM" -O data_zipped.zip

Downloading...
From (original): https://drive.google.com/uc?id=1jj_ApW79I2n2n4skXmXrNWVWroOlaEuM
From (redirected): https://drive.google.com/uc?id=1jj_ApW79I2n2n4skXmXrNWVWroOlaEuM&confirm=t&uuid=0e705b24-7b8d-4f58-969c-012c5fcc47c1
To: /Users/andreagentilini/Downloads/data_zipped.zip
100%|██████████████████████████████████████| 5.75G/5.75G [01:25<00:00, 67.2MB/s]


KeyboardInterrupt: 

In [None]:
# Unpack every member of the downloaded archive under `data_dir`.
with zipfile.ZipFile("data_zipped.zip") as archive:
    archive.extractall(path=data_dir)

In [None]:
# Point `data_dir` at the nested "data" folder (presumably created by the
# extraction step -- verify against the archive layout).
# Descend only when that folder actually exists: an unconditional join turns
# "data/" into "data/data" on the first run and "data/data/data" on the next,
# so re-running the cell would break every later path lookup.
_nested_dir = os.path.join(data_dir, 'data')
if os.path.isdir(_nested_dir):
    data_dir = _nested_dir

In [None]:
# List the slice files found in the data directory, in sorted order, capped at
# num_slices entries.
# Keep only regular *.json files: a bare os.listdir() also returns hidden
# entries (e.g. macOS ".DS_Store") and sub-directories, which would make the
# json.load() call in the loading cell fail.
slices = sorted(
    name for name in os.listdir(data_dir)
    if name.lower().endswith(".json")
    and os.path.isfile(os.path.join(data_dir, name))
)[:num_slices]

print(f"Caricati {len(slices)} file dalla cartella '{data_dir}':")

In [None]:
# --- Loading: read the playlists of every selected slice ---
playlists = []
for slice_file in slices:
    # JSON text is UTF-8; pass the encoding explicitly so that non-ASCII track
    # names are decoded the same way regardless of the platform's locale.
    with open(os.path.join(data_dir, slice_file), "r", encoding="utf-8") as f:
        data = json.load(f)
    # Keep only the first num_playlists playlists of each slice
    playlists.extend(data['playlists'][:num_playlists])

# --- Preprocessing: extract the "track_name" feature ---
# One list of track names per playlist, in playlist order.
playlists_tracks = [
    [track['track_name'] for track in playlist['tracks']]
    for playlist in playlists
]

# --- Vocabulary construction ---
unique_tracks = {track for playlist in playlists_tracks for track in playlist}
# '.' is reserved as the stop/start token and gets index 0; real tracks are
# numbered from 1 in sorted order.
# NOTE(review): a track literally named '.' is removed here and will therefore
# map to index 0 as well, i.e. it is indistinguishable from the stop token.
unique_tracks.discard('.')
stoi = {track: i for i, track in enumerate(sorted(unique_tracks), start=1)}
stoi['.'] = 0
# Inverse mapping: index -> track name
itos = {i: track for track, i in stoi.items()}
vocab_size = len(itos)
print("Vocab size:", vocab_size)

In [None]:

# --- Language-model dataset: (context window -> next track) pairs ---
block_size = 5  # number of previous tokens the model sees
X_data, Y_data = [], []
for tracks in playlists_tracks:
    # Every playlist starts from a window filled with the stop token (index 0).
    window = [0] * block_size
    # A trailing '.' makes the end of the playlist a prediction target too.
    for name in [*tracks, '.']:
        target = stoi[name]
        X_data.append(list(window))  # snapshot of the window before it slides
        Y_data.append(target)
        # Slide the window: drop the oldest index, append the current one.
        window = [*window[1:], target]

X = torch.tensor(X_data, dtype=torch.long)
Y = torch.tensor(Y_data, dtype=torch.long)
print("Dataset shape:", X.shape, Y.shape)




In [None]:
# --- Train/val/test split: 80% / 10% / remainder ---
# The permutation comes from a dedicated seeded generator so the split is the
# same on every run, and it is applied to new tensors instead of overwriting
# X / Y, so re-running this cell does not reshuffle already-shuffled data.
# NOTE(review): rows are shuffled individually, so windows coming from the same
# playlist can end up in different splits.
split_generator = torch.Generator().manual_seed(42)
indices = torch.randperm(X.shape[0], generator=split_generator)
X_shuffled = X[indices]
Y_shuffled = Y[indices]
n_total = X_shuffled.shape[0]
n_train = int(0.8 * n_total)
n_val   = int(0.1 * n_total)

Xtr, Ytr = X_shuffled[:n_train], Y_shuffled[:n_train]
Xval, Yval = X_shuffled[n_train:n_train+n_val], Y_shuffled[n_train:n_train+n_val]
Xte, Yte = X_shuffled[n_train+n_val:], Y_shuffled[n_train+n_val:]
print("Training samples:", Xtr.shape[0])
print("Validation samples:", Xval.shape[0])
print("Test samples:", Xte.shape[0])

In [None]:
# --- Definizione del modello ---
class PlaylistModel(nn.Module):
    """MLP next-token model over a fixed-length context of token indices.

    Each index in the context is embedded, the embeddings are concatenated and
    passed through ``n_layers`` blocks of ``Linear(bias=False) -> BatchNorm1d
    -> Tanh``, followed by a ``Linear(bias=False) -> BatchNorm1d`` head that
    produces one logit per vocabulary entry.

    Args:
        vocab_size: number of distinct tokens (embedding rows and output logits).
        n_embd: embedding dimension of each token.
        block_size: number of context tokens per sample.
        n_hidden: width of every hidden layer.
        n_layers: number of hidden blocks. The default of 5 builds the same
            layer sequence (and the same ``state_dict`` keys) as the original
            hard-coded architecture.

    Raises:
        ValueError: if ``n_layers`` is negative.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_hidden, n_layers=5):
        super().__init__()
        if n_layers < 0:
            raise ValueError(f"n_layers must be >= 0, got {n_layers}")
        self.embedding = nn.Embedding(vocab_size, n_embd)
        self.block_size = block_size

        # Linear layers carry no bias: each one is followed by a BatchNorm1d,
        # which has its own learnable shift.
        layers = []
        in_features = n_embd * block_size
        for _ in range(n_layers):
            layers += [
                nn.Linear(in_features, n_hidden, bias=False),
                nn.BatchNorm1d(n_hidden),
                nn.Tanh(),
            ]
            in_features = n_hidden
        layers += [
            nn.Linear(in_features, vocab_size, bias=False),
            nn.BatchNorm1d(vocab_size),
        ]
        self.mlp = nn.Sequential(*layers)

        # Shrink the scale of the final BatchNorm layer so the initial logits
        # are small. Done under no_grad instead of through `.data`.
        with torch.no_grad():
            self.mlp[-1].weight.mul_(0.1)

    def forward(self, x):
        """Map a batch of contexts to logits.

        Args:
            x: integer tensor of token indices, (batch_size, block_size).

        Returns:
            Tensor of logits, (batch_size, vocab_size).
        """
        emb = self.embedding(x)          # -> (batch_size, block_size, n_embd)
        emb = emb.flatten(start_dim=1)   # -> (batch_size, block_size * n_embd)
        return self.mlp(emb)             # -> (batch_size, vocab_size)



In [None]:
# Hyperparameters: embedding size and hidden-layer width.
n_embd, n_hidden = 100, 100
# Build the network and move it to the selected device.
model = PlaylistModel(vocab_size, n_embd, block_size, n_hidden).to(device)
# Report the total number of parameter elements.
n_params = sum(param.numel() for param in model.parameters())
print("Numero di parametri:", n_params)



In [None]:
# --- Optimizer and DataLoaders ---
batch_size = 64

# Wrap each split as a dataset of (context, target) pairs.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(inputs, targets)
    for inputs, targets in ((Xtr, Ytr), (Xval, Yval), (Xte, Yte))
)

# Only the training loader shuffles; validation and test keep their stored order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size)
    for dataset in (val_dataset, test_dataset)
)

# Plain SGD with an initial learning rate of 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Step count at which the training loop switches the learning rate.
numb_step_change = 1500

@torch.no_grad()
def evaluate_loss(loader, net=None, dev=None):
    """Return the mean per-sample cross-entropy of a model over ``loader``.

    The model is put in eval mode for the duration of the pass and its previous
    train/eval mode is restored afterwards, even if an exception is raised.

    Args:
        loader: iterable yielding ``(inputs, targets)`` tensor batches.
        net: model to evaluate; defaults to the notebook-level ``model``.
        dev: device the batches are moved to; defaults to the notebook-level
            ``device``.

    Returns:
        Summed cross-entropy divided by the number of samples seen (float).

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    net = model if net is None else net
    dev = device if dev is None else dev

    was_training = net.training
    net.eval()
    total_loss = 0.0
    total_samples = 0
    try:
        for xb, yb in loader:
            xb, yb = xb.to(dev), yb.to(dev)
            logits = net(xb)
            # Sum (not mean) per batch so a smaller last batch is weighted correctly.
            total_loss += F.cross_entropy(logits, yb, reduction='sum').item()
            total_samples += xb.size(0)
    finally:
        # Restore whatever mode the caller had instead of forcing train mode.
        net.train(was_training)

    if total_samples == 0:
        raise ValueError("evaluate_loss: loader produced no samples")
    return total_loss / total_samples



In [None]:
# --- Training ---
max_steps = 2000
log_interval = 100
decayed_lr = 0.01  # learning rate applied from step numb_step_change onwards
step = 0
train_loss_history = []   # one entry per optimisation step
val_loss_history = []     # one entry per logging interval
steps_list = []           # step numbers at which validation was measured

# With an empty loader the inner `for` never runs and `step` never advances,
# so the `while` below would spin forever; fail fast instead.
if len(train_loader) == 0:
    raise ValueError("train_loader is empty: nothing to train on")

model.train()

# The outer loop restarts the loader (a new epoch) until max_steps is reached.
while step < max_steps:
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = F.cross_entropy(logits, yb)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        train_loss_history.append(loss_value)
        step += 1

        # One-off learning-rate drop at the configured step.
        if step == numb_step_change:
            for param_group in optimizer.param_groups:
                param_group['lr'] = decayed_lr

        # Periodic validation and progress line.
        if step % log_interval == 0:
            val_loss = evaluate_loss(val_loader)
            steps_list.append(step)
            val_loss_history.append(val_loss)
            print(f'{step:7d}/{max_steps:7d}: Train loss = {loss_value:.4f}, Val loss = {val_loss:.4f}')

        if step >= max_steps:
            break



In [None]:
# --- Plot of training and validation loss ---
fig, ax = plt.subplots(figsize=(10, 6))
# train_loss_history[i] is the loss of step i + 1 (the step counter is
# incremented right after the append), so plot it on 1-based steps to line it
# up with the validation points, which are recorded against `step`.
train_steps = range(1, len(train_loss_history) + 1)
ax.plot(train_steps, train_loss_history, label="Train Loss", alpha=0.6)
ax.plot(steps_list, val_loss_history, 'ro-', label="Validation Loss", markersize=5)
ax.set_xlabel("Steps")
ax.set_ylabel("Cross Entropy Loss")
ax.set_title("Training e Validation Loss")
ax.legend()
plt.show()

# --- Final evaluation on the training and validation sets ---
print("Train Loss:", evaluate_loss(train_loader))
print("Val Loss:", evaluate_loss(val_loader))



In [None]:
# --- Test-set metrics: loss, Precision@k, Recall@k, MRR ---
# Everything is accumulated batch by batch. Materialising the logits of the
# whole test set (N x vocab_size) and fully sorting each row is unnecessary
# and does not scale with the vocabulary; top-k on the raw logits gives the
# same ranking as sorting the softmax probabilities, since softmax is monotonic.
k_values = [1, 2, 3, 5]
model.eval()

max_k = max(k_values)
N = 0                                   # number of test samples seen
loss_sum = 0.0                          # summed cross-entropy
hits_at_k = {k: 0.0 for k in k_values}  # samples whose target is in the top k
reciprocal_rank_sum = 0.0

with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss_sum += F.cross_entropy(logits, yb, reduction='sum').item()

        # Indices of the max_k highest logits per row, best first.
        topk = logits.topk(min(max_k, logits.size(1)), dim=1).indices
        matches = topk == yb.unsqueeze(1)
        for k in k_values:
            hits_at_k[k] += matches[:, :k].any(dim=1).sum().item()

        # Rank of the target = number of logits >= the target's logit
        # (ties count against the target).
        true_logits = logits.gather(1, yb.unsqueeze(1))
        ranks = (logits >= true_logits).sum(dim=1).float()
        reciprocal_rank_sum += (1.0 / ranks).sum().item()

        N += yb.size(0)

if N == 0:
    raise ValueError("test_loader produced no samples")

test_loss = loss_sum / N
print("Test Loss:", test_loss)

# Each sample has exactly one correct item, so Recall@k is the hit rate and
# Precision@k is that hit rate divided by k.
recall_at_k = {k: hits_at_k[k] / N for k in k_values}
precision_at_k = {k: recall_at_k[k] / k for k in k_values}

for k in k_values:
    print(f"Precision@{k}: {precision_at_k[k]:.4f}, Recall@{k}: {recall_at_k[k]:.4f}")

mrr = reciprocal_rank_sum / N
print(f"MRR: {mrr:.4f}")

In [None]:
# TODO: implement hyperparameter tuning over n_embd, n_hidden and the batch
# size, since these parameters control the model's capacity and the efficiency
# of training.
# Start by defining a reasonable search space, e.g.
#   n_embd     in {50, 100, 200, 300}
#   n_hidden   in {100, 200, 300, 400}
#   batch size in {32, 64, 128}
# and consider techniques such as grid search and random search.
# Use a reliable validation set and monitor both the loss and the task-specific
# metrics (such as Precision@k and MRR) to guide the choices.
pass

In [None]:
# --- Persist the trained model's weights ---
model_path = "playlist_model.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)
print("Modello salvato in:", model_path)

In [None]:
# --- Download link for the saved file ---
# Option 1: render a clickable link with IPython.display.FileLink
print("Clicca sul link sottostante per scaricare il modello:")
download_link = FileLink(model_path)
display(download_link)

# Option 2 (alternative): when running on Google Colab you can use instead:
# from google.colab import files
# files.download(model_path)