In [2]:
import torch
import numpy as np
import pandas as pd 
import pickle

In [3]:
# Leer data/train_data.csv
train_data = pd.read_csv('data/train_data.csv')

In [4]:
# Leer el pickle embeddings_train.pkl
with open('data/embeddings_train.pkl', 'rb') as f:
    embeddings_dict_train = pickle.load(f)

In [5]:
# Convertir el diccionario de embeddings a una matriz
track_ids = list(embeddings_dict_train.keys())
embeddings_matrix = np.array([embeddings_dict_train[tid] for tid in track_ids])

In [6]:
from sklearn.decomposition import PCA
pca = PCA(n_components=104)
embeddings_matrix_reduced = pca.fit_transform(embeddings_matrix)

In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Supongamos que train_data es tu DataFrame con 'pid' y 'tid'
# Y embeddings_dict_train es tu diccionario de embeddings

# Crear una lista de listas donde cada sublista contiene los track ids de una playlist
playlist_tracks = train_data.groupby('pid')['tid'].apply(list).tolist()

# Crear una matriz de interacción (playlists x tracks)
mlb = MultiLabelBinarizer()
interaction_matrix = mlb.fit_transform(playlist_tracks)

# Convertir a DataFrame para facilitar el manejo
interaction_df = pd.DataFrame(interaction_matrix, columns=mlb.classes_)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from annoy import AnnoyIndex
import numpy as np
import gc

# Parámetros
embedding_dim = embeddings_matrix_reduced.shape[1]  # Dimensión de los embeddings reducidos
hidden_dim = 128
batch_size = 64
num_epochs = 50
learning_rate = 1e-3

# Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [8]:
# Convertir embeddings_matrix_reduced a tensor
embeddings_tensor = torch.tensor(embeddings_matrix_reduced, dtype=torch.float32)

# Crear DataLoader
dataset = TensorDataset(embeddings_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Inicializar el modelo, criterio y optimizador
model = Autoencoder(embedding_dim, hidden_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [9]:
# Entrenamiento del autoencoder
model.train()
for epoch in range(num_epochs):
    for data in dataloader:
        inputs = data[0]
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Guardar el modelo entrenado
torch.save(model.state_dict(), 'autoencoder.pth')

Epoch [1/50], Loss: 0.0739
Epoch [2/50], Loss: 0.0882
Epoch [3/50], Loss: 0.0701
Epoch [4/50], Loss: 0.0784
Epoch [5/50], Loss: 0.0738
Epoch [6/50], Loss: 0.0706
Epoch [7/50], Loss: 0.0758
Epoch [8/50], Loss: 0.0737
Epoch [9/50], Loss: 0.0674
Epoch [10/50], Loss: 0.0767
Epoch [11/50], Loss: 0.0764
Epoch [12/50], Loss: 0.0788
Epoch [13/50], Loss: 0.0767
Epoch [14/50], Loss: 0.0803
Epoch [15/50], Loss: 0.0750
Epoch [16/50], Loss: 0.0656
Epoch [17/50], Loss: 0.0606
Epoch [18/50], Loss: 0.0733
Epoch [19/50], Loss: 0.0606
Epoch [20/50], Loss: 0.0666
Epoch [21/50], Loss: 0.0717
Epoch [22/50], Loss: 0.0813
Epoch [23/50], Loss: 0.0758
Epoch [24/50], Loss: 0.0943
Epoch [25/50], Loss: 0.0664
Epoch [26/50], Loss: 0.0646
Epoch [27/50], Loss: 0.0803
Epoch [28/50], Loss: 0.0797
Epoch [29/50], Loss: 0.0709
Epoch [30/50], Loss: 0.0693
Epoch [31/50], Loss: 0.0753
Epoch [32/50], Loss: 0.0731
Epoch [33/50], Loss: 0.0710
Epoch [34/50], Loss: 0.0690
Epoch [35/50], Loss: 0.0599
Epoch [36/50], Loss: 0.0681
E

In [8]:
# Cargar modelo entrenado
model = Autoencoder(embedding_dim, hidden_dim)
model.load_state_dict(torch.load('autoencoder.pth'))
model.eval()


Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=104, out_features=128, bias=True)
    (1): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=104, bias=True)
    (1): ReLU()
  )
)

In [9]:
# Obtener las representaciones comprimidas (embeddings de playlists)
def get_playlist_embeddings(playlist_track_ids):
    track_embeddings = embeddings_matrix[playlist_track_ids]
    track_embeddings_tensor = torch.tensor(track_embeddings, dtype=torch.float32)
    with torch.no_grad():
        playlist_embedding = model.encoder(track_embeddings_tensor)
    return playlist_embedding.mean(dim=0).numpy()

In [10]:
# Construir el índice Annoy
def build_annoy_index(embeddings_tensor, batch_size=1000):
    index = AnnoyIndex(hidden_dim, 'angular')
    num_batches = len(embeddings_tensor) // batch_size + 1

    for batch in range(num_batches):
        start = batch * batch_size
        end = min((batch + 1) * batch_size, len(embeddings_tensor))
        batch_embeddings = embeddings_tensor[start:end]

        for i, embedding in enumerate(batch_embeddings):
            try:
                with torch.no_grad():
                    compressed_embedding = model.encoder(embedding.unsqueeze(0)).squeeze()
                index.add_item(start + i, compressed_embedding.numpy())
            except Exception as e:
                print(f"Error en batch {batch + 1}, índice {i}: {str(e)}")
        
        # Liberar memoria de los embeddings y el modelo en cada lote
        del batch_embeddings
        gc.collect()
        
        print(f'Procesado lote {batch + 1}/{num_batches}')

    index.build(100)  # Ajustar el número de árboles según sea necesario
    return index

# Construir y guardar el índice Annoy
try:
    index = build_annoy_index(embeddings_tensor)
    index.save('annoy_index.ann')
except Exception as e:
    print(f"Error al construir el índice Annoy: {str(e)}")

Procesado lote 1/147
Procesado lote 2/147
Procesado lote 3/147
Procesado lote 4/147
Procesado lote 5/147
Procesado lote 6/147
Procesado lote 7/147
Procesado lote 8/147
Procesado lote 9/147
Procesado lote 10/147
Procesado lote 11/147
Procesado lote 12/147
Procesado lote 13/147
Procesado lote 14/147
Procesado lote 15/147
Procesado lote 16/147
Procesado lote 17/147
Procesado lote 18/147
Procesado lote 19/147
Procesado lote 20/147
Procesado lote 21/147
Procesado lote 22/147
Procesado lote 23/147
Procesado lote 24/147
Procesado lote 25/147
Procesado lote 26/147
Procesado lote 27/147
Procesado lote 28/147
Procesado lote 29/147
Procesado lote 30/147
Procesado lote 31/147
Procesado lote 32/147
Procesado lote 33/147
Procesado lote 34/147
Procesado lote 35/147
Procesado lote 36/147
Procesado lote 37/147
Procesado lote 38/147
Procesado lote 39/147
Procesado lote 40/147
Procesado lote 41/147
Procesado lote 42/147
Procesado lote 43/147
Procesado lote 44/147
Procesado lote 45/147
Procesado lote 46/1

: 