In [1]:
!pip install torch numpy pandas scikit-learn annoy implicit nearpy lshashpy3

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nearpy
  Downloading NearPy-1.0.0-py2.py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lshashpy3
  Downloading lshashpy3-0.0.9.tar.gz (9.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached 

In [2]:
import torch
import numpy as np
import pandas as pd
import pickle
from sklearn.decomposition import PCA
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from annoy import AnnoyIndex
import gc
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import MultiLabelBinarizer
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
import json

# Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Rutas a los archivos en tu Google Drive
data_folder = '/content/drive/MyDrive/RecSys/'

# Leer data/train_data.csv
train_data = pd.read_csv(data_folder + 'data/train_data.csv')

# Leer el pickle embeddings_train.pkl
with open(data_folder + 'data/embeddings_train.pkl', 'rb') as f:
    embeddings_dict_train = pickle.load(f)

# Convertir diccionario de embeddings a una matriz
track_ids = list(embeddings_dict_train.keys())
embeddings_matrix = np.array([embeddings_dict_train[tid] for tid in track_ids])

# Realizar PCA con 104 componentes (capturando el 95% de la varianza)
pca = PCA(n_components=104)
embeddings_matrix_reduced = pca.fit_transform(embeddings_matrix)

In [None]:
# Parámetros
hidden_dim = 128
batch_size = 64
num_epochs = 50
learning_rate = 1e-3

# Definición del Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# Convertir embeddings_matrix_reduced a tensor
embeddings_tensor = torch.tensor(embeddings_matrix_reduced, dtype=torch.float32)

# Crear DataLoader
dataset = TensorDataset(embeddings_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Inicializar el modelo, criterio y optimizador
model = Autoencoder(input_dim=104, hidden_dim=hidden_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Entrenamiento del autoencoder
for epoch in range(num_epochs):
    for data in dataloader:
        inputs = data[0]
        outputs = model(inputs)
        loss = criterion(outputs, inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Guardar el modelo entrenado
torch.save(model.state_dict(), data_folder + 'autoencoder.pth')

In [None]:
# Cargar el modelo entrenado
model = Autoencoder(input_dim=104, hidden_dim=hidden_dim)
model.load_state_dict(torch.load(data_folder + 'autoencoder.pth'))
model.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=104, out_features=128, bias=True)
    (1): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=104, bias=True)
    (1): ReLU()
  )
)

In [None]:
# Construir el índice Annoy para los embeddings de canciones
def build_annoy_index(embeddings_tensor, batch_size=10000):
    index = AnnoyIndex(hidden_dim, 'angular')
    num_batches = len(embeddings_tensor) // batch_size + 1

    for batch in range(num_batches):
        start = batch * batch_size
        end = min((batch + 1) * batch_size, len(embeddings_tensor))
        batch_embeddings = embeddings_tensor[start:end]

        for i, embedding in enumerate(batch_embeddings):
            with torch.no_grad():
                compressed_embedding = model.encoder(embedding.unsqueeze(0)).squeeze()
            index.add_item(start + i, compressed_embedding.numpy())

        # Liberar memoria de los embeddings y el modelo en cada lote
        del batch_embeddings
        gc.collect()

        print(f'Processed batch {batch + 1}/{num_batches}')

    index.build(10)  # Ajustar el número de árboles según sea necesario
    return index

# Construir y guardar el índice Annoy
index = build_annoy_index(embeddings_tensor)
index.save(data_folder + 'annoy_index.ann')

In [None]:
# Abrir index de data_folder + 'annoy_index.ann'
index = AnnoyIndex(hidden_dim, 'angular')
index.load(data_folder + 'annoy_index.ann')

True

In [None]:
track_id_to_index = {track_id: i for i, track_id in enumerate(track_ids)}

In [None]:
# Función para obtener embeddings de playlists
def get_playlist_embeddings(playlist_track_ids):
    valid_track_ids = [tid for tid in playlist_track_ids if tid in track_id_to_index]
    valid_indices = [track_id_to_index[tid] for tid in valid_track_ids]
    track_embeddings = embeddings_matrix_reduced[valid_indices]  # Usar embeddings reducidos
    track_embeddings_tensor = torch.tensor(track_embeddings, dtype=torch.float32)

    # Calcular el embedding de la playlist
    with torch.no_grad():
        playlist_embedding = model.encoder(track_embeddings_tensor)

    # Asegurar que el embedding tenga la dimensión correcta
    playlist_embedding = playlist_embedding.mean(dim=0)

    return playlist_embedding.numpy()

# Función para recomendar canciones a una playlist
def recommend_songs(playlist_id, top_n=10):
    playlist_track_ids = train_data[train_data['pid'] == playlist_id]['tid'].values
    playlist_embedding = get_playlist_embeddings(playlist_track_ids)
    similar_items = index.get_nns_by_vector(playlist_embedding, top_n)
    return [int(track_ids[i]) for i in similar_items]  # Convertir a int para asegurar serialización

# Diccionario para almacenar las recomendaciones
recommendations_dict = {}

# Iterar sobre todos los pid en train_data
for playlist_id in train_data['pid'].unique():
    try:
        recommendations_dict[str(playlist_id)] = {
            'top_10': recommend_songs(playlist_id, top_n=10),
            'top_20': recommend_songs(playlist_id, top_n=20)
        }
    except Exception as e:
        print(f"Error al procesar la playlist {playlist_id}: {e}")

# Convertir a tipos nativos de Python para serialización JSON
recommendations_dict = {k: v if isinstance(v, (int, list, dict)) else v.item() for k, v in recommendations_dict.items()}

# Guardar las recomendaciones en un archivo JSON
with open(data_folder + 'data/recommendations_ann.json', 'w') as f:
    json.dump(recommendations_dict, f)

print("Recomendaciones guardadas en recommendations_ann.json")

Recomendaciones guardadas en recommendations_ann.json


ALS

In [None]:
# Crear una lista de listas donde cada sublista contiene los track ids de una playlist
playlist_tracks = train_data.groupby('pid')['tid'].apply(list).tolist()

# Crear una matriz de interacción (playlists x tracks)
mlb = MultiLabelBinarizer()
interaction_matrix = mlb.fit_transform(playlist_tracks)
interaction_matrix_sparse = coo_matrix(interaction_matrix)

# Obtener los índices de las playlists y las canciones
playlist_ids = train_data['pid'].unique()
track_ids = mlb.classes_

# Crear un diccionario para mapear track ids a índices
track_id_to_idx = {track_id: idx for idx, track_id in enumerate(track_ids)}

# Crear un diccionario para mapear pid a índices
playlist_id_to_idx = {pid: idx for idx, pid in enumerate(playlist_ids)}

In [None]:
# Configurar y entrenar el modelo ALS
model = AlternatingLeastSquares(factors=128, regularization=0.1, iterations=50)
model.fit(interaction_matrix_sparse.T)  # Transponemos la matriz para obtener factores de canciones

In [None]:
# Guardar el modelo als
with open(data_folder + 'als_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Cargar el modelo als
with open(data_folder + 'als_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Función para recomendar canciones a una playlist utilizando ALS
def recommend_songs_als(playlist_id, top_n=10):
    # Obtener el índice de la playlist
    playlist_idx = playlist_id_to_idx.get(playlist_id)

    if playlist_idx is None:
        print(f'Playlist con id {playlist_id} no encontrada.')
        return []

    # Obtener las recomendaciones utilizando el modelo ALS
    playlist_factors = model.user_factors[playlist_idx]
    scores = np.dot(model.item_factors, playlist_factors)
    top_song_indices = np.argsort(-scores)[:top_n]
    top_song_ids = [track_ids[idx] for idx in top_song_indices]

    return top_song_ids

# Diccionario para almacenar las recomendaciones
recommendations_dict = {}

# Iterar sobre todos los pid en train_data
for playlist_id in train_data['pid'].unique():
    recommendations_dict[str(playlist_id)] = {
        'top_10': [int(song_id) for song_id in recommend_songs_als(playlist_id, top_n=10)],
        'top_20': [int(song_id) for song_id in recommend_songs_als(playlist_id, top_n=20)]
    }

# Guardar las recomendaciones en un archivo JSON
with open(data_folder + 'data/recommendations_als.json', 'w') as f:
    json.dump(recommendations_dict, f)

print("Recomendaciones guardadas en recommendations_als.json")

Recomendaciones guardadas en recommendations_als.json


LSH

In [None]:
# Crear una lista de listas donde cada sublista contiene los track ids de una playlist
playlist_tracks = train_data.groupby('pid')['tid'].apply(list).tolist()

# Crear un diccionario para mapear track_id a índice en embeddings_matrix_reduced
track_ids = train_data['tid'].unique()
track_id_to_index = {track_id: i for i, track_id in enumerate(track_ids)}

# Configurar el motor de LSH con hash binario aleatorio
dimension = embeddings_matrix_reduced.shape[1]  # Dimensión de los embeddings reducidos
num_hashes = 10  # Número de funciones hash a utilizar
lsh_engine = LSHash(num_hashes, dimension)

# Indexar los embeddings reducidos con LSH
for track_id, embedding in zip(track_ids, embeddings_matrix_reduced):
    lsh_engine.index(embedding, extra_data=f'track_{track_id_to_index[track_id]}')

In [None]:
# Función para buscar vecinos cercanos usando LSH
def find_nearest_neighbors(embedding, lsh_engine, top_n=10):
    query = lsh_engine.query(embedding, num_results=top_n)
    nearest_neighbors = [int(result[0][1].split('_')[1]) for result in query]
    return nearest_neighbors

# Función para recomendar canciones a una playlist específica usando LSH
def recommend_songs_to_playlist(pid, embeddings_matrix_reduced, lsh_engine, top_n=10):
    # Obtener los track_ids asociados a la playlist pid
    playlist_track_ids = train_data[train_data['pid'] == pid]['tid'].values

    # Filtrar los track_ids que están dentro del rango válido de embeddings_matrix_reduced
    valid_track_ids = [tid for tid in playlist_track_ids if tid in track_id_to_index]

    # Obtener los embeddings reducidos de los tracks de la playlist
    playlist_embeddings = [embeddings_matrix_reduced[track_id_to_index[tid]] for tid in valid_track_ids]

    # Promedio de embeddings de tracks en la playlist
    playlist_embedding = np.mean(playlist_embeddings, axis=0)

    # Buscar vecinos cercanos usando LSH
    nearest_neighbors = find_nearest_neighbors(playlist_embedding, lsh_engine, top_n=top_n)

    return nearest_neighbors


In [None]:
# Diccionario para almacenar las recomendaciones
recommendations_dict = {}

# Iterar sobre todos los pid en train_data
for playlist_id in train_data['pid'].unique():
    recommendations_dict[str(playlist_id)] = {
        'top_10': [int(song_id) for song_id in recommend_songs_to_playlist(playlist_id, embeddings_matrix_reduced, lsh_engine, top_n=10)],
        'top_20': [int(song_id) for song_id in recommend_songs_to_playlist(playlist_id, embeddings_matrix_reduced, lsh_engine, top_n=20)]
    }

# Guardar las recomendaciones en un archivo JSON
with open(data_folder + 'data/recommendations_lsh.json', 'w') as f:
    json.dump(recommendations_dict, f)

print("Recomendaciones guardadas en recommendations_lsh.json")

Recomendaciones guardadas en recommendations_lsh.json


Random

In [4]:
# Obtener todos los track_ids únicos
track_ids = train_data['tid'].unique()

# Función para recomendar canciones de forma aleatoria
def recommend_random_songs(track_ids, top_n=10):
    return np.random.choice(track_ids, top_n, replace=False).tolist()

# Diccionario para almacenar las recomendaciones aleatorias
random_recommendations_dict = {}

# Iterar sobre todos los pid en train_data
for playlist_id in train_data['pid'].unique():
    random_recommendations_dict[str(playlist_id)] = {
        'top_10': [int(song_id) for song_id in recommend_random_songs(track_ids, top_n=10)],
        'top_20': [int(song_id) for song_id in recommend_random_songs(track_ids, top_n=20)]
    }

# Guardar las recomendaciones aleatorias en un archivo JSON
with open(data_folder + 'data/recommendations_random.json', 'w') as f:
    json.dump(random_recommendations_dict, f)

print("Recomendaciones aleatorias guardadas en recommendations_random.json")


Recomendaciones aleatorias guardadas en recommendations_random.json
