# Predicción de relaciones entre prendas con GNNs (Link Prediction + Inductive Cold-Start)


In [1]:
import os
import random
import numpy as np
import pandas as pd
import scipy.sparse as sp
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
import re




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\marke\.conda\envs\dm3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\marke\.conda\envs\dm3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\marke\.conda\envs\dm3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\marke\.cond

In [2]:

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)


device: cpu


## 1) Cargar nodos y aristas

In [3]:
NODES_PATH = 'Datos alumnos/Trasformados/season7_nodes.csv'
EDGES_PATH = 'Datos alumnos/Trasformados/season7_edges.csv'

nodes_df = pd.read_csv(NODES_PATH)
edges_df = pd.read_csv(EDGES_PATH)

# id como string para mapear consistente
nodes_df['node_id'] = nodes_df['node_id'].astype(str)
edges_df['source'] = edges_df['source'].astype(str)
edges_df['target'] = edges_df['target'].astype(str)

print('Nodos:', nodes_df.shape)
print('Aristas:', edges_df.shape)

display(nodes_df.head(3))
display(edges_df.head(3))


Nodos: (2000, 18)
Aristas: (259708, 4)


Unnamed: 0.1,node_id,Unnamed: 0,color_name,product_name,season_code,adventurous,application,composition,cut,style,weather,nivel,print,weather_norm,risk_score,style_code,cut_group,print_group
0,3,3,red,Abby Dress caribbean,7,three,work,viscose,waist_cut,classic,warm,1,printed,W,3,CL,D,E
1,4,4,black,Abelone Playsuit miniprint,7,four,freetime,viscose,waist_cut,boho,warm,1,sheets,W,4,B,D,A
2,8,8,black,Acacia Jacket ward,7,two,work,cotton,contour_darts,classic,warm_season,3,smooth,W,2,CL,A,A


Unnamed: 0,source,target,rule,weight_color
0,3,8,nivel,7
1,3,72,nivel,7
2,3,75,nivel,7


## 2) Preprocesado de features X

In [4]:
exclude = {'node_id'}
feature_cols = [c for c in nodes_df.columns if c not in exclude]

numeric_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(nodes_df[c])]
categorical_cols = [c for c in feature_cols if c not in numeric_cols]

preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='drop'
)

X_np = preprocess.fit_transform(nodes_df[feature_cols])
# to dense float32 (si es sparse)
if hasattr(X_np, 'toarray'):
    X_np = X_np.toarray()
X = torch.tensor(X_np, dtype=torch.float32, device=device)

print('X shape:', X.shape)


X shape: torch.Size([2000, 1256])


## 3) Construir grafo (adjacency)

In [5]:
node_ids = nodes_df['node_id'].tolist()
id2idx = {nid: i for i, nid in enumerate(node_ids)}
idx2id = {i: nid for nid, i in id2idx.items()}

# Filtrar edges válidas
edges_df = edges_df[edges_df['source'].isin(id2idx) & edges_df['target'].isin(id2idx)].copy()

edge_pairs = np.vstack([
    edges_df['source'].map(id2idx).to_numpy(),
    edges_df['target'].map(id2idx).to_numpy()
]).T

N = len(node_ids)

def build_adj_undirected(edge_pairs, n_nodes):
    # undirected + remove self loops (se añadirán luego)
    u = edge_pairs[:,0]
    v = edge_pairs[:,1]
    mask = u != v
    u, v = u[mask], v[mask]

    rows = np.concatenate([u, v])
    cols = np.concatenate([v, u])
    data = np.ones(len(rows), dtype=np.float32)
    return sp.csr_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))

A = build_adj_undirected(edge_pairs, N)
print('Adj shape:', A.shape, 'nnz:', A.nnz)


Adj shape: (2000, 2000) nnz: 519416


In [6]:
def get_positive_edges_from_adj(A_csr):
    A_coo = A_csr.tocoo()
    edges = np.vstack([A_coo.row, A_coo.col]).T
    edges = edges[edges[:,0] < edges[:,1]]
    return edges

pos_edges = get_positive_edges_from_adj(A)
print('pos_edges:', pos_edges.shape)

# muestreo de negativos
edge_set = set(map(tuple, pos_edges.tolist()))

def sample_negative_edges(num_samples, n_nodes, edge_set, seed=SEED):
    rng = np.random.default_rng(seed)
    neg = []
    while len(neg) < num_samples:
        u = int(rng.integers(0, n_nodes))
        v = int(rng.integers(0, n_nodes))
        if u == v:
            continue
        a, b = (u, v) if u < v else (v, u)
        if (a, b) in edge_set:
            continue
        neg.append((a, b))
    return np.array(neg, dtype=np.int64)

neg_edges = sample_negative_edges(len(pos_edges), N, edge_set)

# splits
rng = np.random.default_rng(SEED)
perm = rng.permutation(len(pos_edges))
pos_edges = pos_edges[perm]
neg_edges = neg_edges[perm]

n_test = int(0.2 * len(pos_edges))
n_val  = int(0.1 * len(pos_edges))

pos_test, neg_test = pos_edges[:n_test], neg_edges[:n_test]
pos_val,  neg_val  = pos_edges[n_test:n_test+n_val], neg_edges[n_test:n_test+n_val]
pos_train,neg_train= pos_edges[n_test+n_val:], neg_edges[n_test+n_val:]

print('train/val/test:', len(pos_train), len(pos_val), len(pos_test))

def edges_to_torch(edges):
    return torch.tensor(edges, dtype=torch.long, device=device)

pos_train_t = edges_to_torch(pos_train)
neg_train_t = edges_to_torch(neg_train)
pos_val_t   = edges_to_torch(pos_val)
neg_val_t   = edges_to_torch(neg_val)
pos_test_t  = edges_to_torch(pos_test)
neg_test_t  = edges_to_torch(neg_test)

# Build adjacency ONLY from train positive edges
A_train = build_adj_undirected(pos_train, N)


pos_edges: (259708, 2)
train/val/test: 181797 25970 51941


## 5) Normalización + sparse torch

In [7]:
def normalize_gcn(A_csr):
    A_hat = A_csr + sp.eye(A_csr.shape[0], dtype=np.float32)
    deg = np.array(A_hat.sum(axis=1)).flatten()
    deg_inv_sqrt = np.power(deg, -0.5)
    deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0.
    D_inv_sqrt = sp.diags(deg_inv_sqrt)
    return D_inv_sqrt @ A_hat @ D_inv_sqrt


def to_torch_sparse(A_csr):
    A_coo = A_csr.tocoo()
    indices = torch.tensor(np.vstack([A_coo.row, A_coo.col]), dtype=torch.long, device=device)
    values = torch.tensor(A_coo.data, dtype=torch.float32, device=device)
    return torch.sparse_coo_tensor(indices, values, size=A_coo.shape).coalesce()

A_gcn = normalize_gcn(A_train)
A_gcn_t = to_torch_sparse(A_gcn)
print('A_gcn_t:', A_gcn_t.shape, 'nnz:', A_gcn_t._nnz())


A_gcn_t: torch.Size([2000, 2000]) nnz: 365594


## 6) Modelos: Encoders (GCN / GraphSAGE / GAT)

In [8]:
class GCN(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hid_dim)
        self.fc2 = nn.Linear(hid_dim, out_dim)
        self.dropout = dropout

    def forward(self, x, adj):
        h = torch.sparse.mm(adj, x)
        h = F.relu(self.fc1(h))
        h = F.dropout(h, p=self.dropout, training=self.training)
        h = torch.sparse.mm(adj, h)
        z = self.fc2(h)
        return z


class GraphSAGEMean(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(in_dim * 2, hid_dim)
        self.fc2 = nn.Linear(hid_dim * 2, out_dim)
        self.dropout = dropout

    def forward(self, x, adj):
        neigh = torch.sparse.mm(adj, x)
        h = torch.cat([x, neigh], dim=1)
        h = F.relu(self.fc1(h))
        h = F.dropout(h, p=self.dropout, training=self.training)

        neigh2 = torch.sparse.mm(adj, h)
        h2 = torch.cat([h, neigh2], dim=1)
        z = self.fc2(h2)
        return z


class GATLayer(nn.Module):
    def __init__(self, in_dim, out_dim, dropout=0.3, negative_slope=0.2):
        super().__init__()
        self.W = nn.Linear(in_dim, out_dim, bias=False)
        self.a = nn.Linear(2*out_dim, 1, bias=False)
        self.dropout = dropout
        self.negative_slope = negative_slope

    def forward(self, x, adj_coo):
        # adj_coo: torch sparse COO (indices)
        xW = self.W(x)
        idx = adj_coo.indices()
        src, dst = idx[0], idx[1]

        h_src = xW[src]
        h_dst = xW[dst]
        e = self.a(torch.cat([h_src, h_dst], dim=1)).squeeze()
        e = F.leaky_relu(e, negative_slope=self.negative_slope)

        # softmax por nodo destino
        # implementacion eficiente: usar scatter
        # exp(e - max) por dst
        max_per_dst = torch.full((x.size(0),), -1e9, device=x.device)
        max_per_dst = max_per_dst.scatter_reduce(0, dst, e, reduce='amax', include_self=True)
        exp_e = torch.exp(e - max_per_dst[dst])

        denom = torch.zeros((x.size(0),), device=x.device)
        denom = denom.scatter_add(0, dst, exp_e)
        alpha = exp_e / (denom[dst] + 1e-12)
        alpha = F.dropout(alpha, p=self.dropout, training=self.training)

        out = torch.zeros_like(xW)
        out = out.index_add(0, dst, alpha.unsqueeze(1) * h_src)
        return out


class GAT(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout=0.3):
        super().__init__()
        self.gat1 = GATLayer(in_dim, hid_dim, dropout=dropout)
        self.gat2 = GATLayer(hid_dim, out_dim, dropout=dropout)
        self.dropout = dropout

    def forward(self, x, adj):
        # adj debe ser sparse coo
        if not adj.is_coalesced():
            adj = adj.coalesce()
        h = self.gat1(x, adj)
        h = F.elu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)
        z = self.gat2(h, adj)
        return z


## 7) Decoder (Edge-level prediction head)

In [9]:
class EdgeMLPDecoder(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2*emb_dim, emb_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(emb_dim, 1)
        )

    def forward(self, z, edges):
        zu = z[edges[:,0]]
        zv = z[edges[:,1]]
        return self.mlp(torch.cat([zu, zv], dim=1)).squeeze()


def lp_loss(z, pos_e, neg_e, decoder):
    pos_logits = decoder(z, pos_e)
    neg_logits = decoder(z, neg_e)
    logits = torch.cat([pos_logits, neg_logits])
    labels = torch.cat([
        torch.ones(pos_logits.shape[0], device=z.device),
        torch.zeros(neg_logits.shape[0], device=z.device)
    ])
    return F.binary_cross_entropy_with_logits(logits, labels)


def t2np(t: torch.Tensor) -> np.ndarray:
    return np.asarray(t.detach().cpu().tolist())

def lp_eval(z, pos_e, neg_e, decoder):
    decoder.eval()
    with torch.no_grad():
        ps_t = torch.sigmoid(decoder(z, pos_e))
        ns_t = torch.sigmoid(decoder(z, neg_e))

    ps = t2np(ps_t)
    ns = t2np(ns_t)

    y_true = np.concatenate([np.ones_like(ps), np.zeros_like(ns)])
    y_score = np.concatenate([ps, ns])

    return roc_auc_score(y_true, y_score), average_precision_score(y_true, y_score)


## 8) Entrenamiento + selección del mejor modelo

In [10]:

SAVE_DIR = Path(r"C:\Bdata3\Reto_10\Datos alumnos\Trasformados\Modelos")
SAVE_DIR.mkdir(parents=True, exist_ok=True)
print("Guardando en:", SAVE_DIR.resolve())



def train_one_model(model, decoder, X, A_t, epochs=350, lr=1e-3, weight_decay=1e-5, patience=25, log_every=10):
    model = model.to(device)
    decoder = decoder.to(device)

    opt = torch.optim.Adam(
        list(model.parameters()) + list(decoder.parameters()),
        lr=lr,
        weight_decay=weight_decay
    )

    best = {'val_auc': -1, 'val_ap': -1, 'state': None, 'epoch': 0}
    bad = 0
    epochs_run = 0

    for epoch in range(1, epochs + 1):
        epochs_run = epoch

        model.train(); decoder.train()
        z = model(X, A_t)
        loss = lp_loss(z, pos_train_t, neg_train_t, decoder)

        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(list(model.parameters()) + list(decoder.parameters()), 1.0)
        opt.step()

        model.eval(); decoder.eval()
        with torch.no_grad():
            z_eval = model(X, A_t)

        val_auc, val_ap = lp_eval(z_eval, pos_val_t, neg_val_t, decoder)

        if val_auc > best['val_auc']:
            best['val_auc'] = val_auc
            best['val_ap'] = val_ap
            best['epoch'] = epoch
            best['state'] = {
                'model': {k: v.detach().cpu() for k, v in model.state_dict().items()},
                'decoder': {k: v.detach().cpu() for k, v in decoder.state_dict().items()}
            }
            bad = 0
        else:
            bad += 1

        if epoch == 1 or epoch % log_every == 0:
            print(f"epoch {epoch:03d}/{epochs} | loss={loss.item():.4f} | val_auc={val_auc:.4f} | val_ap={val_ap:.4f} | bad={bad}")
            sys.stdout.flush()

        if bad >= patience:
            print(f"Early stopping at epoch {epoch} (best at epoch {best['epoch']}, patience={patience})")
            sys.stdout.flush()
            break

    # restore best
    model.load_state_dict({k: v.to(device) for k, v in best['state']['model'].items()})
    decoder.load_state_dict({k: v.to(device) for k, v in best['state']['decoder'].items()})

    model.eval(); decoder.eval()
    with torch.no_grad():
        z_final = model(X, A_t)

    test_auc, test_ap = lp_eval(z_final, pos_test_t, neg_test_t, decoder)


    base = f"{model.__class__.__name__}"
    save_path = SAVE_DIR / f"{base}.pt"
    i = 1
    while save_path.exists():
        save_path = SAVE_DIR / f"{base}_{i}.pt"
        i += 1

    torch.save({
        "model_class": model.__class__.__name__,
        "decoder_class": decoder.__class__.__name__,
        "best_epoch": best["epoch"],
        "epochs_run": epochs_run,
        "val_auc": best["val_auc"],
        "val_ap": best["val_ap"],
        "test_auc": test_auc,
        "test_ap": test_ap,
        "model_state_dict": model.state_dict(),
        "decoder_state_dict": decoder.state_dict(),
        "in_dim": X.shape[1],
    }, save_path)

    return {
        'epochs_run': epochs_run,
        'best_epoch': best['epoch'],
        'val_auc': best['val_auc'],
        'val_ap': best['val_ap'],
        'test_auc': test_auc,
        'test_ap': test_ap,
        'model': model,
        'decoder': decoder,
        'z': z_final
    }

Guardando en: C:\Bdata3\Reto_10\Datos alumnos\Trasformados\Modelos


In [11]:
in_dim = X.shape[1]
hid_dim = 64
emb_dim = 64

candidates = {
    "GCN": lambda: (GCN(in_dim, hid_dim, emb_dim, dropout=0.3), EdgeMLPDecoder(emb_dim)),
    "GraphSAGE": lambda: (GraphSAGEMean(in_dim, hid_dim, emb_dim, dropout=0.3), EdgeMLPDecoder(emb_dim)),
    "GAT": lambda: (GAT(in_dim, hid_dim, emb_dim, dropout=0.3), EdgeMLPDecoder(emb_dim)),
}

results = {}

for name, factory in candidates.items():
    print(f"ENTRENANDO MODELO: {name}")

    m, d = factory()
    out = train_one_model(
        m, d, X, A_gcn_t,
        epochs=350,
        lr=1e-3,
        patience=30,
        log_every=10
    )

    results[name] = out
    print(
        f"[FIN {name}] ran={out['epochs_run']} best={out['best_epoch']} | "
        f"val_auc={out['val_auc']:.4f} val_ap={out['val_ap']:.4f} | "
        f"test_auc={out['test_auc']:.4f} test_ap={out['test_ap']:.4f}"
    )

best_name = max(results, key=lambda k: results[k]["val_auc"])
best_out = results[best_name]

print(f"EL MEJOR ES = {best_name}")
print(
    f"val_auc={best_out['val_auc']:.4f} val_ap={best_out['val_ap']:.4f} | "
    f"test_auc={best_out['test_auc']:.4f} test_ap={best_out['test_ap']:.4f}"
)


ENTRENANDO MODELO: GCN
epoch 001/350 | loss=0.6940 | val_auc=0.7864 | val_ap=0.8024 | bad=0
epoch 010/350 | loss=0.6842 | val_auc=0.8287 | val_ap=0.8391 | bad=0
epoch 020/350 | loss=0.6533 | val_auc=0.8249 | val_ap=0.8344 | bad=10
epoch 030/350 | loss=0.5947 | val_auc=0.8149 | val_ap=0.8178 | bad=20
epoch 040/350 | loss=0.5387 | val_auc=0.8227 | val_ap=0.8114 | bad=30
Early stopping at epoch 40 (best at epoch 10, patience=30)
[FIN GCN] ran=40 best=10 | val_auc=0.8287 val_ap=0.8391 | test_auc=0.8260 test_ap=0.8143
ENTRENANDO MODELO: GraphSAGE
epoch 001/350 | loss=0.6936 | val_auc=0.7891 | val_ap=0.7776 | bad=0
epoch 010/350 | loss=0.6612 | val_auc=0.8183 | val_ap=0.8240 | bad=6
epoch 020/350 | loss=0.5438 | val_auc=0.8157 | val_ap=0.8213 | bad=16
epoch 030/350 | loss=0.5061 | val_auc=0.8398 | val_ap=0.8281 | bad=0
epoch 040/350 | loss=0.4727 | val_auc=0.8724 | val_ap=0.8460 | bad=0
epoch 050/350 | loss=0.4347 | val_auc=0.8921 | val_ap=0.8582 | bad=0
epoch 060/350 | loss=0.4055 | val_auc

In [12]:
rows = []
for name, out in results.items():
    rows.append({
        "modelo": name,
        "best_epoch": out["best_epoch"],
        "val_auc": out["val_auc"],
        "val_ap": out["val_ap"],
        "test_auc": out["test_auc"],
        "test_ap": out["test_ap"],
        "epochs_run": out["epochs_run"],
    })

df_results = pd.DataFrame(rows).sort_values("val_auc", ascending=False).reset_index(drop=True)
df_results

Unnamed: 0,modelo,best_epoch,val_auc,val_ap,test_auc,test_ap,epochs_run
0,GraphSAGE,350,0.979613,0.97253,0.979146,0.972067,350
1,GAT,350,0.94604,0.919745,0.94562,0.919254,350
2,GCN,10,0.828719,0.839104,0.825995,0.814264,40


## 9) Embeddings finales (mejor modelo)

In [13]:
z_best = best_out['z']

print("z_best:", tuple(z_best.shape))
print("Embedding nodo 0 (10 vals):", z_best[0, :10].detach().cpu().tolist())


z_best: (2000, 64)
Embedding nodo 0 (10 vals): [-0.08812612295150757, -0.6273081302642822, -0.3905266523361206, -0.07452064752578735, 0.1502360701560974, 0.18449687957763672, 0.21944749355316162, 0.17413607239723206, -0.3040415644645691, -0.667189359664917]


In [14]:
def knn_neighbors_for_new_item(x_new_np, X_np_all, k=5):
    sims = cosine_similarity(x_new_np.reshape(1, -1), X_np_all)[0]
    return np.argsort(sims)[-k:]


def extend_adj_with_new_node(A_base, neighbors_idx):
    # A_base: csr (N,N) ; devuelve coo (N+1,N+1)
    A_coo = A_base.tocoo()
    n = A_base.shape[0]

    # edges (new <-> neighbors)
    rows_new = np.concatenate([neighbors_idx, np.full(len(neighbors_idx), n)])
    cols_new = np.concatenate([np.full(len(neighbors_idx), n), neighbors_idx])
    data_new = np.ones(len(rows_new), dtype=np.float32)

    rows = np.concatenate([A_coo.row, rows_new])
    cols = np.concatenate([A_coo.col, cols_new])
    data = np.concatenate([A_coo.data.astype(np.float32), data_new])

    return sp.coo_matrix((data, (rows, cols)), shape=(n+1, n+1))


def embed_new_item(model, X, A_train_base, x_new_np, k=5):
    X_all_np = np.asarray(X.detach().cpu().tolist(), dtype=np.float32)

    neigh = knn_neighbors_for_new_item(x_new_np, X_all_np, k=k)

    A_ext = extend_adj_with_new_node(A_train_base, neigh)
    A_ext = normalize_gcn(A_ext.tocsr())
    A_ext_t = to_torch_sparse(A_ext).to(device)

    x_new_t = torch.tensor(x_new_np, dtype=torch.float32, device=device)
    if x_new_t.dim() == 1:
        x_new_t = x_new_t.unsqueeze(0)

    X_ext = torch.cat([X, x_new_t], dim=0)

    model.eval()
    with torch.no_grad():
        z_ext = model(X_ext, A_ext_t)

    z_new = z_ext[-1]
    z_existing = z_ext[:-1]
    return z_new, z_existing, neigh


def score_new_item_links(decoder, z_new, z_existing, top_k=10):
    z_new_rep = z_new.unsqueeze(0).repeat(z_existing.shape[0], 1)
    pair = torch.cat([z_new_rep, z_existing], dim=1)

    decoder.eval()
    with torch.no_grad():
        scores = torch.sigmoid(decoder.mlp(pair)).squeeze()

    topk = torch.topk(scores, k=min(top_k, scores.numel()))
    idxs = topk.indices.detach().cpu().tolist()
    vals = topk.values.detach().cpu().tolist()
    return idxs, vals


In [15]:

# ------------------------
# EJEMPLO DE USO
# ------------------------

model_best = best_out["model"]
decoder_best = best_out["decoder"]

# Crea una "prenda nueva" tomando una fila existente y cambiando algún valor
new_item_row = nodes_df.iloc[0].copy()

x_new_np = preprocess.transform(pd.DataFrame([new_item_row[feature_cols]]) )
if hasattr(x_new_np, 'toarray'):
    x_new_np = x_new_np.toarray()
x_new_np = x_new_np.astype(np.float32).reshape(-1)

z_new, z_existing, neigh = embed_new_item(model_best, X, A_train, x_new_np, k=5)
idxs, vals = score_new_item_links(decoder_best, z_new, z_existing, top_k=10)

print('KNN vecinos usados (indices):', neigh)
print("Top-10 relaciones predichas para prenda nueva:")



new_item_id = "NEW_ITEM"
print("Nueva prenda id:", new_item_id)

# Top-5 vecinos más parecidos usados como anclaje (por features)
print("\nTop-5 vecinos más parecidos (KNN anclaje):")
for rank, i in enumerate(neigh[:5], start=1):
    print(rank, "node_id=", idx2id[int(i)])

# Recomendaciones finales por el modelo (link prediction)
print("\nTop-10 parecidos recomendados (node_id, score):")
for rank, (i, s) in enumerate(zip(idxs, vals), start=1):
    node_id = idx2id[int(i)]
    print(rank, "node_id=", node_id, "score=", float(s))



KNN vecinos usados (indices): [279 272 271 157   0]
Top-10 relaciones predichas para prenda nueva:
Nueva prenda id: NEW_ITEM

Top-5 vecinos más parecidos (KNN anclaje):
1 node_id= 902
2 node_id= 888
3 node_id= 887
4 node_id= 434
5 node_id= 3

Top-10 parecidos recomendados (node_id, score):
1 node_id= 224 score= 0.9413853883743286
2 node_id= 3347 score= 0.9382900595664978
3 node_id= 5300 score= 0.9368705749511719
4 node_id= 3727 score= 0.9362207055091858
5 node_id= 4003 score= 0.9352750182151794
6 node_id= 2933 score= 0.9337077736854553
7 node_id= 3451 score= 0.9330787062644958
8 node_id= 2348 score= 0.9326686263084412
9 node_id= 3165 score= 0.9323083758354187
10 node_id= 5195 score= 0.932103157043457


## 11)  Función lista: añadir prenda manualmente y sin nivel(opcional)

In [16]:
# 1) DEFINICIÓN DE PRENDA NUEVA

new_item = { 
    "node_id": "X",
    "color_name": "blue_dark",
    "product_name": "NEW T jeans ward",
    "season_code": 7,

    "adventurous": "two",
    "application": "work",
    "composition": "cotton",
    "cut": "contour_darts",
    "style": "classic",
    "weather": "warm_season",
    "nivel": "",            # vacío -> se infiere
    "print": "smooth",
    "weather_norm": "W",

    "risk_score": 2,
    "style_code": "CL",
    "cut_group": "A",
    "print_group": "B",
}


In [17]:
# =========================
# 2) INFERENCIA DE NIVEL
# =========================

lvl3 = ["jacket","coat","blazer","parka","trench","anorak","chaqueta","abrigo","cazadora","gabardina","trenca"]
lvl2 = ["tshirt","t shirt","t-shirt","tee","top","shirt","camiseta","remera","blusa",
        "sweater","knit","jumper","jersey","pullover","cardigan","sweatshirt","hoodie","sudadera"]
lvl1 = ["pant","pants","trouser","jeans","denim","short","shorts","skirt","falda","pantalon","pantalones",
        "dress","vestido","playsuit","jumpsuit","mono","overall"]

def infer_nivel(product_name: str):
    if product_name is None:
        return None
    txt = str(product_name).lower()
    txt = re.sub(r"[^a-z0-9áéíóúñü\s\-]", " ", txt)
    txt = re.sub(r"\s+", " ", txt).strip()

    for kw in lvl3:
        if kw in txt:
            return 3
    for kw in lvl2:
        if kw in txt:
            return 2
    for kw in lvl1:
        if kw in txt:
            return 1
    return None

if not new_item.get("nivel") or str(new_item.get("nivel")).strip() == "":
    inferred = infer_nivel(new_item.get("product_name", ""))
    new_item["nivel"] = inferred if inferred is not None else 0

print("Nueva prenda:", new_item["node_id"])
print("Product name:", new_item["product_name"])
print("Nivel inferido:", new_item["nivel"])


Nueva prenda: X
Product name: NEW T jeans ward
Nivel inferido: 1


In [18]:
# =========================
# 3) PREPROCESADO (MISMO PIPELINE)
# =========================

# Asegurar valores seguros para el preprocess (NO cambia el pipeline)
safe_item = new_item.copy()
safe_item["Unnamed: 0"] = 0          # índice colado
safe_item["nivel"] = int(safe_item["nivel"])
safe_item["season_code"] = int(safe_item["season_code"])
safe_item["risk_score"] = float(safe_item["risk_score"])

new_item_df = pd.DataFrame([{col: safe_item.get(col, None) for col in feature_cols}])
display(new_item_df)

x_new_np = preprocess.transform(new_item_df)

if hasattr(x_new_np, "toarray"):
    x_new_np = x_new_np.toarray()

# float + NaN-safe
x_new_np = x_new_np.astype(np.float32)
x_new_np = np.nan_to_num(x_new_np, nan=0.0, posinf=0.0, neginf=0.0)

# mantener 2D: (1, n_features)
if x_new_np.ndim == 1:
    x_new_np = x_new_np.reshape(1, -1)

print("Shape x_new_np:", x_new_np.shape)


Unnamed: 0.1,Unnamed: 0,color_name,product_name,season_code,adventurous,application,composition,cut,style,weather,nivel,print,weather_norm,risk_score,style_code,cut_group,print_group
0,0,blue_dark,NEW T jeans ward,7,two,work,cotton,contour_darts,classic,warm_season,1,smooth,W,2.0,CL,A,B


Shape x_new_np: (1, 1256)


In [19]:
model_best = best_out["model"]
decoder_best = best_out["decoder"]
x_new_t = torch.tensor(x_new_np, dtype=torch.float32, device=device).unsqueeze(0)

z_new, z_existing, neigh = embed_new_item(
    model_best, X, A_train, x_new_np, k=5
)

print("\nEmbedding nueva prenda - shape:", z_new.shape)



Embedding nueva prenda - shape: torch.Size([64])


In [20]:
# =========================
# 5) VECINOS KNN USADOS COMO ANCLAJE
# =========================

print("\nKNN vecinos usados (indices):", neigh)

print("\nTop-5 vecinos más parecidos por features (KNN anclaje):")
for rank, i in enumerate(neigh[:5], start=1):
    print(rank, "node_id=", idx2id[int(i)])


idxs, vals = score_new_item_links(
    decoder_best, z_new, z_existing, top_k=10
)

print(f"\nTop-10 relaciones recomendadas POR EL MODELO PARA la nueva prenda {new_item['node_id']}:")

for rank, (i, s) in enumerate(zip(idxs, vals), start=1):
    node_id = idx2id[int(i)]
    pname = nodes_df.loc[nodes_df["node_id"] == node_id, "product_name"].iloc[0]
    print(
        f"{rank}. node_id={node_id} | score={float(s):.4f} | product_name={pname}")



KNN vecinos usados (indices): [438 423 398 164 127]

Top-5 vecinos más parecidos por features (KNN anclaje):
1 node_id= 1250
2 node_id= 1212
3 node_id= 1147
4 node_id= 465
5 node_id= 311

Top-10 relaciones recomendadas POR EL MODELO PARA la nueva prenda X:
1. node_id=2873 | score=0.9012 | product_name=Malou Jacket blazer
2. node_id=1123 | score=0.8990 | product_name=Collection Shirt chi 
3. node_id=2422 | score=0.8972 | product_name=Lace Tshirt mix 
4. node_id=3021 | score=0.8952 | product_name=Mina Top klein 
5. node_id=1244 | score=0.8950 | product_name=Darlim Top cerise 
6. node_id=125 | score=0.8948 | product_name=Ally Shirt bright 
7. node_id=1794 | score=0.8937 | product_name=Friche Shirt solid 
8. node_id=2743 | score=0.8935 | product_name=Luca Top cerise 
9. node_id=1864 | score=0.8911 | product_name=Gmsurimi Top cap 
10. node_id=3356 | score=0.8887 | product_name=Olina Shirt collect 
