In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
from torch.utils.data import IterableDataset, DataLoader

from external_information import ExternalInformationFusionDTPC, ExternalInformationDense
from partial_information import CoordLSTM

In [2]:
class ParquetCityDDataset(IterableDataset):
    """
    IterableDataset that streams a large Parquet file in chunks and keeps
    only the rows whose `city` value is in `city_list`.

    Each yielded sample is a tuple of tensors:

        (uid, d, t, city_idx, poi_vector, coords_seq)

    * `uid`, `d`, `t`, `city_idx` are 0-dim int64 tensors,
    * `poi_vector` is the float32 row of the `POI` column
      ((85,) according to the original notes),
    * `coords_seq` is a float32 tensor of shape (1, 2) holding `[[x, y]]`.

    When iterated from a multi-worker `DataLoader`, record batches are
    assigned round-robin to the workers so that every row is emitted exactly
    once per pass instead of once per worker.
    """

    def __init__(
        self,
        parquet_path: str,
        city_list: list[str],
        chunk_size: int = 10_000
    ):
        """
        Args:
            parquet_path: path of the Parquet file to stream.
            city_list: city labels to keep; the position of a label in this
                list becomes its `city_idx`.
            chunk_size: number of rows per record batch read from the file.
        """
        self.parquet_path = parquet_path
        self.city_list    = city_list
        self.city_set     = set(city_list)
        # maps each city label to an index 0..len(city_list)-1
        self.city_to_idx  = {c: i for i, c in enumerate(city_list)}
        self.chunk_size   = chunk_size

    def __iter__(self):
        # Every DataLoader worker runs __iter__ on its own copy of the
        # dataset. Without sharding each worker would stream the whole file
        # and all samples would be duplicated `num_workers` times.
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            # single-process loading: this iterator owns every batch
            worker_id, num_workers = 0, 1
        else:
            worker_id, num_workers = worker_info.id, worker_info.num_workers

        # loop-invariant: the set of allowed city labels as an Arrow array
        allowed_cities = pa.array(list(self.city_set))

        pf = pq.ParquetFile(self.parquet_path)
        for batch_idx, batch in enumerate(pf.iter_batches(batch_size=self.chunk_size)):
            # Round-robin sharding. Each worker still scans the file, but
            # only converts and yields the batches assigned to it.
            if batch_idx % num_workers != worker_id:
                continue

            table = pa.Table.from_batches([batch], schema=pf.schema_arrow)

            # keep only the allowed cities
            mask = pc.is_in(table.column("city"), allowed_cities)
            table = table.filter(mask)
            if table.num_rows == 0:
                continue

            # extract the columns as NumPy arrays
            uid_arr  = table.column("uid").to_numpy().astype(np.int64)
            d_arr    = table.column("d"  ).to_numpy().astype(np.int64)
            t_arr    = table.column("t"  ).to_numpy().astype(np.int64)
            poi_arr  = np.vstack(table.column("POI").to_pylist()).astype(np.float32)
            x_arr    = table.column("x").to_numpy().astype(np.float32)
            y_arr    = table.column("y").to_numpy().astype(np.float32)
            city_arr = table.column("city").to_pylist()
            city_idx = np.array([self.city_to_idx[c] for c in city_arr], dtype=np.int64)

            # yield one sample per surviving row
            for uid, d, t, cidx, poi, x, y in zip(
                uid_arr, d_arr, t_arr, city_idx, poi_arr, x_arr, y_arr
            ):
                coords_seq = np.array([[x, y]], dtype=np.float32)  # (1,2)
                yield (
                    torch.tensor(uid, dtype=torch.long),
                    torch.tensor(d,   dtype=torch.long),
                    torch.tensor(t,   dtype=torch.long),
                    torch.tensor(cidx, dtype=torch.long),
                    torch.from_numpy(poi),               # POI feature row
                    torch.from_numpy(coords_seq)         # (1,2)
                )

## Fusão de dados

In [3]:
class WeightedFusion(nn.Module):
    """
    Fuses two tensors of identical shape (B, dim) with a weighted sum:

        output = w_r * static_red + w_e * dyn_emb

    `w_r` and `w_e` are learnable scalar parameters, so the output has the
    same shape (B, dim) as the inputs.
    """

    def __init__(self, dim: int = 20, init_w_r: float = 0.5, init_w_e: float = 0.5):
        super().__init__()
        self.dim = dim
        # one learnable 0-dim weight per input branch
        static_weight = torch.tensor(init_w_r, dtype=torch.float32)
        dynamic_weight = torch.tensor(init_w_e, dtype=torch.float32)
        self.w_r = nn.Parameter(static_weight)
        self.w_e = nn.Parameter(dynamic_weight)

    def forward(self, static_red: torch.Tensor, dyn_emb: torch.Tensor) -> torch.Tensor:
        """
        Args:
            static_red: Tensor[B, dim], reduced static feature vector.
            dyn_emb:    Tensor[B, dim], reduced dynamic (LSTM) vector.

        Returns:
            Tensor[B, dim], the weighted fusion of both inputs.
        """
        # both inputs must agree in shape and carry `dim` features on axis 1
        shapes_match = static_red.shape == dyn_emb.shape
        assert shapes_match and static_red.size(1) == self.dim

        static_term = self.w_r * static_red
        dynamic_term = self.w_e * dyn_emb
        return static_term + dynamic_term

In [4]:
# User count for each city label; used to size the user embedding.
n_users_by_city = dict(A=100_000, B=25_000, C=20_000, D=6_000)

In [5]:

# Smoke test: run a single batch of city "D" through the static and the
# dynamic encoders and check that both produce vectors of the same size.
parquet_file = "humob_all_cities_dpsk.parquet"
n_users_D    = n_users_by_city["D"]

# --- models ---------------------------------------------------------------
# NOTE(review): with city_list=["D"] the dataset emits city index 0 for "D",
# while the fusion model is built with n_cities=4; confirm this indexing
# is what the model expects.
fusion = ExternalInformationFusionDTPC(
    n_users=n_users_D, n_days=75, n_slots=48, n_cities=4,
    emb_dim=10, poi_in_dim=85, poi_out_dim=10,
)
dense = ExternalInformationDense(in_dim=fusion.out_dim, out_dim=20)
lstm  = CoordLSTM(input_size=2, hidden_size=10, bidirectional=True)

# --- data -----------------------------------------------------------------
ds     = ParquetCityDDataset(parquet_file, city_list=["D"], chunk_size=2500)
loader = DataLoader(ds, batch_size=32, num_workers=2)

# --- device placement -----------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
fusion.to(device)
dense.to(device)
lstm.to(device)

# --- forward pass on the first batch --------------------------------------
batch = next(iter(loader))
uid, d, t, city, poi, coords_seq = (tensor.to(device) for tensor in batch)

# static branch: embeddings + POI projection, then reduction to 20 features
static_emb = fusion(uid, d, t, city, poi)
static_red = dense(static_emb)
# dynamic branch: LSTM encoding of the coordinate sequence
dyn_emb = lstm(coords_seq)

# both are expected to be (32, 20), matching the recorded output
print("static_red shape:", static_red.shape)
print("dyn_emb    shape:", dyn_emb.shape)

Using device: cuda
static_red shape: torch.Size([32, 20])
dyn_emb    shape: torch.Size([32, 20])


Os dois pesos passam a ser **parâmetros aprendíveis**, cada um sendo um tensor escalar (shape `()`) com `requires_grad=True`. Veja como eles ficam logo após instanciar:

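Durante o treinamento, tanto `w_r` quanto `w_e` recebem gradientes no `loss.backward()` e são atualizados no `optimizer.step()` junto com todos os demais pesos da sua rede (desde que os parâmetros do módulo `WeightedFusion` sejam passados ao otimizador). No final do treinamento, seus valores representarão a “importância” relativa que o modelo atribuiu ao vetor estático (`static_red`) versus o vetor dinâmico (`dyn_emb`). Como os pesos não são normalizados, eles não precisam somar 1 e podem inclusive ficar negativos.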

In [6]:
B, dim = 32, 20
# To try the module without the real encoders, substitute random inputs:
# static_red = torch.rand(B, dim)
# dyn_emb    = torch.rand(B, dim)

# NOTE: this rebinds `fusion`, which previously held the
# ExternalInformationFusionDTPC model; re-run that cell before reusing it.
# The module is moved to the device of its inputs so the learnable weights
# live next to the tensors they scale, rather than staying on CPU and
# relying on PyTorch's special handling of 0-dim CPU tensors.
fusion = WeightedFusion(dim=dim).to(static_red.device)
out = fusion(static_red, dyn_emb)
print("Fused shape:", out.shape)  # torch.Size([32, 20])
print("w_r =", fusion.w_r.item(), " w_e =", fusion.w_e.item())

Fused shape: torch.Size([32, 20])
w_r = 0.5  w_e = 0.5


In [7]:
# NOTE: earlier draft of DestinationHead, kept commented out for reference.
# It projects with a single linear layer; the active DestinationHead defined
# later in this notebook uses an MLP (MLP500) instead.
# class DestinationHead(nn.Module):
#     """
#     Final head that:
#       - takes a vector of dimension 20,
#       - projects it to C logits,
#       - softmax → P (B×C),
#       - returns the final coords (B×2) = P @ centers.
#     """
#     def __init__(self, in_dim: int, cluster_centers: torch.Tensor):
#         """
#         in_dim: input dimension (20)
#         cluster_centers: C×2 tensor with the coordinates of the centers
#         """
#         super().__init__()
#         C, coord_dim = cluster_centers.shape
#         assert coord_dim == 2
#         # linear layer that produces the C logits e_i
#         self.fc = nn.Linear(in_dim, C, bias=True)
#         # initialise the bias to zero and store the centers
#         with torch.no_grad():
#             # fc.weight: [C, in_dim]; the idea was to use each center
#             # (a 2-vector) as a weight row, but the centers are 2-dim,
#             # not in_dim. So instead we project in_dim→C and store the
#             # centers separately:
#             self.centers = nn.Parameter(cluster_centers, requires_grad=False)
#             # zero bias
#             self.fc.bias.zero_()

#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         """
#         x: (B, in_dim)
#         returns: (B, 2)
#         """
#         logits = self.fc(x)               # (B, C)
#         P      = F.softmax(logits, dim=1) # (B, C)
#         # weighted mean of the centers: P @ centers -> (B,2)
#         coords = P @ self.centers         # (B,2)
#         return coords

In [14]:
class MLP500(nn.Module):
    """
    Simple MLP with one ReLU hidden layer:

        in_dim → hidden_dim → n_clusters logits

    The name refers to the hidden width of 500 used in this notebook; the
    actual width is whatever `hidden_dim` is passed.
    """
    def __init__(self, in_dim: int, hidden_dim: int, n_clusters: int):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_clusters)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map x of shape (..., in_dim) to logits of shape (..., n_clusters)."""
        x = F.relu(self.fc1(x))  # (..., hidden_dim)
        return self.fc2(x)       # (..., n_clusters)


class DestinationHead(nn.Module):
    """
    MLP500 + softmax + weighted sum of the cluster centers.

    The MLP produces one logit per cluster, the softmax turns the logits
    into a probability distribution over clusters, and the output is the
    probability-weighted average of the cluster centers.
    """
    def __init__(self,
                 in_dim: int,           # feature size of the fused vector (20 in this notebook)
                 hidden_dim: int,       # hidden width of the MLP (500 in this notebook)
                 cluster_centers: torch.Tensor  # (C,2)
    ):
        """
        Args:
            in_dim: size of the last axis of the input to `forward`.
            hidden_dim: hidden width of the internal MLP.
            cluster_centers: tensor of shape (C, 2) with one (x, y) per cluster.

        Raises:
            ValueError: if `cluster_centers` is not 2-D (shape unpacking fails).
            AssertionError: if the second axis of `cluster_centers` is not 2.
        """
        super().__init__()
        C, coord_dim = cluster_centers.shape
        assert coord_dim == 2, "cluster_centers must have shape (C, 2)"
        self.mlp500 = MLP500(in_dim, hidden_dim, C)
        # centers are stored as a buffer: they follow .to(device) and are
        # saved in the state_dict, but are not trainable
        self.register_buffer("centers", cluster_centers)

    def forward(self, fused: torch.Tensor) -> torch.Tensor:
        """
        fused: (..., in_dim) → returns coords (..., 2)
        """
        logits = self.mlp500(fused)         # (..., C)
        # Normalise over the cluster axis, which is always the last one.
        # dim=-1 equals dim=1 for (B, in_dim) inputs and stays correct for
        # inputs with fewer or more leading axes.
        P = F.softmax(logits, dim=-1)       # (..., C)
        # Align the dtype of the centers with the probabilities so that
        # e.g. float64 centers coming from NumPy do not break the matmul.
        centers = self.centers.to(dtype=P.dtype)
        # weighted mean of the centers: P @ centers → (..., 2)
        coords = P @ centers
        return coords

In [12]:
import hdbscan  

def _load_city_coords(
    parquet_path: str,
    city_letter: str,
    day_threshold: int,
    chunk_size: int,
) -> np.ndarray:
    """Stream the Parquet file and return the (N, 2) array of (x, y) rows with
    city == city_letter and d < day_threshold."""
    pf = pq.ParquetFile(parquet_path)
    chunks = []
    # Only the four columns used below are read; the remaining columns
    # (e.g. the POI vectors) are never decoded.
    for batch in pf.iter_batches(batch_size=chunk_size, columns=["city", "d", "x", "y"]):
        tbl = pa.Table.from_batches([batch])
        mask = pc.and_(
            pc.equal(tbl.column("city"), city_letter),
            pc.less(tbl.column("d"), day_threshold),
        )
        tbl = tbl.filter(mask)
        if tbl.num_rows == 0:
            continue
        xs = tbl.column("x").to_numpy()
        ys = tbl.column("y").to_numpy()
        chunks.append(np.stack([xs, ys], axis=1))

    if not chunks:
        raise ValueError(
            f"no rows found for city {city_letter!r} with d < {day_threshold} "
            f"in {parquet_path!r}"
        )
    return np.vstack(chunks)  # shape (N,2)


def _cluster_means(points: np.ndarray, labels: np.ndarray) -> np.ndarray:
    """Return the (K, 2) array with the mean point of every non-noise cluster."""
    cluster_ids = np.unique(labels)
    # negative labels mark noise points and do not form a cluster
    cluster_ids = cluster_ids[cluster_ids >= 0]
    if cluster_ids.size == 0:
        raise ValueError(
            "HDBSCAN found no clusters (all points labelled as noise); "
            "try a smaller min_cluster_size"
        )
    return np.vstack([points[labels == cid].mean(axis=0) for cid in cluster_ids])


def compute_hdbscan_centers(
    parquet_path: str,
    city_letter: str = "D",
    day_threshold: int = 60,
    chunk_size: int = 50_000,
    min_cluster_size: int = 100,
    max_samples: int = 200_000,
    random_state: "int | None" = None,
) -> torch.Tensor:
    """
    Discover destination clusters for one city and return their centers.

    Steps:
      1) read the Parquet file in chunks,
      2) keep rows with city == city_letter and d < day_threshold,
      3) accumulate the (x, y) destinations,
      4) optionally subsample them, then run HDBSCAN,
      5) return the mean point of every cluster.

    Args:
        parquet_path: path of the Parquet file.
        city_letter: city label to keep.
        day_threshold: only rows with d strictly below this value are used.
        chunk_size: number of rows per record batch read from the file.
        min_cluster_size: forwarded to `hdbscan.HDBSCAN`.
        max_samples: if more points than this are collected, a random subset
            of this size (without replacement) is clustered instead.
        random_state: seed for the subsampling. With the default `None` the
            global NumPy RNG is used, as before; pass an int for a
            reproducible subset.

    Returns:
        float32 tensor of shape (K, 2), one row per discovered cluster.

    Raises:
        ValueError: if no row matches the filter, or if HDBSCAN labels every
            point as noise.
    """
    # 1-3) collect all training destinations of the city
    coords = _load_city_coords(parquet_path, city_letter, day_threshold, chunk_size)

    # 4a) (optional) subsample to keep the clustering tractable
    if len(coords) > max_samples:
        if random_state is None:
            idx = np.random.choice(len(coords), max_samples, replace=False)
        else:
            rng = np.random.default_rng(random_state)
            idx = rng.choice(len(coords), max_samples, replace=False)
        sample = coords[idx]
    else:
        sample = coords

    # 4b) run HDBSCAN
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric="euclidean",
        cluster_selection_method="eom"
    )
    labels = clusterer.fit_predict(sample)  # shape (M,)

    # 5) centers = mean of the points of each cluster
    centers = _cluster_means(sample, labels)  # shape (K,2)

    return torch.from_numpy(centers.astype(np.float32))

In [15]:
# 1) Cluster the destinations of city "D" (days < 60) and move the
#    resulting centers to the active device
cluster_centers = compute_hdbscan_centers(
    parquet_file, city_letter="D", day_threshold=60,
    chunk_size=50_000, min_cluster_size=200,
)
cluster_centers = cluster_centers.to(device)
print(f"Found {cluster_centers.shape[0]} clusters")

# 2) Build the destination head on top of those centers
head = DestinationHead(in_dim=20, hidden_dim=500, cluster_centers=cluster_centers)
head = head.to(device)

# 3) The input is `out`, the fused vector produced by WeightedFusion in the
#    earlier cell; it is expected to have shape (B, 20)

# 4) Predict the final coordinates: one (x, y) pair per sample
z = head(out)
print("z shape:", z.shape)
print("Sample coordinates:", z[:5])



Found 111 clusters
z shape: torch.Size([32, 2])
Sample coordinates: tensor([[111.0320,  94.4655],
        [110.9680,  94.4453],
        [110.9077,  94.4409],
        [110.8467,  94.5158],
        [110.9619,  94.4962]], device='cuda:0', grad_fn=<SliceBackward0>)
