### TRY

In [1]:
!pip install requests beautifulsoup4 networkx numpy scipy torch torchvision torchaudio torch-geometric
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__.split('+')[0])")+.html


Looking in links: https://data.pyg.org/whl/torch-2.6.0+.html
Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-cluster
  Downloading torch_cluster-1.6.3.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-spline-conv
  Downloading torch_spline_conv-1.2.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-scatter, torch-sparse, torch-cluster, torch-spline-conv
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl size=547368 sha256=0b2c0fef08829

In [8]:

# Import necessary libraries
import requests
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from bs4 import BeautifulSoup
from scipy.linalg import eigh
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool
from torch_scatter import scatter_add
from transformers import AutoTokenizer, AutoModel

# Step 1: Robust Dataset Creation (Example)
dataset = [
    {"name": "Pizza", "wikidata": "https://www.wikidata.org/wiki/Q177", "label": 0},
    {"name": "Xiaolongbao", "wikidata": "https://www.wikidata.org/wiki/Q10943", "label": 2},
    {"name": "Bauhaus Archive", "wikidata": "https://www.wikidata.org/wiki/Q811389", "label": 1},
]

# Step 2: Enhanced Feature Extraction

def robust_extract(soup, selector, default=None):
    element = soup.select_one(selector)
    return element.get_text(strip=True) if element else default

def extract_features(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    features = {}

    # Wikidata statements
    for stmt in soup.select('.wikibase-statementview'):
        prop_label = robust_extract(stmt, '.wikibase-statementview-property-label')
        value = robust_extract(stmt, '.wikibase-snakview-value')
        if prop_label and value:
            features[prop_label] = value

    # Wikipedia extraction
    enwiki_link = robust_extract(soup, 'a[title="English"]', None)
    if enwiki_link:
        wiki_response = requests.get(enwiki_link['href'])
        wiki_soup = BeautifulSoup(wiki_response.text, 'html.parser')

        # Infobox extraction
        for row in wiki_soup.select('.infobox tr'):
            key = robust_extract(row, 'th')
            val = robust_extract(row, 'td')
            if key and val:
                features[f"infobox_{key}"] = val

        # Category extraction
        categories = wiki_soup.select('#mw-normal-catlinks ul li')
        features['categories'] = [cat.get_text(strip=True) for cat in categories]

        # Article length
        features['article_length'] = len(wiki_soup.select_one('body').get_text(strip=True))

    return features

# Multilingual embeddings extraction (recommended improvement)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model_emb = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def get_multilingual_embedding(texts):
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        embeddings = model_emb(**encoded).last_hidden_state.mean(dim=1)
    return embeddings.mean(dim=0).numpy()

# Step 3: Graph Construction with extracted features

def construct_graph(features):
    G = nx.Graph()
    node_mapping = {}  # maps node name (string) to numeric index (int)

    # Create numeric nodes
    for idx, (key, val) in enumerate(features.items()):
        G.add_node(idx, feature=val, original_name=key)
        node_mapping[key] = idx

    # Add edges between numeric nodes
    keys = list(features.keys())
    for i in range(len(keys) - 1):
        src_idx = node_mapping[keys[i]]
        dst_idx = node_mapping[keys[i + 1]]
        G.add_edge(src_idx, dst_idx)

    return G, node_mapping



# Step 4: Sheaf-based Positional Encodings
def sheaf_pe(G, d=4):
    A = nx.to_numpy_array(G)
    D = np.diag(np.sum(A, axis=1))
    L = D - A
    eigvals, eigvecs = eigh(L)
    return eigvecs[:, np.argsort(eigvals)[1:d+1]]

# Step 5: Advanced SheafConvLayer (provided by user)
class SheafConvLayer(nn.Module):
    def __init__(self, input_dim, output_dim, edge_index, step_size=1.0):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.edge_index = edge_index
        self.step_size = step_size
        self.linear = nn.Linear(input_dim, output_dim)
        self.sheaf_learner = nn.Linear(2 * input_dim, 1, bias=False)
        self.register_buffer('left_idx', None)
        self.register_buffer('right_idx', None)
        self._precompute_indices()

    def _precompute_indices(self):
        edge_index = self.edge_index.cpu().numpy()
        edge_dict = {(u, v): i for i, (u, v) in enumerate(zip(*edge_index))}
        left_idx, right_idx = [], []
        for i, (u, v) in enumerate(zip(*edge_index)):
            left_idx.append(i)
            rev_idx = edge_dict.get((v, u), i)
            right_idx.append(rev_idx)
        self.left_idx = torch.tensor(left_idx, dtype=torch.long)
        self.right_idx = torch.tensor(right_idx, dtype=torch.long)

    def predict_restriction_maps(self, x):
        row, col = self.edge_index
        x_row = x[row]
        x_col = x[col]
        maps = self.sheaf_learner(torch.cat([x_row, x_col], dim=1))
        return torch.tanh(maps)

    def build_laplacian(self, maps, num_nodes):
        row, col = self.edge_index.to(maps.device)
        left_maps = maps[self.left_idx]
        right_maps = maps[self.right_idx]
        non_diag = -left_maps * right_maps
        diag = scatter_add(maps ** 2, row, dim=0, dim_size=num_nodes)
        d_inv_sqrt = (diag + 1e-6).pow(-0.5)
        left_norm = d_inv_sqrt[row]
        right_norm = d_inv_sqrt[col]
        norm_vals = left_norm * non_diag * right_norm
        diag_vals = d_inv_sqrt * diag * d_inv_sqrt
        diag_indices = torch.arange(num_nodes, device=maps.device).unsqueeze(0).repeat(2, 1)
        all_indices = torch.cat([diag_indices, self.edge_index], dim=1)
        all_values = torch.cat([diag_vals.view(-1), norm_vals.view(-1)])
        return torch.sparse_coo_tensor(all_indices, all_values, size=(num_nodes, num_nodes))

    def forward(self, x):
        num_nodes = x.size(0)
        device = x.device
        self.edge_index = self.edge_index.to(device)
        self.left_idx = self.left_idx.to(device)
        self.right_idx = self.right_idx.to(device)
        maps = self.predict_restriction_maps(x)
        laplacian = self.build_laplacian(maps, num_nodes)
        return self.linear(x) - self.step_size * torch.sparse.mm(laplacian, self.linear(x))
# Step 6: SheafGNN Integration
class SheafGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, edge_index):
        super().__init__()
        self.conv1 = SheafConvLayer(input_dim, hidden_dim, edge_index)
        self.conv2 = SheafConvLayer(hidden_dim, hidden_dim, edge_index)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, batch):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = global_mean_pool(x, batch)
        return F.log_softmax(self.fc(x), dim=1)

# Step 7: Data Preparation Pipeline (corrected)
from torch_geometric.data import Data

data_list = []
for item in dataset:
    features = extract_features(item['wikidata'])
    graph, node_mapping = construct_graph(features)

    if graph.number_of_nodes() == 0:
        # Skip empty graph
        continue

    pe = sheaf_pe(graph)
    multilingual_emb = get_multilingual_embedding([item['name']])
    multilingual_emb_expanded = np.tile(multilingual_emb, (pe.shape[0], 1))

    x = torch.tensor(np.hstack([pe, multilingual_emb_expanded]), dtype=torch.float)

    edges = list(graph.edges)
    if len(edges) == 0:
        # Add self-loop to ensure edge_index has valid dimensions
        edges = [(0, 0)]

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    if edge_index.numel() == 0 or edge_index.dim() != 2 or edge_index.shape[0] != 2:
        print(f"⚠️ Skipping graph '{item['name']}' due to bad edge_index shape.")
        continue

    y = torch.tensor([item['label']], dtype=torch.long)
    batch = torch.zeros(x.size(0), dtype=torch.long)

    data = Data(x=x, edge_index=edge_index, y=y, batch=batch)
    data_list.append(data)





# Step 8: Training Example (brief)
loader = DataLoader(data_list, batch_size=2, shuffle=True)
model = SheafGNN(input_dim=x.shape[1], hidden_dim=16, output_dim=3, edge_index=edge_index)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(30):
    model.train()
    total_loss = 0
    for data in loader:
        optimizer.zero_grad()
        out = model(data.x, data.batch)
        loss = F.nll_loss(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}')


ValueError: num_samples should be a positive integer value, but got num_samples=0