In [1]:
!pip install dgl

Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cublas_cu12-12.1.3.1

In [2]:
from dgl.data.utils import load_graphs
import dgl
import torch as th
import dgl.nn as dglnn
import dgl.function as fn
import torch.nn as nn
from sklearn.metrics import roc_auc_score

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [3]:
# load_graphs returns a pair whose first element is the list of stored graphs;
# this notebook only uses the first graph in that list.
G = load_graphs('/content/drive/MyDrive/Graph.bin')[0][0]
G

Graph(num_nodes={'company': 1485, 'okved': 1491},
      num_edges={('company', 'dop_okved', 'okved'): 31667, ('company', 'main_okved', 'okved'): 1485, ('okved', 'dop_okved_for', 'company'): 31667, ('okved', 'parent', 'okved'): 1407},
      metagraph=[('company', 'okved', 'dop_okved'), ('company', 'okved', 'main_okved'), ('okved', 'company', 'dop_okved_for'), ('okved', 'okved', 'parent')])

In [None]:
# Total node and edge counts, summed over every node type and edge type of G.
G.num_nodes(), G.num_edges()

(2976, 66226)

In [56]:
# Random train/test split over the 'dop_okved' edges.
# A dedicated, seeded generator makes the split reproducible across kernel restarts
# without touching the global RNG used for model initialisation.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

split_rng = th.Generator().manual_seed(SPLIT_SEED)
# Read the edge count from the graph instead of hard-coding it.
num_dop_edges = G.num_edges('dop_okved')

# True marks a held-out (test) edge, False a training edge.
mask = th.rand(num_dop_edges, generator=split_rng) > TRAIN_FRACTION
G.edges['dop_okved'].data['mask'] = mask
# NOTE(review): reusing the same mask assumes edge i of 'dop_okved_for' is the reverse
# of edge i of 'dop_okved'; otherwise held-out edges leak through their reverse — TODO confirm.
G.edges['dop_okved_for'].data['mask'] = mask

In [68]:
# Per edge type, collect the edge IDs belonging to each side of the split:
# mask == 0 -> training edge, mask == 1 -> held-out test edge.
train_edge_dict = {}
test_edge_dict = {}
for etype in ['dop_okved', 'dop_okved_for']:
    edge_mask = G.edges[etype].data['mask']
    train_edge_dict[etype] = (edge_mask == 0).nonzero(as_tuple=True)[0]
    test_edge_dict[etype] = (edge_mask == 1).nonzero(as_tuple=True)[0]

In [72]:
# Training graph: a freshly loaded copy of the stored graph with the held-out
# (test) edges removed from both 'dop_okved' directions.
G_train = load_graphs('/content/drive/MyDrive/Graph.bin')[0][0]

for etype in ('dop_okved', 'dop_okved_for'):
  held_out_eids = test_edge_dict[etype]
  G_train = dgl.remove_edges(G_train, held_out_eids, etype=etype)

In [74]:
# Test graph: a freshly loaded copy of the stored graph with the training edges
# removed from both 'dop_okved' directions, leaving only the held-out ones.
G_test = load_graphs('/content/drive/MyDrive/Graph.bin')[0][0]

for etype in ('dop_okved', 'dop_okved_for'):
  training_eids = train_edge_dict[etype]
  G_test = dgl.remove_edges(G_test, training_eids, etype=etype)

In [73]:
# Display the training graph summary (node/edge counts per type) as a sanity check on the split.
G_train

Graph(num_nodes={'company': 1485, 'okved': 1491},
      num_edges={('company', 'dop_okved', 'okved'): 25229, ('company', 'main_okved', 'okved'): 1485, ('okved', 'dop_okved_for', 'company'): 25229, ('okved', 'parent', 'okved'): 1407},
      metagraph=[('company', 'okved', 'dop_okved'), ('company', 'okved', 'main_okved'), ('okved', 'company', 'dop_okved_for'), ('okved', 'okved', 'parent')])

In [75]:
# Display the test graph summary (node/edge counts per type) as a sanity check on the split.
G_test

Graph(num_nodes={'company': 1485, 'okved': 1491},
      num_edges={('company', 'dop_okved', 'okved'): 6438, ('company', 'main_okved', 'okved'): 1485, ('okved', 'dop_okved_for', 'company'): 6438, ('okved', 'parent', 'okved'): 1407},
      metagraph=[('company', 'okved', 'dop_okved'), ('company', 'okved', 'main_okved'), ('okved', 'company', 'dop_okved_for'), ('okved', 'okved', 'parent')])

In [76]:
class HeteroDotProductPredictor(nn.Module):
    """Scores each edge of one edge type by the dot product of its endpoint features."""

    def forward(self, graph, h, etype):
        """Return the per-edge dot-product scores for ``etype``.

        Args:
            graph: graph whose ``etype`` edges are scored.
            h: node features assigned to ``graph.ndata['h']`` (in this notebook,
                the dict of per-node-type embeddings produced by the encoder).
            etype: edge type to score.

        Returns:
            The ``'score'`` edge feature computed for ``etype``.
        """
        dot_product = fn.u_dot_v('h', 'h', 'score')
        # local_scope keeps the temporary 'h' / 'score' features off the caller's graph.
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(dot_product, etype=etype)
            edge_scores = graph.edges[etype].data['score']
        return edge_scores

In [77]:
def construct_negative_graph(graph, k, etype):
    """Build a graph of randomly corrupted edges for negative sampling.

    Every edge of ``etype`` keeps its source node, repeated ``k`` times, and each
    copy is paired with a destination drawn uniformly at random from all nodes
    of the destination type.

    Args:
        graph: heterograph to sample from.
        k: number of negative edges generated per existing edge.
        etype: canonical edge type ``(src_type, relation, dst_type)``.

    Returns:
        A heterograph containing only the ``etype`` relation, with the same
        number of nodes per node type as ``graph``.
    """
    _, _, vtype = etype
    src, _ = graph.edges(etype=etype)
    neg_src = src.repeat_interleave(k)
    # Sample with src's dtype and device so both endpoint tensors match even when
    # the graph lives on a GPU or uses a non-default index dtype.
    # NOTE: destinations are not filtered, so a "negative" can coincide with a real edge.
    neg_dst = th.randint(
        0, graph.num_nodes(vtype), (len(src) * k,),
        dtype=src.dtype, device=src.device)
    return dgl.heterograph(
        {etype: (neg_src, neg_dst)},
        num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

In [89]:
def compute_loss(pos_score, neg_score):
    """Margin (hinge) ranking loss between positive edges and their negatives.

    Computes ``mean(max(0, 1 - pos_i + neg_ij))`` over every positive edge ``i``
    and each of its negatives ``j``.

    Args:
        pos_score: scores of the positive edges, shape ``(n_edges,)`` or ``(n_edges, 1)``.
        neg_score: scores of the negative edges, ``n_edges * k`` values laid out so
            that the ``k`` negatives of one positive edge are consecutive.

    Returns:
        Scalar tensor holding the mean hinge loss.
    """
    n_edges = pos_score.shape[0]
    # Force exactly one column per positive edge so it broadcasts across that edge's
    # k negatives; a 1-D pos_score would otherwise be broadcast as a row vector.
    pos = pos_score.reshape(n_edges, 1)
    neg = neg_score.reshape(n_edges, -1)
    return (1 - pos + neg).clamp(min=0).mean()

def compute_auc(pos_score, neg_score):
    """ROC AUC for separating positive edges (label 1) from negative edges (label 0).

    Args:
        pos_score: scores of the positive edges, one per row.
        neg_score: scores of the negative edges, one per row.

    Returns:
        The value returned by ``roc_auc_score`` for the combined scores.
    """
    # detach/cpu so this also works outside no_grad and for GPU tensors;
    # ravel turns an (n, 1) score column into the 1-D vector matching the labels.
    scores = th.cat([pos_score, neg_score]).detach().cpu().numpy().ravel()
    labels = th.cat(
        [th.ones(pos_score.shape[0]), th.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

### Модель 1: GraphConv

In [87]:
class RGCN1(nn.Module):
    """Two-layer heterogeneous encoder with one ``GraphConv`` per relation.

    The 'company' input features are first passed through a linear layer that
    maps them to ``in_feats``; the per-relation outputs of each layer are summed
    per destination node type.

    Args:
        in_feats: input width expected by the first convolution layer.
        hid_feats: width of the hidden layer.
        out_feats: width of the output embeddings.
        rel_names: relation names; one convolution is created per name and layer.
        company_in_feats: width of the raw 'company' features before projection
            (defaults to 772, the value previously hard-coded).
    """

    def __init__(self, in_feats, hid_feats, out_feats, rel_names, company_in_feats=772):
        super().__init__()
        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')
        # Project company features to in_feats rather than a fixed 768, so the
        # projection always matches what conv1 expects.
        self.fc = nn.Linear(in_features=company_in_feats, out_features=in_feats)

    def forward(self, graph, inputs):
        """Encode all nodes of ``graph``.

        Args:
            graph: heterograph used for message passing.
            inputs: dict mapping node type to its input feature tensor; must
                contain a 'company' entry.

        Returns:
            Dict of embeddings as returned by the second convolution layer.
        """
        # Shallow copy so the caller's feature dict is not modified.
        inputs_current = inputs.copy()
        inputs_current['company'] = self.fc(inputs_current['company'])
        h = self.conv1(graph, inputs_current)
        h = {k: v.relu() for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [88]:
class Model1(nn.Module):
    """Link-prediction model: ``RGCN1`` encoder followed by a dot-product edge scorer."""

    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        # Attribute names are part of the saved module's parameter names; keep them stable.
        self.sage = RGCN1(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()

    def forward(self, g, neg_g, x, etype):
        """Score the ``etype`` edges of ``g`` and of ``neg_g``.

        Node embeddings are computed once on ``g`` from the features ``x`` and
        reused for both graphs.

        Returns:
            Tuple ``(positive_scores, negative_scores)``.
        """
        node_embeddings = self.sage(g, x)
        positive_scores = self.pred(g, node_embeddings, etype)
        negative_scores = self.pred(neg_g, node_embeddings, etype)
        return positive_scores, negative_scores

In [81]:
def _company_features(graph, scalar_columns):
    """Stack the company 'address' feature with the given per-node columns.

    Each column in ``scalar_columns`` is reshaped to a single column and appended,
    in order, to the right of 'address'; the result is cast to float32.
    """
    company_data = graph.nodes['company'].data
    stacked = company_data['address']
    for column_name in scalar_columns:
        stacked = th.hstack((stacked, company_data[column_name].reshape(-1, 1)))
    return stacked.float()


columns = ['is_social', 'licenses', 'workers', 'category']

company_feats_train = _company_features(G_train, columns)
company_feats_test = _company_features(G_test, columns)
okved_feats_train = G_train.nodes['okved'].data['embeddings'].float()
okved_feats_test = G_test.nodes['okved'].data['embeddings'].float()

# Input feature dicts keyed by node type, one per graph.
node_features_train = {'company': company_feats_train, 'okved': okved_feats_train}
node_features_test = {'company': company_feats_test, 'okved': okved_feats_test}

In [91]:
# Train Model1 on G_train, then report ROC AUC on G_test.
k = 5        # negative edges sampled per positive edge
lr = 3e-4
target_etype = ('company', 'dop_okved', 'okved')  # relation being predicted

model1 = Model1(768, 50, 16, G_train.etypes)
opt = th.optim.Adam(model1.parameters(), lr=lr)

for epoch in range(100):
    # A new set of negatives is sampled for every epoch.
    negative_graph_train = construct_negative_graph(G_train, k, target_etype)
    pos_score_train, neg_score_train = model1(
        G_train, negative_graph_train, node_features_train, target_etype)
    loss = compute_loss(pos_score_train, neg_score_train)

    opt.zero_grad()
    loss.backward()
    opt.step()

    if not epoch % 10:
      print(f'{epoch=}, {loss.item()=}')

# NOTE(review): G_test still contains the held-out edges that are being scored, so
# they take part in message passing during evaluation — confirm this is intended.
with th.no_grad():
    negative_graph_test = construct_negative_graph(G_test, k, target_etype)
    pos_score_test, neg_score_test = model1(
        G_test, negative_graph_test, node_features_test, target_etype)
    auc_test = compute_auc(pos_score_test, neg_score_test)

    print("AUC_test", auc_test)

epoch=0, loss.item()=1.4759693145751953
epoch=10, loss.item()=0.4067320227622986
epoch=20, loss.item()=0.3682614266872406
epoch=30, loss.item()=0.35735753178596497
epoch=40, loss.item()=0.3535016179084778
epoch=50, loss.item()=0.34343522787094116
epoch=60, loss.item()=0.3396366834640503
epoch=70, loss.item()=0.3288930058479309
epoch=80, loss.item()=0.323442280292511
epoch=90, loss.item()=0.314399778842926
AUC_test 0.8526661435031457


In [111]:
# Persist the whole module object (not just its state_dict) to Google Drive.
# NOTE(review): loading a fully pickled module presumably requires the Model1 / RGCN1
# class definitions to be available at load time — confirm before reusing the file.
th.save(model1, '/content/drive/MyDrive/model1')

### Модель 2: GraphSAGE (SAGEConv, mean-агрегация)

In [115]:
class RGCN2(nn.Module):
    """Two-layer heterogeneous encoder with one mean-aggregating ``SAGEConv`` per relation.

    The 'company' input features are first passed through a linear layer that
    maps them to ``in_feats``; the per-relation outputs of each layer are summed
    per destination node type.

    Args:
        in_feats: input width expected by the first convolution layer.
        hid_feats: width of the hidden layer.
        out_feats: width of the output embeddings.
        rel_names: relation names; one convolution is created per name and layer.
        company_in_feats: width of the raw 'company' features before projection
            (defaults to 772, the value previously hard-coded).
    """

    def __init__(self, in_feats, hid_feats, out_feats, rel_names, company_in_feats=772):
        super().__init__()
        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.SAGEConv(in_feats, hid_feats, aggregator_type='mean')
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.SAGEConv(hid_feats, out_feats, aggregator_type='mean')
            for rel in rel_names}, aggregate='sum')
        # Project company features to in_feats rather than a fixed 768, so the
        # projection always matches what conv1 expects.
        self.fc = nn.Linear(in_features=company_in_feats, out_features=in_feats)

    def forward(self, graph, inputs):
        """Encode all nodes of ``graph``.

        Args:
            graph: heterograph used for message passing.
            inputs: dict mapping node type to its input feature tensor; must
                contain a 'company' entry.

        Returns:
            Dict of embeddings as returned by the second convolution layer.
        """
        # Shallow copy so the caller's feature dict is not modified.
        inputs_current = inputs.copy()
        inputs_current['company'] = self.fc(inputs_current['company'])
        h = self.conv1(graph, inputs_current)
        h = {k: v.relu() for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [116]:
class Model2(nn.Module):
    """Link-prediction model: ``RGCN2`` encoder followed by a dot-product edge scorer."""

    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        # Attribute names are part of the saved module's parameter names; keep them stable.
        self.sage = RGCN2(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()

    def forward(self, g, neg_g, x, etype):
        """Score the ``etype`` edges of ``g`` and of ``neg_g``.

        Node embeddings are computed once on ``g`` from the features ``x`` and
        reused for both graphs.

        Returns:
            Tuple ``(positive_scores, negative_scores)``.
        """
        node_embeddings = self.sage(g, x)
        positive_scores = self.pred(g, node_embeddings, etype)
        negative_scores = self.pred(neg_g, node_embeddings, etype)
        return positive_scores, negative_scores

In [117]:
# Train Model2 on G_train, then report ROC AUC on G_test.
k = 5        # negative edges sampled per positive edge
lr = 3e-4
target_etype = ('company', 'dop_okved', 'okved')  # relation being predicted

model2 = Model2(768, 50, 16, G_train.etypes)
opt = th.optim.Adam(model2.parameters(), lr=lr)

for epoch in range(100):
    # A new set of negatives is sampled for every epoch.
    negative_graph_train = construct_negative_graph(G_train, k, target_etype)
    pos_score_train, neg_score_train = model2(
        G_train, negative_graph_train, node_features_train, target_etype)
    loss = compute_loss(pos_score_train, neg_score_train)

    opt.zero_grad()
    loss.backward()
    opt.step()

    if not epoch % 10:
      print(f'{epoch=}, {loss.item()=}')

# NOTE(review): G_test still contains the held-out edges that are being scored, so
# they take part in message passing during evaluation — confirm this is intended.
with th.no_grad():
    negative_graph_test = construct_negative_graph(G_test, k, target_etype)
    pos_score_test, neg_score_test = model2(
        G_test, negative_graph_test, node_features_test, target_etype)
    auc_test = compute_auc(pos_score_test, neg_score_test)

    print("AUC_test", auc_test)

epoch=0, loss.item()=25.688762664794922
epoch=10, loss.item()=2.5810346603393555
epoch=20, loss.item()=1.2477132081985474
epoch=30, loss.item()=0.8050557971000671
epoch=40, loss.item()=0.6427587270736694
epoch=50, loss.item()=0.5568554401397705
epoch=60, loss.item()=0.5101040601730347
epoch=70, loss.item()=0.4730355143547058
epoch=80, loss.item()=0.44464942812919617
epoch=90, loss.item()=0.4236791133880615
AUC_test 0.8270885042898733


In [118]:
# Persist the whole module object (not just its state_dict) to Google Drive.
# NOTE(review): loading a fully pickled module presumably requires the Model2 / RGCN2
# class definitions to be available at load time — confirm before reusing the file.
th.save(model2, '/content/drive/MyDrive/model2')

### Модель 3: GAT (GATConv, 3 головы внимания)

In [130]:
class RGCN3(nn.Module):
    """Two-layer heterogeneous encoder with one multi-head ``GATConv`` per relation.

    The 'company' input features are first passed through a linear layer that
    maps them to ``in_feats``; the per-relation outputs of each layer are summed
    per destination node type.

    The attention heads are neither flattened nor averaged between the layers.
    NOTE(review): per the GATConv docs each layer appends a heads axis, so the
    output is presumably ``(N, num_heads, num_heads, out_feats)`` and callers
    must reduce the head axes themselves — confirm.

    Args:
        in_feats: input width expected by the first attention layer.
        hid_feats: per-head width of the hidden layer.
        out_feats: per-head width of the output embeddings.
        rel_names: relation names; one attention layer is created per name and layer.
        company_in_feats: width of the raw 'company' features before projection
            (defaults to 772, the value previously hard-coded).
        num_heads: attention heads used in both layers (defaults to 3, the value
            previously hard-coded).
    """

    def __init__(self, in_feats, hid_feats, out_feats, rel_names,
                 company_in_feats=772, num_heads=3):
        super().__init__()
        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GATConv(in_feats, hid_feats, num_heads=num_heads)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GATConv(hid_feats, out_feats, num_heads=num_heads)
            for rel in rel_names}, aggregate='sum')
        # Project company features to in_feats rather than a fixed 768, so the
        # projection always matches what conv1 expects.
        self.fc = nn.Linear(in_features=company_in_feats, out_features=in_feats)

    def forward(self, graph, inputs):
        """Encode all nodes of ``graph``.

        Args:
            graph: heterograph used for message passing.
            inputs: dict mapping node type to its input feature tensor; must
                contain a 'company' entry.

        Returns:
            Dict of embeddings as returned by the second attention layer.
        """
        # Shallow copy so the caller's feature dict is not modified.
        inputs_current = inputs.copy()
        inputs_current['company'] = self.fc(inputs_current['company'])
        h = self.conv1(graph, inputs_current)
        h = {k: v.relu() for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [131]:
class Model3(nn.Module):
    """Link-prediction model: ``RGCN3`` encoder followed by a dot-product edge scorer."""

    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        # Attribute names are part of the saved module's parameter names; keep them stable.
        self.sage = RGCN3(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()

    def forward(self, g, neg_g, x, etype):
        """Score the ``etype`` edges of ``g`` and of ``neg_g``.

        Node embeddings are computed once on ``g`` from the features ``x`` and
        reused for both graphs.

        Returns:
            Tuple ``(positive_scores, negative_scores)``.
        """
        node_embeddings = self.sage(g, x)
        positive_scores = self.pred(g, node_embeddings, etype)
        negative_scores = self.pred(neg_g, node_embeddings, etype)
        return positive_scores, negative_scores

In [132]:
# Train Model3 on G_train, then report ROC AUC on G_test.
k = 5        # negative edges sampled per positive edge
lr = 3e-4
target_etype = ('company', 'dop_okved', 'okved')  # relation being predicted

model3 = Model3(768, 50, 16, G_train.etypes)
opt = th.optim.Adam(model3.parameters(), lr=lr)

for epoch in range(100):
    # A new set of negatives is sampled for every epoch.
    negative_graph_train = construct_negative_graph(G_train, k, target_etype)
    pos_score_train, neg_score_train = model3(
        G_train, negative_graph_train, node_features_train, target_etype)
    # Scores carry extra axes at dims 1 and 2 (presumably one per attention
    # layer's heads — confirm); average them away before the loss.
    pos_mean_train = pos_score_train.mean(dim=[1, 2])
    neg_mean_train = neg_score_train.mean(dim=[1, 2])
    loss = compute_loss(pos_mean_train, neg_mean_train)

    opt.zero_grad()
    loss.backward()
    opt.step()

    if not epoch % 10:
      print(f'{epoch=}, {loss.item()=}')

# NOTE(review): G_test still contains the held-out edges that are being scored, so
# they take part in message passing during evaluation — confirm this is intended.
with th.no_grad():
    negative_graph_test = construct_negative_graph(G_test, k, target_etype)
    pos_score_test, neg_score_test = model3(
        G_test, negative_graph_test, node_features_test, target_etype)
    pos_mean_test = pos_score_test.mean(dim=[1, 2])
    neg_mean_test = neg_score_test.mean(dim=[1, 2])

    print("AUC_test", compute_auc(pos_mean_test, neg_mean_test))

epoch=0, loss.item()=17.956899642944336
epoch=10, loss.item()=1.142255425453186
epoch=20, loss.item()=0.7408223152160645
epoch=30, loss.item()=0.5653435587882996
epoch=40, loss.item()=0.41331425309181213
epoch=50, loss.item()=0.35496124625205994
epoch=60, loss.item()=0.3170761466026306
epoch=70, loss.item()=0.2923521399497986
epoch=80, loss.item()=0.2741198241710663
epoch=90, loss.item()=0.26080039143562317
AUC_test 0.8579836287745148


In [133]:
# Persist the whole module object (not just its state_dict) to Google Drive.
# NOTE(review): loading a fully pickled module presumably requires the Model3 / RGCN3
# class definitions to be available at load time — confirm before reusing the file.
th.save(model3, '/content/drive/MyDrive/model3')