# 基于引用的专利未来影响力研究

## Simple GNN

In [None]:
# import sys
# sys.path.append('..')

import pandas as pd
from datetime import datetime

import torch
from torch_geometric.data import Data

# Load the patent table from CSV. Elsewhere in this file the columns 'PN',
# 'REF', 'b_cits', 'ISD', 'CPC' and 'IPC' are read by name, positional
# column 0 is used as a node id, and positional column 1 is used as the key
# that maps to that id.
DATA = pd.read_csv("../Data/GT36.csv")

In [None]:
# Data.head()
# Data[Data.isnull().values == True]
# DATA = DATA.sort_values(by=['ISD'])

# eval(DATA['b_cits'][0])

# Lookup table from the value in positional column 1 of each row to the value
# in positional column 0 of that row. If a key occurs more than once, the
# last row wins.
pn_to_id = {key: node_id for node_id, key, *_ in DATA.values}



In [None]:
# GNN representation of the citation graph.
# PGPUB documents (e.g. 2014/0240120) are dropped: their numbers contain '/'.

# Reference year used to compute each patent's age. The month directive is
# %m; %M means "minute" and only happened to leave the parsed year intact.
now = datetime.strptime("2022-06-21", "%Y-%m-%d").year

# Kept for backward compatibility with code that may still read it; the
# membership test below uses the pn_to_id dict instead of this list so each
# lookup is O(1) rather than a linear scan.
pn_ls = [item[1] for item in DATA.values]

n_nodes = len(DATA)
node_dim = 2        # CPC, IPC


Nodes_x = []        # per-node features: [CPC, IPC]
Nodes_y = []        # per-node binary label
Edges = []          # connectivity weights, one per entry of Adj_Ls
Adj_Ls = []         # connectivity: [citing node id, cited node id]

# NOTE(review): node features/labels are appended in row order, while edge
# endpoints come from pn_to_id (positional column 0 of the CSV). This assumes
# column 0 is the 0-based row position -- confirm against the data file.
for pn, ref, raw_cits, isd, cpc, ipc in zip(
        DATA['PN'], DATA['REF'], DATA['b_cits'],
        DATA['ISD'], DATA['CPC'], DATA['IPC']):
    # NOTE(security): eval() executes whatever the CSV cell contains. If the
    # file can come from an untrusted source, switch to ast.literal_eval.
    b_cits = eval(raw_cits)

    # x
    Nodes_x.append([cpc, ipc])

    # y: positive (1) when REF per year since issue exceeds 1, else negative
    # (0). The age is clamped to at least one year so a patent issued in the
    # reference year does not divide by zero.
    years = max(now - datetime.strptime(isd, "%Y-%m-%d").year, 1)
    score = ref / years
    Nodes_y.append(1 if score > 1 else 0)

    # connectivity: nothing to add when the citation list is empty or holds
    # only a placeholder entry.
    if not b_cits or b_cits[0] in ('NULL', ''):
        continue

    for cp in b_cits:
        # Numbers containing '/' are PGPUB documents and are skipped.
        if cp.find('/') != -1:
            continue

        # Note: cp must be a known node, i.e. one of the rows of DATA.
        try:
            cited_pn = int(cp)
            edge = [pn_to_id[pn], pn_to_id[cited_pn]]
        except (ValueError, KeyError):
            # ValueError: cp is not a plain integer number.
            # KeyError: one endpoint is not a node of this graph.
            continue

        Adj_Ls.append(edge)
        # Edge weight: difference between the two patent numbers, scaled
        # by 1e7.
        Edges.append((pn - cited_pn) / 1e7)



In [None]:
# print(len(Adj_Ls), len(Nodes_x))
# 687 564

# Adj_Ls

### GNN

In [42]:
# Node feature matrix: one row per node, one column per feature.
x = torch.tensor(Nodes_x, dtype=torch.float)

# Node labels: one class index per node.
y = torch.tensor(Nodes_y, dtype=torch.long)

# Edge list with one (source, target) row per edge. The reshape keeps it 2-D
# even when no edge was collected, so the transpose below always yields a
# (2, num_edges) tensor.
edge_index = torch.tensor(Adj_Ls, dtype=torch.long).reshape(-1, 2)

# One scalar weight per edge (1-D tensor of length num_edges).
edge_attr = torch.tensor(Edges, dtype=torch.float)

# train-test split: the first 300 nodes (or all of them, if there are fewer)
# are used for training and the remaining nodes for testing. The mask length
# is derived from the actual node count so it always matches x and y.
n_total = len(Nodes_x)
n_train = min(300, n_total)
train_ls = [True] * n_train + [False] * (n_total - n_train)
train_mask = torch.tensor(train_ls, dtype=torch.bool)
test_ls = [not is_train for is_train in train_ls]
test_mask = torch.tensor(test_ls, dtype=torch.bool)


data = Data(
    x=x,
    y=y,
    edge_index=edge_index.t().contiguous(),
    edge_attr=edge_attr,
    train_mask=train_mask,
    test_mask=test_mask,
)

In [43]:
# Summary statistics of the graph, collected first and printed one per line.
graph_report = [
    f'Number of nodes: {data.num_nodes}',                    # node count
    f'Number of edges: {data.num_edges}',                    # edge count
    f'Number of node features: {data.num_node_features}',    # node feature dimension
    f'Number of node features: {data.num_features}',         # same node feature dimension
    f'Number of edge features: {data.num_edge_features}',    # edge feature dimension
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',  # edges per node
    # f'if edge indices are ordered and do not contain duplicate entries.: {data.is_coalesced()}'
    f'Number of training nodes: {data.train_mask.sum()}',    # nodes selected by train_mask
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}',  # fraction of nodes used for training
    f'Contains isolated nodes: {data.has_isolated_nodes()}', # whether any node has no edge
    f'Contains self-loops: {data.has_self_loops()}',         # whether any edge joins a node to itself
    f'Is undirected: {data.is_undirected()}',                # whether the graph is undirected
]
print('\n'.join(graph_report))

Number of nodes: 564
Number of edges: 687
Number of node features: 2
Number of node features: 2
Number of edge features: 1
Average node degree: 1.22
Number of training nodes: 300
Training node label rate: 0.53
Contains isolated nodes: True
Contains self-loops: False
Is undirected: False


In [None]:
# data.is_coalesced()
# Open question: why does the call above raise an assertion error?
data

In [44]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The layer sizes and dropout rate are constructor parameters; the defaults
    reproduce the original fixed configuration (2 input features, 16 hidden
    units, 2 classes, dropout probability 0.5).

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Width of the hidden representation.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the first layer while
            the module is in training mode.
    """

    def __init__(self, in_channels: int = 2, hidden_channels: int = 16,
                 num_classes: int = 2, dropout: float = 0.5):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        self.dropout = dropout

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Only ``data.x`` and ``data.edge_index`` are read; ``data.edge_attr``
        is not used by this model.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is active only while self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [45]:
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

# Full-graph training for 200 epochs: every step runs the model on the whole
# graph and computes the loss on the nodes selected by train_mask only.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(
        input=out[data.train_mask],
        target=data.y[data.train_mask],
    )
    loss.backward()
    optimizer.step()

In [46]:
# Evaluate on the nodes selected by test_mask. eval() switches dropout off;
# no_grad() avoids building an autograd graph that inference does not need.
model.eval()
with torch.no_grad():
    pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
n_test = int(data.test_mask.sum())
# An empty test set has no defined accuracy; report NaN instead of raising
# ZeroDivisionError.
acc = int(correct) / n_test if n_test else float('nan')
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.4432
