<a href="https://colab.research.google.com/github/Zheng-Ao/Colab-Notebooks/blob/main/P0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 全局设置

In [7]:
'''常用库导入'''
import random
random.seed(85)
from tqdm import tqdm
import numpy as np
import pandas as pd
raw_data_path = "drive/MyDrive/P0/B60-2010-9.csv"
# 测试谷歌云端硬盘是否成功加载:
# pd.read_csv(raw_data_path)

from sklearn import metrics

import torch
print(torch.__version__)                                                        # 用于确定PyG的安装
from torch import nn
from torch.utils.data import Dataset, Subset, ConcatDataset, DataLoader
from torch.nn import functional as F

# 检查GPU:
!nvidia-smi
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

1.12.0+cu113
Wed Aug 10 13:38:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    11W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------------------------------------------------

In [2]:
%%capture
'''特殊库安装(魔法函数要放在Cell的最开始)'''
# 我不想输出安装信息，虽然%%capture不是用来干这个的，但它能达到我想要的效果
!pip install minetorch
!pip install datasets
!pip install transformers
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
!pip install torch-geometric

# 数据探索，预处理，特征工程

## 原始数据->分析数据

In [8]:
raw_data = pd.read_csv(raw_data_path, index_col="Unnamed: 0")                   # 去掉自带的Unnamed: 0的index列

# 将3个字符串变成数值
for str in ["inventors","assignees","IPCs"]:
    for i in range(len(raw_data[str])):
        if str == "assignees":
            if raw_data[str].values[i] == "[{'assignee_sequence': None, 'assignee_key_id': None}]":
                raw_data[str].values[i] = 0
                continue
        raw_data[str].values[i] = len(eval(raw_data[str].values[i]))

# 上述操作虽然将值改了，但dtype仍然是object
for str in ["inventors","assignees","IPCs"]:
    raw_data[str] = raw_data[str].astype(int)

# raw_data.info()
# raw_data.describe()

## 数据清洗

In [9]:
'''本次清洗原则：低被引，皆可删
注意，清洗之后要重排id列，这是建图的必须！
默认的索引也要重新改！否则无法正常的进行索引操作！比如原来index=2800的被删掉了，那么df.at[2800,"someattr"]就会报错！！！
'''

# raw_data.info()
# 发现"patent_abstract"一列有缺失值，定位看看:
raw_data[raw_data.isnull().values == True]
# 如果无关紧要的负样本，直接删掉:
raw_data.dropna(inplace = True)
raw_data.reset_index(drop=True, inplace=True)
# raw_data.info()
# raw_data.describe()
# 发现后向引用数的min为0，存在没有后向引用的专利吗？定位看看:
# raw_data[raw_data["patent_num_us_patent_citations"].values == 0]
# 这些专利并非后向引用为0，而是引用了外国的专利，而非美国的专利。
# raw_data[raw_data["patent_num_us_patent_citations"].values == 1]
# 这些专利还是有一定被引数的，因此保留下来
num_patents = len(raw_data)
raw_data.insert(0, 'index', range(num_patents), allow_duplicates=False)         # 提供id列，方便建图

print(f"最终有{num_patents}个专利")

最终有10000个专利


In [10]:
# raw_data

## 看一看正负样本分别长什么样

In [11]:
raw_data.nlargest(10,"patent_num_cited_by_us_patents")
# pos_some_data = raw_data[raw_data["patent_num_cited_by_us_patents"]>5]
# pos_some_data.info()
# pos_some_data.describe()

Unnamed: 0,index,patent_number,patent_date,patent_num_cited_by_us_patents,patent_title,patent_abstract,patent_num_claims,patent_num_us_patent_citations,inventors,assignees,IPCs,cited_patents
77,77,10086782,2018-10-02,84,Autonomous vehicle damage and salvage assessment,"Methods and systems for assessing, detecting, ...",20,157,4,1,5,"[{'cited_patent_number': '4218763', 'cited_pat..."
3659,3659,10157423,2018-12-18,61,Autonomous vehicle operating style and mode mo...,"Methods and systems for monitoring use, determ...",20,229,16,1,5,"[{'cited_patent_number': '4218763', 'cited_pat..."
4281,4281,10168703,2019-01-01,61,Autonomous vehicle component malfunction impac...,"Methods and systems for assessing, detecting, ...",20,158,4,1,5,"[{'cited_patent_number': '4218763', 'cited_pat..."
1042,1042,10106083,2018-10-23,60,Vehicular warnings based upon pedestrian or cy...,Systems and methods are described for providin...,20,194,10,1,1,"[{'cited_patent_number': '4218763', 'cited_pat..."
4218,4218,10166994,2019-01-01,57,Autonomous vehicle operating status assessment,"Methods and systems for monitoring use, determ...",17,223,16,1,3,"[{'cited_patent_number': '4218763', 'cited_pat..."
2401,2401,10131347,2018-11-20,40,Parking assistance apparatus and vehicle havin...,"A parking assistance apparatus for a vehicle, ...",20,6,2,1,15,"[{'cited_patent_number': '8150593', 'cited_pat..."
3711,3711,10160278,2018-12-25,39,System and method for vehicle stabilization,A vehicle stabilization system including a fra...,20,62,3,1,6,"[{'cited_patent_number': '3766935', 'cited_pat..."
2028,2028,10124754,2018-11-13,38,Wireless charging and powering of electronic s...,Configurations and methods of wireless power t...,17,206,2,1,4,"[{'cited_patent_number': '787412', 'cited_pate..."
5708,5708,10196258,2019-02-05,36,Method and system for mobile distribution station,A system includes one or more mobile fuel dist...,20,59,2,1,12,"[{'cited_patent_number': '2340070', 'cited_pat..."
8042,8042,10246055,2019-04-02,35,Method for authorizing a driver to activate at...,A method for authorizing a driver to activate ...,20,3,1,1,5,"[{'cited_patent_number': '8451105', 'cited_pat..."


**可以看到基本上没什么规律，除非能够很好地理解文本，否则很难自动地将它们挑出来。**

## 分析数据->输入特征

### 数值特征归一化

In [12]:
for str in ["patent_num_claims","patent_num_us_patent_citations","inventors","assignees","IPCs"]:
    raw_data[str] = (raw_data[str] - raw_data[str].mean())/raw_data[str].std()

In [13]:
# raw_data

### 文本特征提取

In [14]:
'''
全部是uncased版本。
用最简单的DistillBert进行实验。
SCIBERT竟然是推理最慢的那一个，并且效果甚至比Distill还弱一些。
'''
# from transformers import DistilBertTokenizer, DistilBertModel
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# nlp_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

# from transformers import AutoTokenizer,AutoModel
# checkpoint = "anferico/bert-for-patents"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# nlp_model = AutoModel.from_pretrained(checkpoint).to(device)

from transformers import AutoTokenizer,AutoModel
checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
nlp_model = AutoModel.from_pretrained(checkpoint).to(device)

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
for i in tqdm(range(num_patents)):
    ttl = raw_data["patent_title"].values[i]
    abst = raw_data["patent_abstract"].values[i]
    txt_input = ttl + " " + abst
    # txt_input = ttl
    inputs = tokenizer(txt_input, return_tensors="pt").to(device)
    if len(inputs['input_ids'][0]) > 512:
        # 取前128和后382个token作为输入，详见Bert输入
        head_input = inputs['input_ids'][0, :129]
        tail_input = inputs['input_ids'][0, -383:]
        head_mask = inputs['attention_mask'][0, :129]
        tail_mask = inputs['attention_mask'][0, -383:]
        input_ids = torch.cat([head_input, tail_input]).unsqueeze(0)
        attention_mask = torch.cat([head_mask, tail_mask]).unsqueeze(0)
        outputs = nlp_model(input_ids = input_ids, attention_mask = attention_mask)
    else:
        # inputs是一个字典，因此要用**kwargs的形式
        outputs = nlp_model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    # 详见Bert模型介绍
    cls_vec = last_hidden_states[:,0,:].clone().detach()
    if i == 0:
        txt_vecs = cls_vec
    else:
        txt_vecs = torch.cat([txt_vecs, cls_vec],dim=0)

print(txt_vecs.shape)
# should be [num_samples, embedding_dim]

100%|██████████| 10000/10000 [02:22<00:00, 70.21it/s]

torch.Size([10000, 768])





In [16]:
Mat = txt_vecs.cpu().numpy()

from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

# txt_pca = PCA(n_components=0.9, svd_solver = 'full')
txt_pca = PCA(n_components = 5)

In [17]:
# 查看PCA的效果:
txt_pca.fit(Mat)
var = txt_pca.explained_variance_ratio_
print(f"PCA之后的维度：{len(var)}，PCA后的方差保留：{var.sum():.4f}")

PCA之后的维度：5，PCA后的方差保留：0.3429


In [18]:
# PCA
Txt_Embedding_c = torch.tensor(txt_pca.fit_transform(Mat), dtype = torch.float32)
print(Txt_Embedding_c.shape)

# without PCA
Txt_Embedding = txt_vecs.clone()
print(Txt_Embedding.shape, Txt_Embedding.dtype)

torch.Size([10000, 5])
torch.Size([10000, 768]) torch.float32


### 图特征提取

In [19]:
# edge_index（邻接列表）, edge_attr（边特征）
from datetime import datetime

adj_ls = []
edges = []

'''这里有两个for循环，O(N*L)，非常贵！暂时没有其它手法，因此暂时不要处理10K级别以上的数据。'''
for i in tqdm(range(num_patents)):
    id = raw_data.values[i][0]
    b_cits = eval(raw_data["cited_patents"].values[i])
    for b_cit in b_cits:
        # 是否在专利数据库中（b_cit["cited_patent_date"] != None）
        if b_cit["cited_patent_date"]:
            # 是否在本数据集中
            if b_cit["cited_patent_number"] in raw_data["patent_number"].values:
                cit_id = raw_data[raw_data["patent_number"].values == b_cit["cited_patent_number"]].values[0][0]
                adj_ls.append([id, cit_id])
                date = raw_data["patent_date"].values[id]
                cit_date = raw_data["patent_date"].values[cit_id]
                date1 = datetime.strptime(date, "%Y-%m-%d")
                date2 = datetime.strptime(cit_date, "%Y-%m-%d")
                dist = date1-date2
                '''边的权重设置为365/间隔天数'''
                dist = 365/dist.days
                edges.append(dist)

100%|██████████| 10000/10000 [00:52<00:00, 189.60it/s]


In [20]:
# adj_ls, edges
# '''edges需要做归一化吗？'''

# UTILS

## Dataset

In [21]:
from datetime import datetime

# 0:neg, 1:pos
def LabelCorePa(ref, isd):
    now = datetime.strptime("2022-01-01", "%Y-%m-%d").year
    years = now - datetime.strptime(isd, "%Y-%m-%d").year
    score = ref/years       
    label = int((score>0.5))
    return label

class PatDataset(Dataset):
    def __init__(self, raw_data, txt_vecs = None, transform = None, target_transform = LabelCorePa):
        super().__init__()
        self.raw_data = raw_data
        self.txt_vecs = txt_vecs
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        # Y          
        ref = self.raw_data["patent_num_cited_by_us_patents"].values[idx]
        isd = self.raw_data.at[idx, "patent_date"]
        if self.target_transform:
            label = self.target_transform(ref, isd)
            label = torch.tensor(label,dtype=torch.long)

        # X
        # INDEXs
        num_claims = self.raw_data.at[idx, "patent_num_claims"]
        num_b_cits = self.raw_data.at[idx, "patent_num_us_patent_citations"]
        inventors = self.raw_data.at[idx, "inventors"]
        assignees = self.raw_data.at[idx, "assignees"]
        IPCs = self.raw_data.at[idx, "IPCs"]
        indexs = torch.tensor([num_claims, num_b_cits, inventors, assignees, IPCs], dtype=torch.float32).to(device)
        patent = indexs.clone()
        # TXT
        if self.txt_vecs != None:
            txt_vec = self.txt_vecs[idx].clone()
            txt_vec = txt_vec.to(device)
            patent = torch.cat([indexs, txt_vec])
        
        return patent, label

## Modules

In [22]:
def TowerBlock(input_size, output_size):
    block = nn.Sequential(
        nn.Linear(input_size, 128),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(128, 32),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(32, 8),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(8, output_size)
    )
    return block

class MLPNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            # nn.BatchNorm1d(input_size),
            # 不加BatchNorm效果更好，事实证明：不要去玩自己不会的东西
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            TowerBlock(hidden_size, output_size)
            # nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

from torch_geometric.nn import GCNConv

class GNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.conv1 = GCNConv(input_size, hidden_size)
        self.conv2 = GCNConv(hidden_size, output_size)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr

        x = self.conv1(x, edge_index,edge_weight=edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        logits = self.conv2(x, edge_index,edge_weight=edge_weight)                   
        return logits        


# class PatNet(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super().__init__()
    

## Train-Test Loop

In [23]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # batch: 第几个batch；X: 包含batch_size个feature vec.
        X = X.to(device)
        y = y.to(device)
        output = model(X).to(device)
        loss = loss_fn(output, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
            X = X.to(device)
            y = y.to(device)
            output = model(X)
            # torch crossentropy loss是output在前，target在后
            test_loss += loss_fn(output, y).item()
            pred = output.argmax(1).cpu()
            y = y.cpu()
            if batch == 0:
                Pred = pred
                Y = y
            else:
                Pred = torch.cat((Pred, pred), dim = 0)
                Y = torch.cat((Y, y), dim=0)

    avg_loss = test_loss/num_batches
    print(f"avg_loss:{avg_loss:.4f}")
    Performance(Y, Pred)

def Performance(y_true, y_pred):
    # y_true在前，y_pred在后
    C_Mat = metrics.confusion_matrix(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true,y_pred)
    f1 = metrics.f1_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    print(C_Mat)
    print(f"acc:{accuracy:.4f}, f1:{f1:.4f}, recall:{recall:.4f}, prec:{precision:.4f}")
    

# CONFIG

In [24]:
# HyperParams----Config.py
hidden_size = 256
output_size = 2

learning_rate = 1e-3
weight_decay = 5e-4
batch_size = 64
epochs = 50
'''epochs取小不取大，就目前来说50完全够了。'''
num_epochs_to_print = epochs/10                                                 # 每隔10次输出一次Metrics

num_train = 8000
num_test = num_patents - num_train

# Main

In [25]:
dataset = PatDataset(raw_data=raw_data, txt_vecs = Txt_Embedding)
training_indices = [i for i in range(num_train)]
test_indices = [i for i in range(num_train,num_train+num_test)]
training_data_all = Subset(dataset, training_indices)
test_data = Subset(dataset, test_indices)

# Make pos:neg in training set 1:1
pos_indices = []
neg_indices = []
for i in tqdm(range(num_train)):
    if training_data_all[i][1] == 1:
        pos_indices.append(i)
    else:
        neg_indices.append(i)
num_pos = len(pos_indices)
num_neg = len(neg_indices)
print(f"NEG:POS = {num_neg}:{num_pos} = {num_neg/num_pos:.2f}")
neg_indices_sample = np.random.choice(neg_indices, num_pos, replace = False)	 
training_data_pos = Subset(training_data_all, pos_indices)
training_data_neg = Subset(training_data_all, neg_indices_sample)
training_data = ConcatDataset([training_data_pos, training_data_neg])
print("Training sample number:",len(training_data))

train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

100%|██████████| 8000/8000 [00:03<00:00, 2426.97it/s]

NEG:POS = 6888:1112 = 6.19
Training sample number: 2224
Feature batch shape: torch.Size([64, 773])
Labels batch shape: torch.Size([64])





## MLP

In [26]:
# PyTorch的逻辑是先初始化（喂超参），再进行函数计算（喂输入）

input_size = len(dataset[0][0])
MLP = MLPNet(input_size, hidden_size, output_size).to(device)
loss_fn = nn.CrossEntropyLoss()
"""nn.CrossEntropyLoss()会自动对loss进行batch内部的平均，即一个batch的总loss/batch_size。
即输出的不是总loss，而是平均loss！"""
optimizer = torch.optim.Adam(MLP.parameters(), lr=learning_rate, weight_decay=weight_decay)

# for t in range(epochs):
#     train_loop(train_dataloader, MLP, loss_fn, optimizer)
#     if t==0 or (t+1)%num_epochs_to_print == 0:
#         print(f"Epoch {t+1}\n-------------------------------")
#         test_loop(train_dataloader, MLP, loss_fn)                               
#         test_loop(test_dataloader, MLP, loss_fn)                                
# print("Done!")



In [27]:
from minetorch.miner import Miner

trainer = Miner(
    in_notebook=True,alchemistic_directory='./runs',train_dataloader=train_dataloader,val_dataloader=test_dataloader,
    model=MLP,
    loss_func=loss_fn,optimizer=optimizer,
    max_epochs=epochs,
    resume=False
    # eval_stride=num_epochs_to_print,drawer='tensorboard'
    )

In [28]:
trainer.train()

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

## PyG GNN

In [32]:
# data.x, data.y
for i in range(num_patents):
    if i == 0:
        x = dataset[i][0].unsqueeze(0)
        y = dataset[i][1].unsqueeze(0)
    else:
        x = torch.cat([x, dataset[i][0].unsqueeze(0)])
        y = torch.cat([y, dataset[i][1].unsqueeze(0)])

print(x.shape, x.dtype, y.shape, y.dtype)

torch.Size([10000, 773]) torch.float32 torch.Size([10000]) torch.int64


In [33]:
print(len(pos_indices),len(neg_indices_sample))
print(neg_indices_sample)

1112 1112
[ 253 4527  185 ... 7614 1622  415]


In [34]:
from torch_geometric.data import Data

# num_nodes, num_node_features
x = torch.tensor(x, dtype=torch.float32)

# num_nodes, 1
y = torch.tensor(y, dtype=torch.long)

# 2, num_edges
edge_index = torch.tensor(adj_ls, dtype=torch.long)

# num_edges, num_edge_features
edge_attr = torch.tensor(edges, dtype=torch.float32)

# train-test split
train_ls = [False]*num_train + [False]*num_test
train_mask = torch.tensor(train_ls, dtype=bool)
train_mask[pos_indices] = True
train_mask[neg_indices_sample] = True
test_ls = [False]*num_train + [True]*num_test
test_mask = torch.tensor(test_ls, dtype=bool)


data = Data(x=x, y=y, edge_index = edge_index.t().contiguous(), edge_attr=edge_attr, train_mask = train_mask, test_mask=test_mask)
data = data.to(device)

  after removing the cwd from sys.path.
  import sys


In [35]:
print(f'Number of nodes: {data.num_nodes}') # 节点数量
print(f'Number of edges: {data.num_edges}') # 边数量
print(f'Number of node features: {data.num_node_features}') # 节点属性的维度
print(f'Number of node features: {data.num_features}') # 同样是节点属性的维度
print(f'Number of edge features: {data.num_edge_features}') # 边属性的维度
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}') # 平均节点度
print(f'if edge indices are ordered and do not contain duplicate entries.: {data.is_coalesced()}') # 是否边是有序的同时不含有重复的边
print(f'Number of training nodes: {data.train_mask.sum()}') # 用作训练集的节点
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}') # 用作训练集的节点的数量
print(f'Contains isolated nodes: {data.has_isolated_nodes()}') # 此图是否包含孤立的节点
print(f'Contains self-loops: {data.has_self_loops()}')  # 此图是否包含自环的边
print(f'Is undirected: {data.is_undirected()}')  # 此图是否是无向图

Number of nodes: 10000
Number of edges: 53
Number of node features: 773
Number of node features: 773
Number of edge features: 1
Average node degree: 0.01
if edge indices are ordered and do not contain duplicate entries.: True
Number of training nodes: 2224
Training node label rate: 0.22
Contains isolated nodes: True
Contains self-loops: False
Is undirected: False


In [36]:
input_size = len(dataset[0][0])

Node2vec = GNN(input_size, hidden_size, output_size).to(device)
optimizer = torch.optim.Adam(Node2vec.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss()

Node2vec.train()
ls=[]
for epoch in range(epochs):
    optimizer.zero_grad()
    out = Node2vec(data)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    test_loss = loss_fn(out[data.test_mask], data.y[data.test_mask]).item()
    ls.append(test_loss)
    loss.backward()
    optimizer.step()

In [37]:
Node2vec.eval()
pred = Node2vec(data).argmax(dim=1)
out = Node2vec(data)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
# print(out[data.test_mask])
loss = loss_fn(out[data.test_mask], data.y[data.test_mask]).item()
acc = int(correct) / int(data.test_mask.sum())
print(f'loss:{loss:.4f}')
Pred = pred[data.test_mask].cpu()
Y = data.y[data.test_mask].cpu()
C_Mat = metrics.confusion_matrix(Y, Pred)
accuracy = metrics.accuracy_score(Y,Pred)
f1 = metrics.f1_score(Y, Pred)
recall = metrics.recall_score(Y, Pred)
precision = metrics.precision_score(Y, Pred)
print(C_Mat)
print(f"acc:{accuracy:.4f}, f1:{f1:.4f}, recall:{recall:.4f}, prec:{precision:.4f}")


loss:0.6534
[[1040  670]
 [ 112  178]]
acc:0.6090, f1:0.3128, recall:0.6138, prec:0.2099


# Save&Load Model

## 模版

In [None]:
# 保存模型权重至当前文件夹
torch.save(model.state_dict(), 'model_weights.pth')                             

In [None]:
model = SimpleNet(input_size, hidden_size, output_size)                         # 需要是同一个模型
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()

'''
be sure to call model.eval() method before inferencing to set the dropout and batch normalization layers to evaluation mode. 
Failing to do this will yield inconsistent inference results.
'''

In [None]:
# 直接保存/加载整个模型
torch.save(model, 'model.pth')

In [None]:
model = torch.load('model.pth')

## 此次特例

In [29]:
checkpoint = torch.load('./runs/geass/models/best.pth.tar')
print(checkpoint.keys())								
state_dict = checkpoint['state_dict']

dict_keys(['state_dict', 'optimizer', 'epoch', 'train_iteration', 'val_iteration', 'lowest_train_loss', 'lowest_val_loss', 'drawer_state', 'statable'])


In [30]:
# 加载模型
model = MLPNet(input_size, hidden_size, output_size)                         
model.load_state_dict(state_dict)

<All keys matched successfully>

In [31]:

# 看一下模型效果
model = model.to(device)
test_loop(train_dataloader, model, loss_fn)                               
test_loop(test_dataloader, model, loss_fn) 

avg_loss:0.6237
[[978 134]
 [649 463]]
acc:0.6479, f1:0.5418, recall:0.4164, prec:0.7755
avg_loss:0.5028
[[1441  269]
 [ 177  113]]
acc:0.7770, f1:0.3363, recall:0.3897, prec:0.2958


# Discoveries
1. SimpleNet的训练情况似乎说明**模型并不能从数据中学到什么东西（特征可能没有提供什么信息）**。
线性模型和MLP跑出来几乎没差别，说明问题不在模型上面，基本可以断定是数据（输入特征）的问题。