<a href="https://colab.research.google.com/github/Zheng-Ao/Colab-Notebooks/blob/main/P0_v05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 全局设置

In [None]:
'''常用库导入'''
import random
random.seed(85)
from tqdm import tqdm
import numpy as np
import pandas as pd
raw_data_path = "drive/MyDrive/P0/T10K.csv"
# 测试谷歌云端硬盘是否成功加载:
# pd.read_csv(raw_data_path)

from sklearn import metrics

import torch
print(torch.__version__)                                                        # 用于确定PyG的安装
from torch import nn
from torch.utils.data import Dataset, Subset, ConcatDataset, DataLoader
from torch.nn import functional as F

# 检查GPU:
# !nvidia-smi
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

1.12.0+cu113
cuda:0


In [None]:
%%capture
'''特殊库安装(魔法函数要放在Cell的最开始)'''
# 我不想输出安装信息，虽然%%capture不是用来干这个的，但它能达到我想要的效果
!pip install transformers
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
!pip install torch-geometric

# 数据探索，预处理，特征工程

## 原始数据转换为分析数据

In [None]:
raw_data = pd.read_csv(raw_data_path, index_col="Unnamed: 0")

# 将3个字符串变成数值
for str in ["inventors","assignees","IPCs"]:
    for i in range(len(raw_data[str])):
        if str == "assignees":
            if raw_data[str].values[i] == "[{'assignee_sequence': None, 'assignee_key_id': None}]":
                raw_data[str].values[i] = 0
                continue
        raw_data[str].values[i] = len(eval(raw_data[str].values[i]))

for str in ["inventors","assignees","IPCs"]:
    raw_data[str] = raw_data[str].astype(int)

# raw_data.info()
# raw_data.describe()

## 数据清洗

In [None]:
'''本次清洗原则：低被引，皆可删
注意，清洗之后要重排id列，这是建图的必须！
默认的索引也要重新改！否则无法正常的进行索引操作！比如原来index=2800的被删掉了，那么df.at[2800,"someattr"]就会报错！！！
'''

# raw_data.info()
# 发现"patent_abstract"一列有缺失值，定位看看:
raw_data[raw_data.isnull().values == True]
# 发现是无关紧要的负样本，直接将它们删掉(10000->9993):
raw_data.dropna(inplace = True)
raw_data.reset_index(drop=True, inplace=True)
# raw_data.info()
# raw_data.describe()
# 发现后向引用数的min为0，存在没有后向引用的专利吗？定位看看:
# raw_data[raw_data["patent_num_us_patent_citations"].values == 0]
# 这些专利并非后向引用为0，而是引用了外国的专利，而非美国的专利。
# raw_data[raw_data["patent_num_us_patent_citations"].values == 1]
# 这些专利还是有一定被引数的，因此保留下来
num_patents = len(raw_data)
raw_data.insert(0, 'index', range(num_patents), allow_duplicates=False)

print(f"最终有{num_patents}个专利")

最终有9993个专利


In [None]:
raw_data

Unnamed: 0,index,patent_number,patent_date,patent_num_cited_by_us_patents,patent_title,patent_abstract,patent_num_claims,patent_num_us_patent_citations,inventors,assignees,IPCs,cited_patents
0,0,3930276,1976-01-06,4,Wheel spinning and vehicle conveying apparatus...,An automobile conveyor for use in conjunction ...,12,2,1,1,1,"[{'cited_patent_number': '3037223', 'cited_pat..."
1,1,3930279,1976-01-06,5,Rubber windshield wiper blades having increase...,A rubber windshield wiper blade is clamped to ...,2,3,1,0,1,"[{'cited_patent_number': '2140453', 'cited_pat..."
2,2,3930323,1976-01-06,17,Chain tensioning mechanism for scraper elevato...,A tensioning mechanism for the chain of a scra...,2,4,2,1,2,"[{'cited_patent_number': '372157', 'cited_pate..."
3,3,3930526,1976-01-06,14,Pneumatic tire and wheel assemblies,A pneumatic tire and wheel assembly comprises ...,21,7,1,1,1,"[{'cited_patent_number': '1359461', 'cited_pat..."
4,4,3930527,1976-01-06,19,Tire and wheel assembly,A wheel having a pair of spaced-apart seats fo...,8,13,1,1,4,"[{'cited_patent_number': '1921772', 'cited_pat..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9988,9988,4358129,1982-11-09,1,Tractor with a built-on underframe for a tilli...,The back part of the underframe is fastened to...,2,4,1,0,1,"[{'cited_patent_number': '2707643', 'cited_pat..."
9989,9989,4358133,1982-11-09,32,Adjustable width trailer,A trailer frame having fixed wheel track is pr...,6,6,1,0,1,"[{'cited_patent_number': '3239274', 'cited_pat..."
9990,9990,4358135,1982-11-09,22,Connector for igniting circuit of priming device,A connector provided in an igniting circuit fo...,9,7,4,1,2,"[{'cited_patent_number': '3509297', 'cited_pat..."
9991,9991,4358136,1982-11-09,32,Energy absorbing device for use with vehicular...,An energy absorbing device for use with a vehi...,7,7,4,1,1,"[{'cited_patent_number': '3547468', 'cited_pat..."


## 看一看正负样本分别长什么样

In [None]:
# raw_data.nlargest(10,"patent_num_cited_by_us_patents")
pos_some_data = raw_data[raw_data["patent_num_cited_by_us_patents"]>40]
pos_some_data.info()
pos_some_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 9 to 9986
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   index                           534 non-null    int64 
 1   patent_number                   534 non-null    object
 2   patent_date                     534 non-null    object
 3   patent_num_cited_by_us_patents  534 non-null    int64 
 4   patent_title                    534 non-null    object
 5   patent_abstract                 534 non-null    object
 6   patent_num_claims               534 non-null    int64 
 7   patent_num_us_patent_citations  534 non-null    int64 
 8   inventors                       534 non-null    int64 
 9   assignees                       534 non-null    int64 
 10  IPCs                            534 non-null    int64 
 11  cited_patents                   534 non-null    object
dtypes: int64(7), object(5)
memory usage: 54.2+ KB


Unnamed: 0,index,patent_num_cited_by_us_patents,patent_num_claims,patent_num_us_patent_citations,inventors,assignees,IPCs
count,534.0,534.0,534.0,534.0,534.0,534.0,534.0
mean,5373.569288,63.183521,11.292135,6.790262,1.614232,0.662921,1.485019
std,2930.359191,32.245424,11.342066,4.091016,0.961288,0.514929,0.800305
min,9.0,41.0,1.0,0.0,1.0,0.0,1.0
25%,3042.0,46.0,5.0,4.0,1.0,0.0,1.0
50%,5578.5,52.0,8.0,6.0,1.0,1.0,1.0
75%,7960.0,68.75,14.0,9.0,2.0,1.0,2.0
max,9979.0,284.0,123.0,35.0,6.0,3.0,5.0


**可以看到基本上没什么规律，除非能够很好地理解文本，否则很难自动地将它们挑出来。**

## 将分析数据转换为输入特征

### 数值特征归一化

In [None]:
for str in ["patent_num_claims","patent_num_us_patent_citations","inventors","assignees","IPCs"]:
    raw_data[str] = (raw_data[str] - raw_data[str].mean())/raw_data[str].std()

In [None]:
# raw_data

### 文本特征提取

In [None]:
'''构建txt_vecs，作为Dataset的第二个数据来源'''
from transformers import DistilBertTokenizer, DistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")      
nlp_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
'''加载BERT这种大模型很耗时，因此在整个流程中应当让上面两行代码只执行一次。'''

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'加载BERT这种大模型很耗时，因此在整个流程中应当让上面两行代码只执行一次。'

In [None]:
'''此循环较贵(num_patents/150 秒)，但无法避免'''
for i in tqdm(range(num_patents)):
    ttl = raw_data["patent_title"].values[i]
    abst = raw_data["patent_abstract"].values[i]
    txt_input = ttl + " " + abst
    inputs = tokenizer(txt_input, return_tensors="pt").to(device)
    if len(inputs['input_ids'][0]) > 512:
        outputs = nlp_model(input_ids = inputs['input_ids'][0, :510], attention_mask = inputs['attention_mask'][0, :510])
    else:
        outputs = nlp_model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    cls_vec = last_hidden_states[:,0,:].clone().detach()
    if i == 0:
        txt_vecs = cls_vec
    else:
        txt_vecs = torch.cat([txt_vecs, cls_vec],dim=0)

print(txt_vecs.shape)
# should be [num_samples, embedding_dim]

  3%|▎         | 315/9993 [00:03<01:52, 86.19it/s] 


RuntimeError: ignored

In [None]:
Mat = txt_vecs.cpu().numpy()

from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

# txt_pca = PCA(n_components=0.9, svd_solver = 'full')
txt_pca = PCA(n_components = 5)

In [None]:
# 查看PCA的效果:
txt_pca.fit(Mat)
var = txt_pca.explained_variance_ratio_
print(f"PCA之后的维度：{len(var)}，PCA后的方差保留：{var.sum():.4f}")

PCA之后的维度：5，PCA后的方差保留：0.4711


In [None]:
# PCA
Txt_Embedding_c = torch.tensor(txt_pca.fit_transform(Mat), dtype = torch.float32)
print(Txt_Embedding_c.shape)
# should be [10000, 5] here

# without PCA
Txt_Embedding = txt_vecs.clone()
print(Txt_Embedding.shape, Txt_Embedding.dtype)

torch.Size([9993, 5])
torch.Size([9993, 768]) torch.float32


### 图特征提取

In [None]:
# edge_index, edge_attr
from datetime import datetime

adj_ls = []
edges = []

for i in tqdm(range(num_patents)):
    id = raw_data.values[i][0]
    b_cits = eval(raw_data["cited_patents"].values[i])
    for b_cit in b_cits:
        # 是否在专利数据库中
        if b_cit["cited_patent_date"]:
            # 是否在本数据集中
            if b_cit["cited_patent_number"] in raw_data["patent_number"].values:
                # print(i)
                cit_id = raw_data[raw_data["patent_number"].values == b_cit["cited_patent_number"]].values[0][0]
                adj_ls.append([id, cit_id])
                date = raw_data["patent_date"].values[id]
                cit_date = raw_data["patent_date"].values[cit_id]
                date1 = datetime.strptime(date, "%Y-%m-%d")
                date2 = datetime.strptime(cit_date, "%Y-%m-%d")
                dist = date1-date2
                '''边的权重设置为365/间隔天数'''
                dist = 365/dist.days
                edges.append(dist)

100%|██████████| 9993/9993 [00:32<00:00, 305.09it/s]


In [None]:
# adj_ls[0:10], edges[0:10]
'''edges需要做归一化吗？'''

'edges需要做归一化吗？'

# UTILS

In [None]:
# target transform, 0:neg, 1:pos
def LabelCorePa(ref, isd):
    now = datetime.strptime("2022-01-01", "%Y-%m-%d").year
    years = now - datetime.strptime(isd, "%Y-%m-%d").year
    score = ref/years       
    label = int((score>0.5))
    return label

# Dataset
class PatDataset(Dataset):
    def __init__(self, raw_data, txt_vecs = None, transform = None, target_transform = LabelCorePa):
        super().__init__()
        self.raw_data = raw_data
        self.txt_vecs = txt_vecs
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        # Y          
        ref = self.raw_data["patent_num_cited_by_us_patents"].values[idx]
        isd = self.raw_data.at[idx, "patent_date"]
        if self.target_transform:
            label = self.target_transform(ref, isd)
            label = torch.tensor(label,dtype=torch.long)

        # X
        # INDEXs
        num_claims = self.raw_data.at[idx, "patent_num_claims"]
        num_b_cits = self.raw_data.at[idx, "patent_num_us_patent_citations"]
        inventors = self.raw_data.at[idx, "inventors"]
        # num_inventors = len(eval(inventors))
        assignees = self.raw_data.at[idx, "assignees"]
        # if assignees == "[{'assignee_sequence': None, 'assignee_key_id': None}]":
        #     num_assignees = 0
        # else:
        #     num_assignees = len(eval(assignees))
        IPCs = self.raw_data.at[idx, "IPCs"]
        # num_ipcs = len(eval(IPCs))
        indexs = torch.tensor([num_claims, num_b_cits, inventors, assignees, IPCs], dtype=torch.float32).to(device)
        patent = indexs.clone()
        # TXT
        if self.txt_vecs != None:
            txt_vec = self.txt_vecs[idx].clone()
            txt_vec = txt_vec.to(device)
            patent = torch.cat([indexs, txt_vec])
        
        return patent, label

# Models
class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            # nn.BatchNorm1d(input_size),
            # 不加BatchNorm效果更好，事实证明：不要去玩自己不会的东西
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            # nn.Linear(hidden_size, 128),
            # nn.ReLU(),
            # nn.Dropout(p=0.5),
            # nn.Linear(128, 32),
            # nn.ReLU(),
            # nn.Dropout(p=0.5),
            # nn.Linear(32, 8),
            # nn.ReLU(),
            # nn.Dropout(p=0.5),
            # nn.Linear(8, output_size)
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

from torch_geometric.nn import GCNConv

class GNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.conv1 = GCNConv(input_size, hidden_size)
        self.conv2 = GCNConv(hidden_size, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        logits = self.conv2(x, edge_index)                   # (num_nodes, 2)
        return logits                  # (num_nodes, 2)



# Train
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):

        # batch: 第几个batch；X: 包含batch_size个feature vec.
        X = X.to(device)
        y = y.to(device)

        output = model(X).to(device)
        loss = loss_fn(output, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 20 == 0:                                                     # train_size/batch_size = 100, 每20个batch输出一次结果，共输出5次。
            loss, current = loss.item(), batch * len(X)                         
            # print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

# Test
def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
            X = X.to(device)
            y = y.to(device)
            output = model(X)
            test_loss += loss_fn(output, y).item()
            # correct += (output.argmax(1) == y).type(torch.float).sum().item()
            
            pred = output.argmax(1).cpu()
            y = y.cpu()
            if batch == 0:
                Pred = pred
                Y = y
            else:
                Pred = torch.cat((Pred, pred), dim = 0)
                Y = torch.cat((Y, y), dim=0)

    C_Mat = metrics.confusion_matrix(Y, Pred)
    accuracy = metrics.accuracy_score(Y,Pred)
    f1 = metrics.f1_score(Y, Pred)
    recall = metrics.recall_score(Y, Pred)
    precision = metrics.precision_score(Y, Pred)
    print(C_Mat)
    print(f"{test_loss:.4f}")
    print(f"acc:{accuracy:.4f}, f1:{f1:.4f}, recall:{recall:.4f}, prec:{precision:.4f}")

    # test_loss /= num_batches
    # correct /= size
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# CONFIG

In [None]:
# HyperParams----Config.py
hidden_size = 16
output_size = 2

learning_rate = 1e-3
weight_decay = 5e-4
batch_size = 64
epochs = 50
num_epochs_to_print = epochs/10                                                 # 每隔10次输出一次Metrics

num_train = 8000
num_test = num_patents - num_train

# Main

## MLP

In [None]:
# PIPLINE
# DATA TO FIT A MODEL
dataset = PatDataset(raw_data=raw_data, txt_vecs = Txt_Embedding)
training_indices = [i for i in range(num_train)]
test_indices = [i for i in range(num_train,num_train+num_test)]
training_data_all = Subset(dataset, training_indices)
test_data = Subset(dataset, test_indices)

# Make pos:neg in training set 1:1
pos_indices = []
neg_indices = []
for i in range(num_train):
    if training_data_all[i][1] == 1:
        pos_indices.append(i)
    else:
        neg_indices.append(i)
num_pos = len(pos_indices)
num_neg = len(neg_indices)
print(f"NEG:POS = {num_neg}:{num_pos} = {num_neg/num_pos:.2f}")
neg_indices_sample = np.random.choice(neg_indices, num_pos, replace = False)	 
training_data_pos = Subset(training_data_all, pos_indices)
training_data_neg = Subset(training_data_all, neg_indices_sample)
training_data = ConcatDataset([training_data_pos, training_data_neg])
print("Training sample number:",len(training_data))

train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

NEG:POS = 6631:1369 = 4.84
Training sample number: 2738
Feature batch shape: torch.Size([64, 773])
Labels batch shape: torch.Size([64])


In [None]:
# FIT A MODEL
# PyTorch的逻辑是先初始化（喂超参），再进行函数计算（喂输入）
input_size = len(dataset[0][0])
model = SimpleNet(input_size, hidden_size, output_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

for t in range(epochs):
    train_loop(train_dataloader, model, loss_fn, optimizer)
    if t==0 or (t+1)%num_epochs_to_print == 0:
        print(f"Epoch {t+1}\n-------------------------------")
        test_loop(train_dataloader, model, loss_fn)                                 # performance on training data
        test_loop(test_dataloader, model, loss_fn)                                  # performance on test data
print("Done!")

Epoch 1
-------------------------------
[[ 255 1114]
 [ 102 1267]]
29.5229
acc:0.5559, f1:0.6757, recall:0.9255, prec:0.5321
[[ 259 1295]
 [  38  401]]
22.7404
acc:0.3312, f1:0.3756, recall:0.9134, prec:0.2364
Epoch 10
-------------------------------
[[929 440]
 [536 833]]
27.2733
acc:0.6435, f1:0.6306, recall:0.6085, prec:0.6544
[[951 603]
 [174 265]]
21.2183
acc:0.6101, f1:0.4055, recall:0.6036, prec:0.3053
Epoch 20
-------------------------------
[[873 496]
 [414 955]]
26.4071
acc:0.6676, f1:0.6773, recall:0.6976, prec:0.6582
[[873 681]
 [148 291]]
22.2760
acc:0.5840, f1:0.4125, recall:0.6629, prec:0.2994
Epoch 30
-------------------------------
[[975 394]
 [465 904]]
25.5535
acc:0.6863, f1:0.6779, recall:0.6603, prec:0.6965
[[961 593]
 [167 272]]
21.2876
acc:0.6187, f1:0.4172, recall:0.6196, prec:0.3145
Epoch 40
-------------------------------
[[908 461]
 [391 978]]
24.8267
acc:0.6888, f1:0.6966, recall:0.7144, prec:0.6796
[[878 676]
 [160 279]]
22.3954
acc:0.5805, f1:0.4003, recal

## PyG GNN

In [None]:
# data.x, data.y
for i in range(num_patents):
    if i == 0:
        x = dataset[i][0].unsqueeze(0)
        y = dataset[i][1].unsqueeze(0)
    else:
        x = torch.cat([x, dataset[i][0].unsqueeze(0)])
        y = torch.cat([y, dataset[i][1].unsqueeze(0)])

print(x.shape, x.dtype, y.shape, y.dtype)

torch.Size([9993, 773]) torch.float32 torch.Size([9993]) torch.int64


In [None]:
print(len(pos_indices),len(neg_indices_sample))
print(neg_indices_sample)

In [None]:
from torch_geometric.data import Data

# num_nodes, num_node_features
x = torch.tensor(x, dtype=torch.float32)

# num_nodes, 1
y = torch.tensor(y, dtype=torch.long)

# 2, num_edges
edge_index = torch.tensor(adj_ls, dtype=torch.long)

# num_edges, num_edge_features
edge_attr = torch.tensor(edges, dtype=torch.float32)

# train-test split
train_ls = [False]*num_train + [False]*num_test
train_mask = torch.tensor(train_ls, dtype=bool)
train_mask[pos_indices] = True
train_mask[neg_indices_sample] = True
test_ls = [False]*num_train + [True]*num_test
test_mask = torch.tensor(test_ls, dtype=bool)


data = Data(x=x, y=y, edge_index = edge_index.t().contiguous(), edge_attr=edge_attr, train_mask = train_mask, test_mask=test_mask)
data = data.to(device)

  after removing the cwd from sys.path.
  import sys


In [None]:
print(f'Number of nodes: {data.num_nodes}') # 节点数量
print(f'Number of edges: {data.num_edges}') # 边数量
print(f'Number of node features: {data.num_node_features}') # 节点属性的维度
print(f'Number of node features: {data.num_features}') # 同样是节点属性的维度
print(f'Number of edge features: {data.num_edge_features}') # 边属性的维度
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}') # 平均节点度
print(f'if edge indices are ordered and do not contain duplicate entries.: {data.is_coalesced()}') # 是否边是有序的同时不含有重复的边
print(f'Number of training nodes: {data.train_mask.sum()}') # 用作训练集的节点
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}') # 用作训练集的节点的数量
print(f'Contains isolated nodes: {data.has_isolated_nodes()}') # 此图是否包含孤立的节点
print(f'Contains self-loops: {data.has_self_loops()}')  # 此图是否包含自环的边
print(f'Is undirected: {data.is_undirected()}')  # 此图是否是无向图

Number of nodes: 9993
Number of edges: 5560
Number of node features: 773
Number of node features: 773
Number of edge features: 1
Average node degree: 0.56
if edge indices are ordered and do not contain duplicate entries.: True
Number of training nodes: 2738
Training node label rate: 0.27
Contains isolated nodes: True
Contains self-loops: False
Is undirected: False


In [None]:
input_size = len(dataset[0][0])

model = GNN(input_size, hidden_size, output_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
loss_fn = nn.CrossEntropyLoss()

model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [None]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')
Pred = pred[data.test_mask].cpu()
Y = data.y[data.test_mask].cpu()
C_Mat = metrics.confusion_matrix(Y, Pred)
accuracy = metrics.accuracy_score(Y,Pred)
f1 = metrics.f1_score(Y, Pred)
recall = metrics.recall_score(Y, Pred)
precision = metrics.precision_score(Y, Pred)
print(C_Mat)
print(f"acc:{accuracy:.4f}, f1:{f1:.4f}, recall:{recall:.4f}, prec:{precision:.4f}")


Accuracy: 0.6754
[[1143  411]
 [ 236  203]]
acc:0.6754, f1:0.3856, recall:0.4624, prec:0.3306


# Save&Load Model

In [None]:
# 保存模型权重至当前文件夹
torch.save(model.state_dict(), 'model_weights.pth')                             

In [None]:
model = SimpleNet(input_size, hidden_size, output_size)                         # 需要是同一个模型
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()

'''
be sure to call model.eval() method before inferencing to set the dropout and batch normalization layers to evaluation mode. 
Failing to do this will yield inconsistent inference results.
'''

In [None]:
# 直接保存/加载整个模型
torch.save(model, 'model.pth')

In [None]:
model = torch.load('model.pth')

# Discoveries
1. SimpleNet的训练情况似乎说明**模型并不能从数据中学到什么东西（特征可能没有提供什么信息）**。
线性模型和MLP跑出来几乎没差别，说明问题不在模型上面，基本可以断定是数据（输入特征）的问题。

1. GNN的训练好像比SimpleNet快很多，可能是train_loop写得太复杂了。

In [None]:
t = raw_data["patent_title"].values[0] +" " +raw_data["patent_abstract"].values[0]



In [None]:
i = tokenizer(t, return_tensors="pt")

In [None]:
i, i.input_ids[0, :128]

({'input_ids': tensor([[  101,  5217,  9419,  1998,  4316, 16636,  2075, 14709,  2005,  6882,
           5217,  9378,  2545,  2019,  9935, 16636,  2953,  2005,  2224,  1999,
           9595,  2007,  1037,  5217,  9419,  5080,  2005,  2019,  6882,  5217,
           9378,  2121,  2164,  2019, 10866,  4677,  2383,  1037, 29018,  1997,
          13228,  2135,  8526,  3085,  6077, 20369,  2135,  7119,  2045,  3406,
           1012,  1996,  6077,  5373,  3604,  1999,  1037,  2597,  2000,  9075,
           2019,  8285,  2083,  1996,  9378,  2121,  2073,  4017,  1996,  2041,
           6277,  2203,  2003,  1999,  2485,  2523,  2007,  1996,  4677,  1998,
           4218,  1996,  9935,  2034,  4637,  3302,  1012,  4082,  2965,  2024,
           2443, 13557,  2043,  2019,  9935,  2003,  1999,  2597,  2012,  2560,
           2028,  1997,  1996,  6077,  2097,  2022,  8073,  2333,  2000,  1037,
           2597,  2073,  1996,  3899,  8908,  2682,  1996,  9935,  4637,  3302,
           1010, 10402,  1