<a href="https://colab.research.google.com/github/Zheng-Ao/Colab-Notebooks/blob/main/P0_v05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# library install & cuda device

In [None]:
!python -c "import torch; print(torch.__version__)"
# !python -c "import torch; print(torch.version.cuda)"

In [None]:
!pip install transformers
# !pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
# !pip install torch-geometric

In [None]:
!nvidia-smi
import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# `device` is read as a module-level global by the cells below.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# utils.py

## Txt2Vec Matrix, PCA

In [None]:
raw_data_path = "drive/MyDrive/P0/T10K.csv"

# Build txt_vecs, the second data source consumed by the Dataset.
import pandas as pd
from tqdm import tqdm

from transformers import DistilBertTokenizer, DistilBertModel

# Loading a large model such as BERT is slow, so these two lines should be
# executed only once for the whole workflow.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
nlp_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
# Inference only: make sure dropout is disabled.
nlp_model.eval()

df = pd.read_csv(raw_data_path)
# Embed every row of the CSV so that txt_vecs stays row-aligned with the
# Dataset, which indexes it by CSV row number.
titles = df["patent_title"].tolist()

cls_vecs = []
# No gradients are needed for feature extraction; skipping autograd saves
# memory and time on every forward pass.
with torch.no_grad():
    for ttl in tqdm(titles):
        # Truncate so an unusually long title cannot exceed the model's
        # maximum sequence length.
        inputs = tokenizer(ttl, return_tensors="pt", truncation=True).to(device)
        outputs = nlp_model(**inputs)
        # Keep only the first-token ([CLS]) vector; clone() so the full
        # hidden-state tensor of this title can be freed.
        cls_vecs.append(outputs.last_hidden_state[:, 0, :].clone())

# Concatenate once at the end instead of re-copying the matrix per title.
txt_vecs = torch.cat(cls_vecs, dim=0)

In [None]:
# Inspect the stacked first-token embeddings: one row per embedded title.
txt_vecs.shape

In [None]:
# Copy the embeddings to host memory as a NumPy array (the PCA below works on it).
Mat = txt_vecs.cpu().numpy()

In [None]:
# Sanity-check the array shape before running PCA.
Mat.shape

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Alternative: keep enough components to explain 90% of the variance.
# txt_pca = PCA(n_components=0.9, svd_solver = 'full')
# Fixed random_state so the projection is reproducible across runs when
# scikit-learn selects a randomized SVD solver.
txt_pca = PCA(n_components=5, random_state=0)

In [None]:
# txt_pca.fit(Mat)
# var = txt_pca.explained_variance_ratio_
# len(var), var.sum()

In [None]:
# Fit the PCA on the title embeddings and project them in one step.
# NOTE(review): the PCA is fitted on every row of Mat, including rows that are
# later used as the test split - confirm this leakage is acceptable.
new_Mat = txt_pca.fit_transform(Mat)

## utils

In [None]:
from torch.utils.data import Dataset, Subset, ConcatDataset, DataLoader
from torch import nn

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import metrics

import random

from datetime import datetime
# Target transform: 0 = negative, 1 = positive.
def LabelCorePa(ref, isd, now_year=2022, threshold=0.5):
    """Label a patent from its average number of forward citations per year.

    Args:
        ref: Number of forward citations received by the patent.
        isd: Issue date as a ``YYYY-MM-DD`` string.
        now_year: Reference year used to compute the patent's age in years.
        threshold: Citations-per-year value that must be exceeded for a
            positive label.

    Returns:
        1 if ``ref / years`` is strictly greater than ``threshold``, else 0.

    Raises:
        ValueError: If ``isd`` is not a valid ``YYYY-MM-DD`` date.
    """
    # "%m" is the month directive ("%M" would parse minutes and silently
    # accept out-of-range months).
    issue_year = datetime.strptime(isd, "%Y-%m-%d").year
    # Count at least one year so a patent issued in (or after) the reference
    # year does not trigger a division by zero.
    years = max(now_year - issue_year, 1)
    score = ref / years
    return int(score > threshold)

# Dataset
class PatDataset(Dataset):
    """Patent samples built from a CSV file plus one text vector per row.

    Indexing yields ``(patent, label)``. ``patent`` is a 1-D float32 tensor
    placed on the module-level ``device``: five count features (claims,
    backward citations, inventors, assignees, IPC entries) followed by
    ``txt_vecs[idx]``. ``label`` is a ``torch.long`` scalar tensor.

    Args:
        raw_data_path: Path of the CSV file, read with ``pandas.read_csv``.
        txt_vecs: Text vectors indexed by CSV row number.
        transform: Optional callable applied to the assembled feature tensor.
        target_transform: Optional callable ``(ref, isd) -> label``. When it is
            ``None`` the raw forward-citation count is used as the label.
    """

    # Cell content that marks a patent without any assignee.
    _NO_ASSIGNEE = "[{'assignee_sequence': None, 'assignee_key_id': None}]"

    def __init__(self, raw_data_path, txt_vecs, transform = None, target_transform = LabelCorePa):
        self.raw_data = pd.read_csv(raw_data_path)
        self.txt_vecs = txt_vecs
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.raw_data)

    @staticmethod
    def _count_entries(cell):
        """Return the length of a list stored as Python-literal text."""
        import ast

        # literal_eval only accepts literals, so text coming from the CSV file
        # is parsed but never executed as code.
        return len(ast.literal_eval(cell))

    def __getitem__(self, idx):
        """Build the ``(patent, label)`` pair for CSV row ``idx``."""
        table = self.raw_data

        # Y
        ref = table.at[idx, "patent_num_cited_by_us_patents"]
        isd = table.at[idx, "patent_date"]
        if self.target_transform is not None:
            label = self.target_transform(ref, isd)
        else:
            # No target transform: fall back to the raw citation count so that
            # `label` is always defined.
            label = ref
        label = torch.tensor(label, dtype=torch.long)

        # X: bibliographic count features
        num_claims = table.at[idx, "patent_num_claims"]
        b_cits = table.at[idx, "patent_num_us_patent_citations"]
        num_inventors = self._count_entries(table.at[idx, "inventors"])
        assignees = table.at[idx, "assignees"]
        if assignees == self._NO_ASSIGNEE:
            num_assignees = 0
        else:
            num_assignees = self._count_entries(assignees)
        num_ipcs = self._count_entries(table.at[idx, "IPCs"])
        indexs = torch.tensor(
            [num_claims, b_cits, num_inventors, num_assignees, num_ipcs],
            dtype=torch.float32,
        ).to(device)

        # X: text vector of the same row
        txt_vec = torch.tensor(self.txt_vecs[idx], dtype=torch.float32).to(device)

        patent = torch.cat([indexs, txt_vec])
        if self.transform is not None:
            patent = self.transform(patent)

        return patent, label

# Models
class SimpleNet(nn.Module):
    """Small classifier: batch norm, one hidden ReLU layer, linear output.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        output_size: Number of output logits per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layer order (and therefore the state-dict keys) must stay fixed so
        # previously saved weights remain loadable.
        layers = [
            nn.BatchNorm1d(input_size),
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised class logits for the batch ``x``."""
        return self.linear_relu_stack(x)




# Train
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass (one epoch) over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches, where ``X`` holds the
            feature vectors of one batch and ``y`` the matching targets.
        model: Module being trained; it is switched to training mode.
        loss_fn: Callable ``(output, y) -> loss`` returning a scalar tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
    """
    # Select training mode explicitly so batch-norm layers use batch statistics
    # even if an evaluation pass ran in between epochs.
    model.train()

    # Feed the batches on whichever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None

    for X, y in dataloader:
        if target is not None:
            X = X.to(target)
            y = y.to(target)

        loss = loss_fn(model(X), y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Test
def test_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print classification metrics.

    Prints the confusion matrix followed by accuracy, F1, recall and precision
    computed over all batches.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to evaluate. It is run in evaluation mode and put back
            into its previous mode afterwards.
        loss_fn: Unused; kept so existing call sites keep working.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Feed the batches on whichever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None

    # Evaluate in eval mode: otherwise batch-norm layers normalise with the
    # statistics of each evaluation batch and update their running statistics
    # from the evaluation data.
    was_training = model.training
    model.eval()
    batch_preds, batch_targets = [], []
    try:
        with torch.no_grad():
            for X, y in dataloader:
                if target is not None:
                    X = X.to(target)
                batch_preds.append(model(X).argmax(1).cpu())
                batch_targets.append(y.cpu())
    finally:
        # Restore the caller's mode so a following training pass is unaffected.
        model.train(was_training)

    if not batch_preds:
        raise ValueError("dataloader yielded no batches to evaluate")

    Pred = torch.cat(batch_preds, dim=0)
    Y = torch.cat(batch_targets, dim=0)

    C_Mat = metrics.confusion_matrix(Y, Pred)
    accuracy = metrics.accuracy_score(Y,Pred)
    f1 = metrics.f1_score(Y, Pred)
    recall = metrics.recall_score(Y, Pred)
    precision = metrics.precision_score(Y, Pred)
    print(C_Mat)
    print(f"acc:{accuracy:.4f}, f1:{f1:.4f}, recall:{recall:.4f}, prec:{precision:.4f}")

# config.py

In [None]:
# Hyper-parameters (would live in config.py)
# Model shape: width of the hidden layer and number of output logits.
hidden_size = 32
output_size = 2

# Optimizer settings passed to Adam.
learning_rate = 1e-3
weight_decay = 5e-4
batch_size = 64                                                                 # with num_test = 2000 the test loader yields 32 batches (the last one partial)
epochs = 10

# Split sizes: rows [0, num_train) are used for training, the next num_test rows for testing.
num_train = 8000
num_test = 2000

# Main

In [None]:
# Shape of the PCA-reduced text features handed to PatDataset below.
new_Mat.shape

In [None]:
# PIPELINE
# DATA TO FIT A MODEL
dataset = PatDataset(raw_data_path=raw_data_path, txt_vecs = new_Mat)
training_indices = list(range(num_train))
test_indices = list(range(num_train, num_train + num_test))
training_data_all = Subset(dataset, training_indices)
test_data = Subset(dataset, test_indices)

# Make pos:neg in training set 1:1
pos_indices = []
neg_indices = []
for i in range(num_train):
    if training_data_all[i][1] == 1:
        pos_indices.append(i)
    else:
        neg_indices.append(i)
num_pos = len(pos_indices)
num_neg = len(neg_indices)
if num_pos == 0 or num_neg == 0:
    # A 1:1 balanced set cannot be built from a single class.
    raise ValueError(
        f"Cannot balance the training set: {num_pos} positive and {num_neg} negative samples."
    )
print(f"{num_neg}:{num_pos} = {num_neg/num_pos}")
if num_pos <= num_neg:
    # Undersample without replacement so no negative sample is duplicated.
    neg_indices_sample = random.sample(neg_indices, k=num_pos)
else:
    # Fewer negatives than positives: oversampling needs replacement.
    neg_indices_sample = random.choices(neg_indices, k=num_pos)
training_data_pos = Subset(training_data_all, pos_indices)
training_data_neg = Subset(training_data_all, neg_indices_sample)
training_data = ConcatDataset([training_data_pos, training_data_neg])
print("Training sample number:",len(training_data))

train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
# Evaluation does not need shuffling; a fixed order keeps runs comparable.
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

In [None]:
# FIT A MODEL
# PyTorch pattern: construct the module first (hyper-parameters), then call it (inputs).
# The input width is read from the first sample's feature vector.
input_size = len(dataset[0][0])
model = SimpleNet(input_size, hidden_size, output_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# One training pass per epoch, followed by metrics on both loaders.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(train_dataloader, model, loss_fn)                                 # performance on training data
    test_loop(test_dataloader, model, loss_fn)                                  # performance on test data
print("Done!")

# Save&Load Model

In [None]:
# Save the model weights (state dict) to the current working directory.
torch.save(model.state_dict(), 'model_weights.pth')

In [None]:
# The architecture must match the one the weights were saved from.
model = SimpleNet(input_size, hidden_size, output_size)
# The fresh model lives on the CPU, so map the stored tensors there as well;
# this also lets weights saved from a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load('model_weights.pth', map_location="cpu"))
model.eval()

'''
be sure to call model.eval() method before inferencing to set the dropout and batch normalization layers to evaluation mode. 
Failing to do this will yield inconsistent inference results.
'''

In [None]:
# Save the entire model object directly (not only its weights).
torch.save(model, 'model.pth')

In [None]:
# Load the entire model object back from 'model.pth'.
# NOTE(review): recent PyTorch releases may refuse to unpickle a full model
# unless `weights_only=False` is passed - confirm against the installed version.
model = torch.load('model.pth')

# Discoveries

事实证明，仅使用专利标题（ttl）的文本向量几乎相当于没有给模型提供有用信息，模型倾向于只预测其中一类。