# Task description
- Classify the patent documents based on abstract
- use BERT

#Install packages

In [2]:
!pwd

/content


In [3]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25h

#Import Packages

In [4]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#Read Data

In [None]:
df = pd.read_csv('training_speech_ai_0205.csv')

In [4]:
df=pd.read_excel('speech_training_set_0522_2023.xlsx')

In [5]:
dft=pd.read_excel('speech_test_set_0522_2023.xlsx')

In [None]:
dft=pd.read_excel('speech_test_set_0711_2023.xlsx')

In [6]:
dft

Unnamed: 0,doc_id,Abstract (English),predict50_speech,flag_train_speech,ai_score_speech
0,20040162741,A system and method for supporting Product Lif...,1,0,0.794668
1,20180129796,Embodiments of the invention provide methods a...,0,0,0.002712
2,8078469,A distributed voice user interface system incl...,1,0,1.000000
3,20180253228,A point-of-sale (POS) terminal includes a firs...,0,0,0.000693
4,9406090,A method and apparatus for sharing captured me...,0,0,0.027092
...,...,...,...,...,...
95,20140220526,A system is configured to receive voice emotio...,1,0,0.998783
96,5576954,This is a procedure for determining text relev...,0,0,0.134995
97,20050119894,The present invention involves methods and sys...,1,0,0.999541
98,9378273,A computer-implemented method of answering que...,1,0,0.983408


#Dataset

In [7]:
class CustomDataset(Dataset):

    def __init__(self, split, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = self.data['Abstract (English)']
        #self.targets = dataframe['grp_speech']
        self.max_len = max_len
        self.split=split

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        if self.split=='train':
          self.targets = self.data['grp_speech']
        else:
          self.targets = self.data['predict50_speech']
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())
        #print('idx:',index)
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        for idx in range(0, len(self.targets)):
          if self.targets[idx]=='seed':
            self.targets[idx]=1
          elif self.targets[idx]=='antiseed':
            self.targets[idx]=0

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
MAX_LEN = 200
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
training_set = CustomDataset('train', df, tokenizer, MAX_LEN)
testing_set = CustomDataset('test', dft, tokenizer, MAX_LEN)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Dataloader
- Split dataset into training dataset and validation dataset.
- Create dataloader to iterate the data.

In [8]:
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

#Model

In [9]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased',return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        #print(output_1.shape)
        #print(_.shape)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

#Training

In [10]:
# hyperparameters
EPOCHS = 1
LEARNING_RATE = 1e-05

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [12]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        #print(targets.shape)
        outputs = model(ids, mask, token_type_ids)
        #print(outputs.shape)
        #print(outputs)
        outputs=outputs.squeeze()
        #print(outputs.shape)
        #optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        #update model
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [13]:
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.6719133853912354
Epoch: 0, Loss:  0.21731606125831604
Epoch: 0, Loss:  0.015462188050150871
Epoch: 0, Loss:  0.005395950749516487
Epoch: 0, Loss:  0.006876371800899506
Epoch: 0, Loss:  0.007694674655795097
Epoch: 0, Loss:  0.0030413588974624872
Epoch: 0, Loss:  0.001984414178878069
Epoch: 0, Loss:  0.0017491813050583005
Epoch: 0, Loss:  0.0018141977488994598
Epoch: 0, Loss:  0.0015470852376893163
Epoch: 0, Loss:  0.0022728426847606897
Epoch: 0, Loss:  0.004205962643027306
Epoch: 0, Loss:  0.0012774497736245394
Epoch: 0, Loss:  0.0019252414349466562
Epoch: 0, Loss:  0.0013222332345321774
Epoch: 0, Loss:  0.004313397221267223
Epoch: 0, Loss:  0.0006124326610006392
Epoch: 0, Loss:  0.0005438259104266763
Epoch: 0, Loss:  0.03950600326061249


In [1]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            print(targets.shape)
            print(targets)
            outputs = model(ids, mask, token_type_ids)
            print(outputs.shape)
            print(outputs)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            print(fin_targets)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            print(fin_outputs)
    return fin_outputs, fin_targets

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    print(len(outputs))
    print(outputs)
    pred=np.array(outputs)
    print(pred.shape)
    print(len(targets))
    print(targets)
    p=np.zeros(100)
    for i in range(0,100):
      if pred[i,0]>0.5:
        p[i]=1
      else:
        p[i]=0
    print("Accuracy: "  + str(np.mean((p[:] == targets[:]))))
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")



torch.Size([4])
tensor([1., 1., 1., 0.], device='cuda:0')
torch.Size([4, 1])
tensor([[-7.3690],
        [ 4.2338],
        [ 4.0347],
        [-7.7058]], device='cuda:0')
[1.0, 1.0, 1.0, 0.0]
[[0.0006300753448158503], [0.9857106804847717], [0.9826168417930603], [0.00045002761180512607]]
torch.Size([4])
tensor([0., 1., 1., 0.], device='cuda:0')
torch.Size([4, 1])
tensor([[ 2.3913],
        [ 4.2328],
        [ 4.0999],
        [-7.6457]], device='cuda:0')
[1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0]
[[0.0006300753448158503], [0.9857106804847717], [0.9826168417930603], [0.00045002761180512607], [0.9161608815193176], [0.985696017742157], [0.9836959838867188], [0.0004778534348588437]]
torch.Size([4])
tensor([0., 1., 0., 0.], device='cuda:0')
torch.Size([4, 1])
tensor([[ 4.0846],
        [ 1.8849],
        [ 2.2133],
        [-7.4383]], device='cuda:0')
[1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[[0.0006300753448158503], [0.9857106804847717], [0.9826168417930603], [0.000450027