In [1]:
pip install datasets

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import random
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup,TrainingArguments, Trainer
from datasets import load_metric
from transformers import BertModel
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv


In [4]:
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
sub_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")

In [5]:
class Config:
   # model
    model = 'anferico/bert-for-patents'
    
    max_len = 32
    num_epoch = 2
    batch_size = 64
    epochs = 2
    lr = 1e-6

In [6]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [7]:
if torch.cuda.is_available():     
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
print( 'device set to =>', device)

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
device set to => cuda


In [8]:
class PatentDataset( torch.utils.data.Dataset ):
    def __init__( self, anchor, target, context, score, tokenizer, max_len ):
        self.anchor = anchor
        self.target = target
        self.context = context
        self.score = score
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__( self ):
        return len( self.anchor )
    
    def __getitem__( self, idx ):
        anchor = self.anchor[idx ]
        target = self.target[ idx ]
        context = self.context[ idx ]
        score = self.score[ idx ]
        
        encoded_data = self.tokenizer.encode_plus(
                    context+ ' ' + anchor,
                    target,
                    padding = 'max_length',
                    max_length = self.max_len,
                    truncation=True,
                    return_attention_mask = True,
        )
        input_ids = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]
        token_type_ids = encoded_data["token_type_ids"]
        
        return {
            'input_ids' : torch.tensor( input_ids, dtype= torch.long),
            'attention_mask': torch.tensor( attention_mask, dtype=torch.long),
            'token_type': torch.tensor( token_type_ids, dtype=torch.long),
            'label': torch.tensor( score, dtype=torch.long)
        }

In [None]:
class PatentTestDataset( torch.utils.data.Dataset ):
    def __init__( self, anchor, target, context, tokenizer, max_len ):
        self.anchor = anchor
        self.target = target
        self.context = context
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__( self ):
        return len( self.anchor )
    
    def __getitem__( self, idx ):
        anchor = self.anchor[idx ]
        target = self.target[ idx ]
        context = self.context[ idx ]
        
        encoded_data = self.tokenizer.encode_plus(
                    context+ ' ' + anchor,
                    target,
                    padding = 'max_length',
                    max_length = self.max_len,
                    truncation=True,
                    return_attention_mask = True,
        )
        input_ids = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]
        token_type_ids = encoded_data["token_type_ids"]
        
        return {
            'input_ids' : torch.tensor( input_ids, dtype= torch.long),
            'attention_mask': torch.tensor( attention_mask, dtype=torch.long),
            'token_type': torch.tensor( token_type_ids, dtype=torch.long),
        }

In [9]:
class BertClassifier(nn.Module):
    def __init__( self, dropout ):
        super( BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(Config.model)
        self.dropout = nn.Dropout( dropout )
        self.linear = nn.Linear( 1024, 5 )
        self.relu = nn.ReLU()
        
    def forward( self,  input_id, mask ):
        _, pooled_output = self.bert( input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout( pooled_output )
        linear_output = self.linear( dropout_output )
        final_layer = self.relu( linear_output )
        return final_layer

In [10]:
tokenizer = AutoTokenizer.from_pretrained( Config.model , 
                                            padding='max_length',
                                            pad_to_max_length = True,
                                            max_length = Config.max_len,
                                            truncation=True)   

Downloading:   0%|          | 0.00/327 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/322k [00:00<?, ?B/s]

In [11]:
train_examples = int(len(train_df) * 0.9)
train_data = train_df.iloc[ :train_examples, 1:]
val_data = train_df.iloc[ train_examples:, 1:]
print( 'train_size:', train_data.shape[0] )
print( 'val_size:', val_data.shape[0] )

train_size: 32825
val_size: 3648


In [12]:
def train(model, train, val, learning_rate=.01, epochs=2):
    train_dataset = PatentDataset(
                    anchor = train.anchor.values,
                    target = train.target.values,
                    context = train.context.values,
                    score = train.score.values,
                    tokenizer = tokenizer,
                    max_len = Config.max_len
    )

    val_dataset = PatentDataset(
                        anchor = val.anchor.values,
                        target = val.target.values,
                        context = val.context.values,
                        score = val.score.values,
                        tokenizer = tokenizer,
                        max_len = Config.max_len
    )
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        
        for item in tqdm(train_dataloader):
            train_label = item['label'].to(device)
            mask = item['attention_mask'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)


            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
                
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            
            
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

        with torch.no_grad():

            for item in val_dataloader:

                val_label = item['label'].to(device)
                mask = item['attention_mask'].to(device)
                input_id = item['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                    
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        save_path = f'bert_{epoch_num}.pt'
        torch.save({
            'epoch': epoch_num,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
#             'loss': LOSS,
            }, save_path)    
        print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
        print( 'model saved to =>', save_path)

model = BertClassifier(dropout=0.5)
           
train(model, train_data, val_data, Config.lr, Config.epochs)

Downloading:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1026/1026 [06:25<00:00,  2.66it/s]


Epochs: 1 | Train Loss:  0.006                 | Train Accuracy:  0.964                 | Val Loss:  0.002                 | Val Accuracy:  0.979
model saved to => bert_0.pt


100%|██████████| 1026/1026 [06:24<00:00,  2.67it/s]


Epochs: 2 | Train Loss:  0.001                 | Train Accuracy:  0.989                 | Val Loss:  0.002                 | Val Accuracy:  0.986
model saved to => bert_1.pt


In [None]:
def predict(model, test_data):

    test = PatentTestDataset( 
                    anchor = test_data.anchor.values,
                    target = test_data.target.values,
                    context = test_data.context.values,
                    tokenizer = tokenizer,
                    max_len = Config.max_len
    )

    test_dataloader = DataLoader(test, batch_size=32)

    if torch.cuda.is_available():
        model = model.cuda()
    score_list = []
    with torch.no_grad():

        for item in test_dataloader:

            mask = item['attention_mask'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            preds = output.argmax(dim=1)
            score_list.append(  preds )

    print( 'test generated =>' , len( score_list))
    return score_list

In [None]:
scores = predict(model, test_df)

In [None]:
scorelist =[]
for item in scores:
    scorelist.append(item.cpu().numpy())
preds = np.hstack(scorelist)
sub_df['preds'] =preds

In [None]:
sub_df.head()

In [None]:
score_map = dict(zip( range(5), ['0.00', '0.25', '0.50', '0.75', '1.00']))
inverse_score_map = dict(zip( [0.00, 0.25, 0.50, 0.75, 1.00],range(5) ))

In [None]:
sub_df['score']= sub_df.preds.astype(float).map(score_map)

In [None]:
import os  
os.makedirs('./kaggle/working', exist_ok=True)  

In [None]:
sub_df[['id', 'score']].to_csv('./submission.csv', index=False)