In [1]:

import torch
import warnings
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AutoTokenizer
from tqdm import tqdm
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Config:
    """Static settings shared by the cells of this notebook.

    Only ``max_len``, ``n_tokens`` and ``device`` are read by the visible
    cells; the remaining fields are not referenced here and are presumably
    carried over from the training notebook (TODO confirm).
    """

    # Not referenced in the visible cells.
    seed = 42
    # `max_length` handed to the tokenizer (padding and truncation target).
    max_len = 314
    # Not referenced in the visible cells.
    epochs = 10
    # Number of placeholder tokens prepended to every encoded excerpt.
    n_tokens = 20
    # Not referenced in the visible cells.
    learning_rate = 1e-5
    # Not referenced in the visible cells.
    n_splits = 8

    # Accelerator selection: CUDA first, then Apple MPS, otherwise CPU.
    # The previous expression tested CUDA availability but then asked for
    # 'mps', so it picked an unusable device on CUDA hosts and never used MPS
    # on hosts where only MPS is available.
    # `getattr` keeps this working on torch builds without `torch.backends.mps`.
    device = torch.device(
        'cuda:0' if torch.cuda.is_available()
        else 'mps' if (
            getattr(torch.backends, 'mps', None) is not None
            and torch.backends.mps.is_available()
        )
        else 'cpu'
    )

# Name passed to `from_pretrained` for both the model and the tokenizer.
MODEL_NAME = 'bert-base-uncased'
# File read with `torch.load` to obtain the state dict for the evaluation model.
path_to_trained_model = 'pytorch_model.bin'

In [3]:
# Build a single-output (num_labels=1) sequence-classification model from the
# base checkpoint, then overwrite its weights with the saved state dict.
# NOTE(review): the saved cell output below mentions 'bert-base-cased' while
# MODEL_NAME is 'bert-base-uncased' -- the output looks stale; re-run on a
# fresh kernel and confirm which checkpoint the weights were trained from.
eval_model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
# SECURITY: torch.load unpickles the file; only load checkpoints from a trusted
# source (consider `weights_only=True` where the installed torch supports it).
eval_model.load_state_dict(torch.load(path_to_trained_model, map_location=Config.device))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
# Load the tokenizer for MODEL_NAME; `predict` reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Write the tokenizer files to ./tokenizer (the returned paths are the cell output).
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [8]:
class EvalDataset(Dataset):
    """Tokenizes excerpts and prepends a fixed block of placeholder tokens.

    Each item is a dict with:
        'input_ids':      1-D long tensor, ``n_tokens`` copies of
                          ``prefix_token_id`` followed by the tokenizer's ids.
        'attention_mask': 1-D long tensor, ``n_tokens`` ones followed by the
                          tokenizer's attention mask.
        'ids':            the entry of ``ids`` at the same index.

    Args:
        sentences: indexable collection of strings.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask'.
        ids: indexable collection of row identifiers; its length defines the
            dataset length.
    """

    # Token id used to fill the prepended block.
    # NOTE(review): presumably a placeholder that the trained model replaces
    # with learned prompt embeddings -- confirm against the training code.
    prefix_token_id = 500

    def __init__(self, sentences, tokenizer, ids):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.ids = ids
        self.n_tokens = Config.n_tokens

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].strip()

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=Config.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

        # Build the prefix directly as 1-D tensors of length n_tokens. The
        # previous code reshaped with a hard-coded `.resize(20)`, which is
        # deprecated and yields a wrong-length prefix (padded with
        # uninitialized memory) whenever Config.n_tokens != 20.
        prefix_ids = torch.full((self.n_tokens,), self.prefix_token_id, dtype=torch.long)
        prefix_mask = torch.ones(self.n_tokens, dtype=torch.long)

        input_ids = torch.cat(
            (prefix_ids, torch.tensor(encoding['input_ids'], dtype=torch.long))
        )
        attention_mask = torch.cat(
            (prefix_mask, torch.tensor(encoding['attention_mask'], dtype=torch.long))
        )

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'ids': self.ids[idx]
        }
    


In [6]:
def predict(df, model, batch_size=1):
    """Run `model` over every excerpt in `df` and collect its outputs.

    Args:
        df: DataFrame with an `excerpt` column (text) and an `id` column.
        model: model whose forward call accepts `input_ids` and
            `attention_mask` and returns an object with a `.logits` tensor.
        batch_size: rows per forward pass. Defaults to 1, which matches the
            previous behaviour (DataLoader default).

    Returns:
        A list with the flattened logits of every batch, in dataset order.
        Elements are NumPy scalars.

    Notes:
        Relies on the module-level `tokenizer`, `Config` and `EvalDataset`.
        The model is moved to `Config.device` and switched to eval mode.
    """
    dataset = EvalDataset(
        sentences=df.excerpt.values,
        tokenizer=tokenizer,
        ids=df.id.values
    )
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        # Pinned host memory only benefits copies to CUDA; on other devices
        # DataLoader just warns, so enable it only for CUDA.
        pin_memory=Config.device.type == 'cuda'
    )

    model.to(Config.device)
    model.eval()
    preds = []
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            input_ids = data['input_ids'].to(Config.device)
            attention_mask = data['attention_mask'].to(Config.device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Flatten so each row contributes its logits regardless of batch size.
            preds.extend(outputs.logits.view(-1).cpu().numpy())

    return preds

In [9]:
# Read the test split, score it with the loaded model, and write the
# (id, target) pairs as the submission file.
df_test = pd.read_csv('data/test.csv')

predicts = predict(df=df_test, model=eval_model)
df_test['target'] = predicts
df_test.loc[:, ['id', 'target']].to_csv('submission_.csv', index=False)

100%|██████████| 7/7 [00:01<00:00,  6.80it/s]
