# Libraries

In [1]:
import re
import itertools

import torch
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.nn import *
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Config

In [2]:
# Inference settings read by the rest of the notebook:
#   dropout / model_name -> CustomModel, max_length -> transform,
#   batch_size -> DataLoader, tokenizer -> AutoTokenizer,
#   models_states -> one checkpoint path per fold (folds 0..4).
CFG = {
    'dropout': 0.2,
    'batch_size': 8,
    'max_length': 310,
    'model_name': "../input/robertalarge",
    'tokenizer': '../input/nbme-roberta-large/roberta-large/tokenizers',
    'models_states': [
        f'../input/train-robertalarge-{fold}-5-fold-nbme/model.pth'
        for fold in range(5)
    ],
}
CFG

{'dropout': 0.2,
 'batch_size': 8,
 'max_length': 310,
 'model_name': '../input/robertalarge',
 'tokenizer': '../input/nbme-roberta-large/roberta-large/tokenizers',
 'models_states': ['../input/train-robertalarge-0-5-fold-nbme/model.pth',
  '../input/train-robertalarge-1-5-fold-nbme/model.pth',
  '../input/train-robertalarge-2-5-fold-nbme/model.pth',
  '../input/train-robertalarge-3-5-fold-nbme/model.pth',
  '../input/train-robertalarge-4-5-fold-nbme/model.pth']}

# Data loading

In [3]:
base_path = "../input/nbme-score-clinical-patient-notes"

# Read the three competition tables from the same directory.
patient_notes, features, test_data = (
    pd.read_csv(f"{base_path}/{table}.csv")
    for table in ("patient_notes", "features", "test")
)

# Preprocessing

In [4]:
# Attach the note text, then the feature text, to every test row.
# Both joins are left joins so each test row is kept.
test_merge = (
    test_data
    .merge(patient_notes, on=['case_num', 'pn_num'], how='left')
    .merge(features, on=['feature_num', 'case_num'], how='left')
)

# The joins must not duplicate or drop test rows.
assert test_merge.shape[0] == test_data.shape[0]

print(f"Shape test_merge = {test_merge.shape}")
test_merge.sample(3)

Shape test_merge = (5, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,feature_text
2,00016_002,0,16,2,HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
1,00016_001,0,16,1,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
0,00016_000,0,16,0,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...


In [5]:
def process_feature_text(text):
    """Turn a hyphen-joined feature label into plain space-separated text.

    The substitutions are applied in order: 'I-year' becomes '1-year',
    '-OR-' becomes ' or ', and every remaining '-' becomes a space.
    """
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_spaces(text):
    """Replace each newline, tab and carriage return with a single space.

    The replacement is one-for-one, so the string length is unchanged.
    """
    for control_char in ('\n', '\t', '\r'):
        text = re.sub(control_char, ' ', text)
    return text

In [6]:
# Patient notes: trim the ends, flatten newline/tab/CR to spaces, lowercase.
test_merge["pn_history"] = test_merge["pn_history"].apply(
    lambda note: clean_spaces(note.strip()).lower()
)

# Feature labels: de-hyphenate, flatten newline/tab/CR to spaces, lowercase.
test_merge["feature_text"] = test_merge["feature_text"].apply(
    lambda label: clean_spaces(process_feature_text(label)).lower()
)

# Data Generator

In [7]:
class Data_gen(Dataset):
    """Dataset that tokenizes one (patient note, feature text) pair per item.

    Args:
        df: DataFrame with 'pn_history' and 'feature_text' columns.
        transforms_input: callable invoked as
            ``transforms_input(tokenizer, pn_history, feature_text)``. Its
            result must support ``["input_ids"]``, ``["attention_mask"]``,
            ``["token_type_ids"]``, ``["offset_mapping"]`` and a
            ``sequence_ids()`` method.
        tokenizer: passed through unchanged to ``transforms_input``.

    Each item is ``{'X': {...}}`` holding NumPy arrays for the five fields
    above.
    """

    def __init__(self, df, transforms_input, tokenizer):
        self.pn_historys = df['pn_history']
        self.feature_text = df['feature_text']

        self.transforms_input = transforms_input
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.pn_historys)

    def __getitem__(self, index):
        # Use positional access: samplers hand out positions 0..len-1, while
        # plain `series[index]` is a label lookup that fails (or picks another
        # row) when the frame's index is not 0..n-1, e.g. after filtering.
        tokens = self.transforms_input(self.tokenizer,
                                       self.pn_historys.iloc[index],
                                       self.feature_text.iloc[index])

        input_ids = np.array(tokens["input_ids"])
        attention_mask = np.array(tokens["attention_mask"])
        token_type_ids = np.array(tokens["token_type_ids"])

        offset_mapping = np.array(tokens['offset_mapping'])
        # Cast to a float array so the values can be batched as numbers;
        # any None entries in the sequence ids become NaN in this cast.
        sequence_ids = np.array(tokens.sequence_ids()).astype("float16")

        return {'X': {'input_ids': input_ids,
                      'attention_mask': attention_mask,
                      'token_type_ids': token_type_ids,
                      'offset_mapping': offset_mapping,
                      'sequence_ids': sequence_ids},
                }

# Score

In [8]:
def get_location_predictions(preds, offset_mapping, sequence_ids, test=False):
    """Convert per-token logits into character spans.

    For every sample, a sigmoid is applied to the logits and consecutive
    tokens with probability > 0.5 are merged into one span running from the
    first token's start offset to the last token's end offset.

    Tokens whose sequence id is None, NaN or 0 are ignored. NaN must be
    handled because sequence ids that were cast to a float array carry NaN
    where the original value was None.

    Args:
        preds: iterable of per-sample logit arrays (one value per token).
        offset_mapping: iterable of per-sample (start, end) offsets per token.
        sequence_ids: iterable of per-sample sequence ids per token.
        test: if True each sample's spans are rendered as the string
            "start end; start end"; otherwise as a list of (start, end) tuples.

    Returns:
        A list with one entry per sample (string or list, depending on `test`).
    """
    def _is_ignored(seq_id):
        # `seq_id != seq_id` is True only for NaN.
        return seq_id is None or seq_id == 0 or seq_id != seq_id

    def _as_span(start, end):
        return f"{start} {end}" if test else (start, end)

    all_predictions = []
    for sample_logits, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        probabilities = 1 / (1 + np.exp(-np.asarray(sample_logits)))
        start_idx = None
        end_idx = None
        current_preds = []
        for probability, offset, seq_id in zip(probabilities, offsets, seq_ids):
            if _is_ignored(seq_id):
                continue

            if probability > 0.5:
                if start_idx is None:
                    start_idx = offset[0]
                end_idx = offset[1]
            elif start_idx is not None:
                current_preds.append(_as_span(start_idx, end_idx))
                start_idx = None

        # A span that is still open at the last token must be emitted too.
        if start_idx is not None:
            current_preds.append(_as_span(start_idx, end_idx))

        if test:
            all_predictions.append("; ".join(current_preds))
        else:
            all_predictions.append(current_preds)

    return all_predictions

# Transform

In [9]:
def transform(tokenizer, pn_historys, feature_text):
    """Tokenize a (feature text, patient note) pair to CFG['max_length'].

    The feature text is passed as the first sequence and the note as the
    second; only the second sequence is truncated. The tokenizer's
    `sequence_ids()` result is also stored in the returned object under
    the "sequence_ids" key.
    """
    tokenizer_options = {
        'truncation': 'only_second',
        'max_length': CFG['max_length'],
        'padding': 'max_length',
        'return_offsets_mapping': True,
        'return_token_type_ids': True,
    }
    encoded = tokenizer(feature_text, pn_historys, **tokenizer_options)
    encoded["sequence_ids"] = encoded.sequence_ids()
    return encoded

# Tokenizer

In [10]:
# Load the tokenizer from the path configured in CFG['tokenizer'].
tokenizer = AutoTokenizer.from_pretrained(CFG['tokenizer'])

# DataLoader

In [11]:
# Wrap the preprocessed test rows; shuffling stays off so predictions come
# back in the same row order as the input frame.
test_set = Data_gen(df=test_merge, transforms_input=transform, tokenizer=tokenizer)
test_loader = DataLoader(
    dataset=test_set,
    batch_size=CFG['batch_size'],
    shuffle=False,
)

# Select device

In [12]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=0)

# Model

In [13]:
class CustomModel(nn.Module):
    """Pretrained encoder followed by dropout and a one-unit linear head.

    The encoder is loaded from CFG['model_name'] and the dropout rate comes
    from CFG['dropout']. `forward` returns one logit per token.
    """

    def __init__(self):
        super().__init__()

        self.bert = AutoModel.from_pretrained(CFG['model_name'])

        self.dropout = nn.Dropout(p=CFG['dropout'])

        # Size the head from the loaded backbone rather than hard-coding 1024,
        # so the class keeps working if CFG['model_name'] points at an encoder
        # with a different hidden width. Parameter names are unchanged.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the head's output with its trailing size-1 axis removed."""
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        # outputs[0]: first element of the encoder output, fed token-wise
        # through dropout and the linear head.
        token_states = outputs[0]
        logits = self.fc1(self.dropout(token_states)).squeeze(-1)

        return logits

In [14]:
# Build one CustomModel per fold checkpoint and switch it to eval mode.
models = []
for state_path in CFG["models_states"]:
    fold_model = CustomModel()
    checkpoint = torch.load(state_path, map_location=device)
    fold_model.load_state_dict(checkpoint['model_state_dict'])
    fold_model.eval()
    models.append(fold_model)

Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initi

# Get predictions

In [15]:
def inference(models, test_loader, device):
    """Run every model on every batch and average their logits.

    Args:
        models: iterable of modules called as
            ``model(input_ids=..., attention_mask=..., token_type_ids=...)``.
        test_loader: iterable of batches shaped ``{'X': {...}}`` with tensor
            entries 'input_ids', 'attention_mask', 'token_type_ids',
            'offset_mapping' and 'sequence_ids'.
        device: device the models and model inputs are moved to.

    Returns:
        Tuple ``(preds, offsets, seq_ids)`` of NumPy arrays, each the
        concatenation over batches along axis 0. ``preds`` holds the mean of
        the models' outputs.
    """
    preds = []
    offsets = []
    seq_ids = []

    # Place the models once instead of re-issuing .to(device) for every batch.
    for model in models:
        model.to(device)

    # Gradients are never used here; disabling autograd avoids keeping
    # activation graphs alive for every forward pass.
    with torch.no_grad():
        for batch in tqdm(test_loader):
            inputs = batch['X']

            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)

            # Offsets and sequence ids are only returned to the caller, so
            # they are converted directly without a round trip to `device`.
            offsets.append(inputs['offset_mapping'].cpu().numpy())
            seq_ids.append(inputs['sequence_ids'].cpu().numpy())

            buffer_preds = []
            for model in models:
                logits = model(input_ids=input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids)
                buffer_preds.append(logits.detach().cpu().numpy())

            # Ensemble by averaging the models' outputs for this batch.
            preds.append(np.mean(buffer_preds, axis=0))

    preds = np.concatenate(preds, axis=0)
    offsets = np.concatenate(offsets, axis=0)
    seq_ids = np.concatenate(seq_ids, axis=0)

    return preds, offsets, seq_ids

In [16]:
# `predict` is the (logits, offsets, sequence ids) tuple returned by inference.
predict = inference(models, test_loader, device)
mean_logits, token_offsets, token_seq_ids = predict

# Render each row's spans as a "start end; start end" string.
test_data["location"] = get_location_predictions(
    mean_logits, token_offsets, token_seq_ids, test=True
)

submission = test_data[["id", "location"]]
submission.to_csv("submission.csv", index=False)

# Read the file back to check what was written.
pd.read_csv("submission.csv").head()

100%|██████████| 1/1 [00:03<00:00,  3.68s/it]


Unnamed: 0,id,location
0,00016_000,696 724
1,00016_001,668 693
2,00016_002,203 217
3,00016_003,70 91; 176 183
4,00016_004,222 258
