In [None]:
!pip install -q transformers
!pip install transformers datasets

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
# from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import *
from torch.nn.functional import cross_entropy, mse_loss
from sklearn.metrics import ndcg_score
from sklearn.metrics import mean_squared_error

from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup


# Data load

In [4]:
!unzip dataset.zip

In [5]:
df = pd.read_json("ranking_train.jsonl", lines=True)

## Препроцессинг не требуется для предобученных языковых моделей вроде БЕРТа, т.к. он использует всю информацию в предложении, даже пунктуцию и стоп-слова, с различных перспектив, применяя механизм self-attention

## Потому удаление стоп-слов и пунктцации просто удалит контекст, который БЕРТ мог бы использовать для получения лучших результатов

## Разделяем для БЕРТа специальным символом текст и коммент к нему. Переводим в DataFrame

In [6]:
## we need the imput after tokenizer looking like this: [CLS] TEXT [SEP] COMMENT [SEP]
text_comment = []# text with label 4 comment
labels = []
for i in range(len(df)):
  text_comment.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][0]['text']))
  labels.append(0) # super
  text_comment.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][1]['text']))
  labels.append(1) # good
  text_comment.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][2]['text']))
  labels.append(2) # average
  text_comment.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][3]['text']))
  labels.append(3) # poor
  text_comment.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][4]['text']))
  labels.append(4) # bad

In [7]:
text_comment[10], labels[10]

('How Costco Became the Anti-Wal-Mart [SEP] I really hate it when people falsely claim that the fiduciary responsibilities of a public company includes maximizing profits at every opportunity. Thinking long-term and win-win is the truly responsible CEOs job. Costco is the Google of discount warehouses. Wal-mart is Microsoft.',
 0)

In [8]:
data = pd.DataFrame(list(zip(text_comment, labels)),
               columns =['text_with_comment', 'label'])
# data = data.sample(frac=0.5).reset_index(drop=True)

## train/test split

In [9]:
df_train, df_valid = train_test_split(data, test_size = 0.1, random_state=42, stratify=data.label.values)

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

# main classes and methods

## Первый вектор БЕРТа отвечает за классификацию предложений в задаче текстового моделирования image.png, затем мы этот вектор размерности 768 отправляем в полносвязный слой с одним выходным нейроном

## будем учить дистиллированного БЕРТа, т.к. это значительно увеличивает скорость обучения, т.к. он имеет на 40% меньше параметров, при этом не сильно проигрывает в перформансе

In [10]:
class VK_BERT(DistilBertPreTrainedModel):
    def __init__(self, config):
        config.num_labels = 1
        super(VK_BERT, self).__init__(config)
        self.bert = DistilBertForSequenceClassification(config)
        self.init_weights()

    def forward(self, ids, mask):
        outputs = self.bert(ids, mask)
        logits = outputs[0]
        return logits
        

In [11]:
class DATALoader:
    def __init__(self, data, target):
        self.data = data
        self.target = target
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') # destilbert doesn't have token_type_ids
        self.truncation=True
        self.max_length=512
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, item):
        data = str(self.data[item])
        data = " ".join(data.split())

        inputs = self.tokenizer.encode_plus(data, truncation=self.truncation, padding='longest')
        ids = inputs["input_ids"]
        mask = inputs['attention_mask']

        padding_length = self.max_length - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.long)
        }

In [12]:
def loss_fn(output, targets):
#     return cross_entropy(output, targets)
    return mse_loss(output, targets) 

In [13]:
def train_func(data_loader, model, optimizer, device, scheduler): # can add scheduler
    model.to(device)
    model.train()
    i = 999
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        targets = d["targets"]
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        
        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask = mask,
        )
        output = output.squeeze(1)
        
        loss = loss_fn(output, targets)
        if i % 1000 == 0:
          print('\n running loss: ', loss)
        i += 1
        loss.backward()
        
        optimizer.step()
        scheduler.step()

In [14]:
def eval_func(data_loader, model, device):
    model.eval()
    
    fin_targets = []
    fin_output = []
    
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)


            output = model(
                ids=ids,
                mask = mask,
            )
            output = output.squeeze(1)
        
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_output.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
            
        return fin_output, fin_targets

In [15]:
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [None]:
model = VK_BERT.from_pretrained("distilbert-base-uncased")

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = [
    "bias", 
    "LayerNorm,bias",
    "LayerNorm.weight",
           ]
optimizer_parameters = [
    {'params': [p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],
               'weight_decay':0.001},
    {'params': [p for n,p in param_optimizer if any(nd in n for nd in no_decay)],
               'weight_decay':0.0}
]

num_train_steps = int(len(df_train)/ 8*10)

optimizer = AdamW(optimizer_parameters, lr=3e-5)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps

)

In [18]:
print(num_train_steps)

495601


In [19]:
train_dataset = DATALoader(
        data=df_train.text_with_comment.values,
        target=df_train.label.values,
)

train_data_loader = torch.utils.data.DataLoader(
        train_dataset, 
        batch_size=32,
        num_workers=4,
)

val_dataset = DATALoader(
        data=df_valid.text_with_comment.values,
        target=df_valid.label.values,
)

val_data_loader = torch.utils.data.DataLoader(
        val_dataset, 
        batch_size=16,
        num_workers=1,
)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_tok

# На train считаем mse, идея кажется хорошей, т.к. не требует отправлять по 5 инпутов, но позволяет понять, насколько близки наши text+comment к лейблам, не привязывая к количеству и конкретным классам. На eval считаем ndcg

In [None]:
best_score = 0
best_epoch = 0
for epoch in range(5):
    train_func(data_loader=train_data_loader, model=model, optimizer=optimizer, scheduler=scheduler, device=device)
    outputs, targets = eval_func(data_loader=train_data_loader, model=model, device=device)
    outputs = np.expand_dims(np.array(outputs), axis=0)
    targets = np.expand_dims(np.array(targets), axis=0)
    ndcg = ndcg_score(targets, outputs)
    print(f"ndcg: {ndcg}")

    if ndcg > best_score:
        best_ndcg = ndcg
        best_epoch = epoch
        torch.save(model.state_dict(), "best_model.bin")    

In [23]:
best_ndcg

0.9703641575225223

In [24]:
best_epoch

4

# заполнения score у ranking_test

In [None]:
df = pd.read_json("ranking_test.jsonl", lines=True) 
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
com = []
#1*5 + 4
for i in range(len(df)):
  com.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][0]['text']))
  com.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][1]['text']))
  com.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][2]['text']))
  com.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][3]['text']))
  com.append("{} [SEP] {}".format(df['text'][i], df['comments'][i][4]['text']))

In [None]:
model.eval()
com_out = []
for i in tqdm(range(len(df))):
  com_out = []
  for j in range(5):
    com_encoded = tokenizer.encode_plus(com[i*5 + j], max_length = 512, pad_to_max_length=True, return_tensors="pt") # .encode_plus(data, max_length = self.max_length, pad_to_max_length=True)
    ids = com_encoded["input_ids"]
    mask = com_encoded["attention_mask"]
    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    output = model(ids, mask).squeeze(1)
    com_out.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
  order = (np.argsort(com_out)) 

  for score, idx in enumerate(np.argsort(com_out)):  # сортируем 5-ки комментариев
    df['comments'][i][idx]['score'] = score

In [None]:
output_path = "results/ranking_test.jsonl"

In [None]:
with open(output_path, "w") as f:
    f.write(df.to_json(orient='records', lines=True))