In [1]:
!pip install -q gdown



In [2]:
!gdown --id 1-5ebQom0-51wMERqfPdUdvEgtAYXMvL1
!gdown --id 1-AJAoivJlmpPoaJDRpMboLdSeyddVT9h
!gdown --id 1-Cf7es8FrkSpXf99ZQzHwM7JhKLS8d0p

Downloading...
From: https://drive.google.com/uc?id=1-5ebQom0-51wMERqfPdUdvEgtAYXMvL1
To: /kaggle/working/train_feat.csv
100%|█████████████████████████████████████████| 139M/139M [00:00<00:00, 243MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-AJAoivJlmpPoaJDRpMboLdSeyddVT9h
To: /kaggle/working/val_feat.csv
100%|███████████████████████████████████████| 23.1M/23.1M [00:00<00:00, 193MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-Cf7es8FrkSpXf99ZQzHwM7JhKLS8d0p
To: /kaggle/working/test_feat.csv
100%|███████████████████████████████████████| 69.2M/69.2M [00:00<00:00, 212MB/s]


In [3]:
ls

__notebook__.ipynb  test_feat.csv  train_feat.csv  val_feat.csv


In [4]:
! pip install -q transformers
! pip install -q datasets
! pip install -q -U transformers tokenizers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.9.1 requires transformers<4.17,>=4.1, but you have transformers 4.17.0 which is incompatible.[0m


In [5]:
import io
import os
import torch
import numpy as np
from functools import partial
from tqdm.notebook import tqdm
from datasets import load_metric, load_dataset, Dataset
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          AutoModel,
                          GPT2Config,
                          EvalPrediction,
                          DataCollatorWithPadding,
                          AutoTokenizer)

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

set_seed(123)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_NAME = 'sberbank-ai/rugpt3small_based_on_gpt2'
labels_ids = {'H':0, 'M':1}
N_LABELS = len(labels_ids)

In [6]:
data = load_dataset('csv', data_files={'train':  'train_feat.csv',
                                       'validation': 'val_feat.csv'}).shuffle()

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-90a036d4abb13341/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-90a036d4abb13341/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(list(set(data['train']['Class'])))

LabelEncoder()

In [8]:
def preprocess_examples(examples, features, tokenizer):
    result = tokenizer(examples['Text'], padding=False)
    feats = []
    for col in features:
        feats.append(examples[col])
    result['features'] = np.array(feats).T
    result["labels"] = le.transform(examples['Class'])
    return result

In [9]:
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=MODEL_NAME, num_labels=N_LABELS)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME, config=model_config)
# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token
model_config.pad_token_id = model_config.eos_token_id

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
features = data['train'].column_names[4:]
tokenized_splits = data.map(partial(preprocess_examples, features=features, tokenizer=tokenizer), batched=True, remove_columns=data['train'].column_names)

  0%|          | 0/130 [00:00<?, ?ba/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

In [11]:
tokenized_splits

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'features', 'labels'],
        num_rows: 129066
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'features', 'labels'],
        num_rows: 21511
    })
})

In [12]:
train_dataset = tokenized_splits['train']#.select(range(10000))
eval_dataset = tokenized_splits['validation']#.select(range(1000))

In [13]:
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

In [14]:
import numpy as np

ACC = load_metric('accuracy')
F1 = load_metric('f1')

def compute_metrics(p: EvalPrediction):
    preds = p.predictions
    preds = np.argmax(preds, axis=1)

    acc = ACC.compute(predictions=preds, references=p.label_ids)['accuracy']
    micro_f1 = F1.compute(predictions=preds, references=p.label_ids,
                              average='micro')['f1']
    macro_f1 = F1.compute(predictions=preds, references=p.label_ids,
                              average='macro')['f1']
    return {'acc': acc, 'micro_f1': micro_f1, 'macro_f1': macro_f1}

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [15]:
from torch import nn


class GPTClassifier(nn.Module):
    def __init__(self, model_config, n_classes):
        super().__init__()
        self.n_classes = n_classes
        # model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=MODEL_NAME, num_labels=N_LABELS)
        # model_config.pad_token_id = model_config.eos_token_id
        self.gpt = AutoModel.from_pretrained(MODEL_NAME, config=model_config)
        self.drop = nn.Dropout(p=0.3)
        self.f1 = nn.Linear(self.gpt.config.hidden_size, self.gpt.config.hidden_size)
        self.out = nn.Linear(self.gpt.config.hidden_size+20, n_classes)
  
    def forward(self, input_ids, attention_mask, features, labels=None):
        last_hidden_state, _ = self.gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False)
        
        batch_size, sequence_length = input_ids.shape[:2]
        sequence_lengths = torch.ne(input_ids, self.gpt.config.pad_token_id).sum(-1) - 1
        pooled_output = self.f1(self.drop(last_hidden_state))

        pooled_output = pooled_output[torch.arange(batch_size), sequence_lengths]
        stacked_features = torch.hstack([pooled_output, features])
        logits = self.out(stacked_features)
       
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.n_classes), labels.view(-1))

        output = (logits,)
        return ((loss,) + output) if loss is not None else output

In [16]:
model = GPTClassifier(model_config, N_LABELS)


training_args = TrainingArguments(
    output_dir=f'checkpoints/gpt', #{run_base_dir}',
    overwrite_output_dir=True,
    do_train=True, do_eval=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    weight_decay=1e-4,
    num_train_epochs=5,
    warmup_ratio=0.1, save_strategy='epoch',
    save_total_limit=1, seed=42, fp16=True,
    dataloader_num_workers=1, group_by_length=True,
    report_to='none', load_best_model_at_end=True, metric_for_best_model='eval_acc',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset, #tokenized_splits['train'],
    eval_dataset=eval_dataset, #tokenized_splits['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

train_result = trainer.train()
print('train', train_result.metrics)

dev_predictions = trainer.predict(test_dataset=tokenized_splits['validation'])
print('dev', dev_predictions.metrics)

Downloading:   0%|          | 0.00/526M [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/rugpt3small_based_on_gpt2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using amp half precision backend
***** Running training *****
  Num examples = 129066
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 161335
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss,Acc,Micro F1,Macro F1
1,0.5196,1.299984,0.689368,0.689368,0.68935
2,0.4786,1.222395,0.717912,0.717912,0.716252
3,0.4459,1.418196,0.724885,0.724885,0.724323
4,0.4783,1.327435,0.72707,0.72707,0.726581
5,0.4856,1.611037,0.728929,0.728929,0.728063


***** Running Evaluation *****
  Num examples = 21511
  Batch size = 4
Saving model checkpoint to checkpoints/gpt/checkpoint-32267
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in checkpoints/gpt/checkpoint-32267/tokenizer_config.json
Special tokens file saved in checkpoints/gpt/checkpoint-32267/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 21511
  Batch size = 4
Saving model checkpoint to checkpoints/gpt/checkpoint-64534
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in checkpoints/gpt/checkpoint-64534/tokenizer_config.json
Special tokens file saved in checkpoints/gpt/checkpoint-64534/special_tokens_map.json
Deleting older checkpoint [checkpoints/gpt/checkpoint-32267] due to args.save_total_limit
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 21511
  Batch size = 4
Saving model checkpoint to checkpoints/gpt/checkp

train {'train_runtime': 15054.7091, 'train_samples_per_second': 42.866, 'train_steps_per_second': 10.717, 'total_flos': 0.0, 'train_loss': 0.6842463758846172, 'epoch': 5.0}


dev {'test_loss': 1.611037254333496, 'test_acc': 0.7289293849658315, 'test_micro_f1': 0.7289293849658314, 'test_macro_f1': 0.7280632529922944, 'test_runtime': 164.601, 'test_samples_per_second': 130.686, 'test_steps_per_second': 32.673}
