# Эксперимент 1

Хотим выучить трансформер на классификацию с CLS токена посередине внутри сильной модели с полным количеством слоев.

Данные отсюда: https://www.kaggle.com/c/nlp-2021-hw1/overview

In [None]:
! pip install transformers --quiet

In [None]:
import re
from warnings import filterwarnings

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm_notebook as tqdm

from torch.utils.data import DataLoader, Dataset

from transformers import AdamW, AutoModel, AutoTokenizer,get_linear_schedule_with_warmup

filterwarnings('ignore')

### Downloading data

### Balancing data

In [None]:
train_df = pd.read_csv('train.tsv.zip', sep='\t')
test_df = pd.read_csv('test.tsv.zip', sep='\t')
valid_df = pd.read_csv('valid.tsv', sep='\t')

train_positive_class_df = train_df[train_df['label'] == 1]
train_negative_class_df = train_df[train_df['label'] == 0]

len(train_positive_class_df), len(train_negative_class_df)

(773, 7411)

In [None]:
num_positive_examples = len(train_positive_class_df)

# For training set, we take the same amount of positive and negative examples
train_negative_class_df = train_negative_class_df.sample(num_positive_examples)
# Concatenating positive and negative examples and shuffling the training set
train_df = pd.concat((train_positive_class_df, train_negative_class_df)).sample(frac=1)

### Preprocessing

Preprocessing is adopted from:

https://github.com/akutuzov/webvectors/blob/master/preprocessing/modular_processing/unify.py

We unify letters to decrease the size of dictionary. We also unify and remove all punctuation.

In [None]:
def list_replace(search, replacement, text):
    '''
    Replaces all symbols of text which are present
    in the search string with the replacement string.
    '''
    search = [el for el in search if el in text]
    for c in search:
        text = text.replace(c, replacement)
    return text

def clean_text(text):

    text = list_replace(
        '\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019',
         '\u0022',
          text
    )

    text = list_replace(
        '\u2012\u2013\u2014\u2015\u203E\u0305\u00AF',
         '\u2003\u002D\u002D\u2003',
          text
    )

    text = list_replace(
        '\u2010\u2011',
         '\u002D',
          text
    )

    text = list_replace(
        '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
        '\u2002',
        text
    )

    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    text = list_replace(
        '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
        '.',
         text
    )

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    text = list_replace('\u00C4', 'A', text)
    text = list_replace('\u00E4', 'a', text)
    text = list_replace('\u00CB', 'E', text)
    text = list_replace('\u00EB', 'e', text)
    text = list_replace('\u1E26', 'H', text)
    text = list_replace('\u1E27', 'h', text)
    text = list_replace('\u00CF', 'I', text)
    text = list_replace('\u00EF', 'i', text)
    text = list_replace('\u00D6', 'O', text)
    text = list_replace('\u00F6', 'o', text)
    text = list_replace('\u00DC', 'U', text)
    text = list_replace('\u00FC', 'u', text)
    text = list_replace('\u0178', 'Y', text)
    text = list_replace('\u00FF', 'y', text)
    text = list_replace('\u00DF', 's', text)
    text = list_replace('\u1E9E', 'S', text)
    # Removing punctuation
    text = list_replace(',.[]{}()=+-−*&^%$#@!~;:§/\|\?\'\n', ' ', text)
    # Replacing all numbers with masks
    text = list_replace('0123456789', 'x', text)

    currencies = list(
            '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
    )

    alphabet = list(
        '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
    )

    allowed = set(currencies + alphabet)

    cleaned_text = [sym for sym in text if sym in allowed]
    cleaned_text = ''.join(cleaned_text)

    return cleaned_text

In [None]:
# Extracting tweet texts
train_tweet_texts = train_df.tweet.values
test_tweet_texts = test_df.tweet.values
valid_tweet_texts = valid_df.tweet.values

# Extracting tweet labels
train_labels = train_df['label'].values
valid_labels = valid_df['label'].values

# Preprocessing training tweets
cleaned_train_texts = []
for tweet_text in train_tweet_texts:
    cleaned_text = clean_text(tweet_text).lower()
    split_cleaned_text = cleaned_text.split()
    cleaned_train_texts.append(' '.join(split_cleaned_text))
    
# Preprocessing test tweets
cleaned_test_texts = []
for tweet_text in test_tweet_texts:
    cleaned_text = clean_text(tweet_text)
    cleaned_test_texts.append(' '.join(cleaned_text.split()))
    
# Preprocessing validation tweets
cleaned_valid_texts = []
for tweet_text in valid_tweet_texts:
    cleaned_text = clean_text(tweet_text)
    cleaned_valid_texts.append(' '.join(cleaned_text.split()))

train_df['clean_text'] = cleaned_train_texts
valid_df['clean_text'] = cleaned_valid_texts
test_df['clean_text'] = cleaned_test_texts

### Model and dataset

In [None]:
PRE_TRAINED_MODEL_NAME = 'cimm-kzn/enrudr-bert'

In [None]:
class TwitterClassifier(nn.Module):
    def __init__(self, n_classes, use_kd=False):
        super(TwitterClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.use_kd = use_kd

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_hidden_state_cls = outputs[0][:, 0, :]     
        if self.use_kd:
            middle_hidden_state_cls = outputs[2][5][:, 0, :]
            return self.out(self.drop(last_hidden_state_cls)), self.out(self.drop(middle_hidden_state_cls))
        return self.out(self.drop(last_hidden_state_cls))

class TwitterDataset(Dataset):
    def __init__(self, ids, tweets, targets, tokenizer, max_len):
        self.ids = ids
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        tweet = str(self.tweets[item])
        target = self.targets[item]
        id = self.ids[item]
        
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
            'id': id,
            'tweet_text': tweet,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

train_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in cleaned_train_texts]
valid_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in cleaned_valid_texts]
test_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in cleaned_test_texts]

train_max_len = max(map(len, train_tokenized))
valid_max_len = max(map(len, valid_tokenized))
test_max_len = max(map(len, valid_tokenized))

print(train_max_len)
print(valid_max_len)
print(test_max_len)

110
124
124


In [None]:
import torch
torch.manual_seed(0)
import random
random.seed(0)
import numpy as np
np.random.seed(0)

def create_data_loader(df, tokenizer, batch_size, max_len):
    if 'label' in df:
        labels = df.label.values
    else:
        labels = [0] * len(df)
    ds = TwitterDataset(
        ids = df.tweet_id.values,
        tweets= df.clean_text.values,
        targets=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
    )

BATCH_SIZE = 32

train_data_loader = create_data_loader(train_df, tokenizer, BATCH_SIZE, train_max_len)
valid_data_loader = create_data_loader(valid_df, tokenizer, BATCH_SIZE, valid_max_len)
test_data_loader = create_data_loader(test_df, tokenizer, BATCH_SIZE, test_max_len)

### Training

In [None]:
n_classes = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TwitterClassifier(n_classes, True)
model = model.to(device)

Some weights of the model checkpoint at cimm-kzn/enrudr-bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
device

device(type='cuda')

In [None]:
EPOCHS = 1
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
from tqdm.notebook import tqdm
import torch.nn.functional as F

def loss_fn_kd(outputs, labels, teacher_outputs):
    alpha = 0.3
    T = 1
    KD_loss = nn.KLDivLoss()(F.log_softmax(outputs/T, dim=1),
                             F.softmax(teacher_outputs/T, dim=1)) * (alpha * T * T) + \
              F.cross_entropy(outputs, labels) * (1. - alpha)

    return KD_loss


def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader, desc='TRAIN'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

def train_epoch_kd(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader, desc='TRAIN'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs_full, outputs_middle = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs_middle, dim=1)
        loss_full = loss_fn(outputs_full, targets)
        kd_loss = loss_fn_kd(outputs_middle, targets, outputs_full)
        loss = 1e-1 * loss_full + kd_loss
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)



In [None]:
@torch.no_grad()
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader, desc='EVALUATION'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

@torch.no_grad()
def eval_model_kd(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader, desc='EVALUATION'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs_full, outputs_middle = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs_middle, dim=1)
        loss_full = loss_fn(outputs_full, targets)
        kd_loss = loss_fn_kd(outputs_middle, targets, outputs_full)
        loss = loss_full + kd_loss
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

In [1]:


# for epoch in range(1):
#     print(f'Epoch {epoch + 1:2d}/{EPOCHS:2d}')
#     print('-' * 10)

#     train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_df))
#     valid_acc, valid_loss = eval_model(model, valid_data_loader, loss_fn, device, len(valid_df))
    
#     print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
#     print(f'Valid loss {valid_loss:.4f} accuracy {valid_acc:.4f}')

In [None]:
for epoch in range(1):
    print(f'Epoch {epoch + 1:2d}/{EPOCHS:2d}')
    print('-' * 10)

    train_acc, train_loss = train_epoch_kd(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_df))
    valid_acc, valid_loss = eval_model_kd(model, valid_data_loader, loss_fn, device, len(valid_df))
    
    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    print(f'Valid loss {valid_loss:.4f} accuracy {valid_acc:.4f}')

Epoch  1/ 1
----------


TRAIN:   0%|          | 0/49 [00:00<?, ?it/s]

EVALUATION:   0%|          | 0/108 [00:00<?, ?it/s]

Train loss 0.4779 accuracy 0.6740
Valid loss 0.8758 accuracy 0.7416


In [None]:
eval_model_kd(model, valid_data_loader, loss_fn, device, len(valid_df))

EVALUATION:   0%|          | 0/108 [00:00<?, ?it/s]

(tensor(0.7728, device='cuda:0', dtype=torch.float64), 0.8758427268928952)

# Эксперимент 2

Линейные слои, внутри которых учим самые влиятельные веса тоже быть сильными.

In [None]:
import torch
from torch import nn
import math

In [None]:
class LinearPrunning(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))
        print(self.weight.shape)
        self.bias = torch.nn.Parameter(torch.randn(out_features))
        
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.bias, -bound, bound)
        self.scale = 0.5
       
        
    def forward(self, x, scale=None, return_output=False):
        output_full = x @ self.weight.t() + self.bias
        if return_output:
            return output_full
        if scale is not None:
            mangitudes = (self.weight ** 2).sum(0)
            threshhold = torch.kthvalue(mangitudes, int(len(mangitudes) * scale)).values.item()
            strong_idxs = torch.zeros(len(mangitudes))
            strong_idxs[torch.arange(len(mangitudes))[mangitudes > threshhold]] = 1
            output = x @ (self.weight @ torch.diag(strong_idxs)).t() + self.bias
            return output, output_full
        else:
            return output_full, output_full

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import random

class Net(nn.Module):
    def __init__(self, hidden_size=40):
        super(Net, self).__init__()
        # here you construct weights for layers
        self.fc1 = LinearPrunning(784, hidden_size)
        self.fc2 = LinearPrunning(hidden_size, hidden_size)
        self.fc3 = LinearPrunning(hidden_size, 10)
        
    def forward(self, x, use_full=True, scale_all=None):
        if use_full and scale_all is None:
            x = F.relu(self.fc1(x, return_output=True))
            x = F.relu(self.fc2(x, return_output=True))
            x = self.fc3(x, return_output=True)
            # check log_softmax signature
            return F.log_softmax(x, dim=-1)
        else:
            if scale_all is None:
                l_idx = random.randint(1, 2)
                scales = [0.5] * 3
                x, x_full = self.fc1(x, scales[0])
                if scales[0] is not None:
                    prom_fake = x
                    prom_true = x_full.detach()
                x = F.relu(x)
                x, x_full = self.fc2(x, scales[1])
                if scales[1] is not None:
                    prom_fake = x
                    prom_true = x_full.detach()
                x = F.relu(x)
                x, x_full = self.fc3(x, scales[2])
                if scales[2] is not None:
                    prom_fake = x
                    prom_true = x_full.detach()
                return F.log_softmax(x, dim=-1), prom_fake, prom_true
            else:
                x = F.relu(self.fc1(x, return_output=True))
                x = F.relu(self.fc2(x, 0.5)[0])
                x, _ = self.fc3(x, 0.5)
                # check log_softmax signature
                return x

In [None]:
from tqdm.notebook import tqdm
import torch.nn.functional as F

def loss_fn_kd(outputs, labels, teacher_outputs, params):
    """
    Compute the knowledge-distillation (KD) loss given outputs, labels.
    "Hyperparameters": temperature and alpha
    NOTE: the KL Divergence for PyTorch comparing the softmaxs of teacher
    and student expects the input tensor to be log probabilities! See Issue #2
    """
    alpha = params['alpha']
    T = params['temperature']
    KD_loss = nn.KLDivLoss()(F.log_softmax(outputs/T, dim=1),
                             F.softmax(teacher_outputs/T, dim=1)) * (alpha * T * T) + \
              F.cross_entropy(outputs, labels) * (1. - alpha)

    return KD_loss



def train(model, optimizer, dataloader):
    loss_log = []
    model.train()
    for data, target in dataloader:
        # data preparation
        data = data.flatten(1)

        optimizer.zero_grad()
        sub_output, prom_fake, prom_true = model(data, False)
        # loss_2 = ((prom_fake - prom_true) ** 2).mean()

        full_output = model(data, use_full=True)
        kd_loss = loss_fn_kd(sub_output, target, full_output,{"alpha" : 0.95, "temperature" : 6})
        teacher_loss = F.cross_entropy(full_output, target)
        
        
        total_loss = kd_loss + 1e-2 * teacher_loss
            # compute gradients
        l1_lambda = 1e-5
        l1_norm = sum(p.abs().sum() for p in model.parameters())

        total_loss = total_loss + l1_lambda * l1_norm
        total_loss.backward()
        loss_log.append([teacher_loss, kd_loss, total_loss])
        optimizer.step()
    return loss_log



def train_normal(model, optimizer, dataloader):
    loss_log = []
    model.train()
    for data, target in dataloader:
        # data preparation
        data = data.flatten(1)
        

        optimizer.zero_grad()

        output = model(data, use_full=True)
        loss = F.nll_loss(output, target)
        total_loss = loss
            # compute gradients
        l1_lambda = 1e-5
        l1_norm = sum(p.abs().sum() for p in model.parameters())

        total_loss = total_loss + l1_lambda * l1_norm
        total_loss.backward()
        # make a step
        optimizer.step()
    return loss_log

# TODO: написать функцию для валидации по X_val, y_val
# # hint: optimizer не нужен
# def test(model):
#     loss_log = []
#     model.eval()
#     <your code>
    
#     return loss_log

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])

# Download and load the training data
trainset = datasets.FashionMNIST('./data', download=True, train=True, transform=transform)
train_loader = DataLoader(trainset, batch_size=32, shuffle=True)

# Download and load the test data
validationset = datasets.FashionMNIST('./data', download=True, train=False, transform=transform)
val_loader = DataLoader(validationset, batch_size=32, shuffle=True)

# plt.figure(figsize=[6, 6])
# for i in range(4):
#     plt.subplot(2, 2, i + 1)
#     plt.title("Label: %i" % y_train[i])
#     plt.imshow(X_train[i].reshape([28, 28]), cmap='gray');

In [None]:
train_log = []
val_log = []

model = Net(10)
opt = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(60):
    print(epoch)
    train_loss = train(model, opt, train_loader)
    train_log.extend(train_loss)
    s_scale = 0
    s_full = 0
    c = 0
    for x, y in val_loader:
        x = x.flatten(1)
        s_full +=(model(x).argmax(1) == y).sum()
        s_scale +=(model(x, scale_all=True).argmax(1) == y).sum()
        c += y.shape[0]
    
    print("FULL ACCURACY: ", s_full / c)
    print("SCALED ACCURACY: ", s_scale / c)