In [1]:
import random
import os
import numpy as np
import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(42)

In [17]:
import torch
from torch import nn
from transformers import AutoModel, AutoConfig
from pooling import *

class CustomModel(nn.Module):
    def __init__(self, model, pooling_type, hidden_size=None, config_path=None, pretrained=False):
        super().__init__()
        self.pooling_type = pooling_type
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            self.config.hidden_size = 384
        else:
            self.config = torch.load(config_path)
            
        if pretrained:
            self.backbone = AutoModel.from_pretrained(model, config=self.config)
        else:
            self.backbone = AutoModel.from_config(self.config)
        
        if pooling_type == 'MeanPooling':
            self.pool = MeanPooling()
        elif pooling_type == 'WeightedLayerPooling':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
        elif pooling_type == 'LSTMPooling':
            self.pool =  LSTMPooling(self.config.num_hidden_layers,
                                       self.config.hidden_size,
                                       hidden_size,
                                       0.1,
                                       is_lstm=True
                           )
        else:
            raise ValueError('Unknown pooling type')
        
        
        if pooling_type == 'GRUPooling':
            self.fc = nn.Linear(hidden_size, 6)
        elif pooling_type == 'LSTMPooling':
            self.fc = nn.Linear(hidden_size, 6)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 6)
        
    def feature(self, inputs):
        outputs = self.backbone(**inputs)
        
        last_hidden_states = outputs[0]
        
        if self.pooling_type == 'MeanPooling':
            feature = self.pool(last_hidden_states, inputs['attention_mask'])
        elif self.pooling_type == 'WeightedLayerPooling':
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        elif self.pooling_type in ['GRUPooling', 'LSTMPooling']:
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        else:
            raise ValueError('Unknown pooling type')
        
        return outputs, feature

    def forward(self, inputs):
        original_outputs, feature = self.feature(inputs)
        output = self.fc(feature)
        return feature, original_outputs, output

In [18]:
m1 = CustomModel("microsoft/deberta-v3-large", "MeanPooling", hidden_size=None, config_path="../../../input/model23/config.pth", pretrained=False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

In [19]:
m1.config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [59]:
class MultipleNegativesRankingLoss(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, embeddings_a, embeddings_b, labels=None):
        """
        Compute similarity between a and b.
        Labels have the index of the row number at each row. 
        This indicates that a_i and b_j have high similarity 
        when i==j and low similarity when i!=j.
        """

        similarity_scores = (
            cos_sim(embeddings_a, embeddings_b) * 20.0
        )  # Not too sure why to scale it by 20: https://github.com/UKPLab/sentence-transformers/blob/b86eec31cf0a102ad786ba1ff31bfeb4998d3ca5/sentence_transformers/losses/MultipleNegativesRankingLoss.py#L57

        print(similarity_scores)
        
        labels = torch.tensor(
            range(len(similarity_scores)),
            dtype=torch.long,
            device=similarity_scores.device,
        )  # Example a[i] should match with b[i]

        print(labels)
        
        return self.loss_function(similarity_scores, labels)

In [60]:
loss = MultipleNegativesRankingLoss()

In [56]:
loss.loss_function

CrossEntropyLoss()

In [57]:
def cos_sim(a, b):
    # From https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py#L31
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))

In [20]:
m1.fc = nn.Linear(in_features=384, out_features=1, bias=True)

In [21]:
m1.config.hidden_size

384

In [9]:
a = torch.load("../../../input/pseudo_label/out_features_m1.pt")

In [10]:
a.shape

torch.Size([615170, 384])

In [61]:
loss(f.view(-1), a[100])

tensor([[0.3803]])
tensor([0])


tensor(0.)

In [67]:
f.view(-1)
ff = torch.concat([f, f])

In [69]:
loss(ff, a[:2])

tensor([[ 0.5083, -0.2483],
        [ 0.5083, -0.2483]])
tensor([0, 1])


tensor(0.7631)

In [70]:
b = torch.load("../../../input/pseudo_label/preds_m1.pt")

In [86]:
(torch.sigmoid(b) > 0.1).sum()

tensor(36527)

In [82]:
c = torch.load("../../../input/pseudo_label/preds_m2.pt")
(torch.sigmoid(c) > 0.3).sum()

tensor(62415)

In [88]:
torch.logical_or(b, c).to(torch.float)

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]])

In [89]:
d = torch.load("../../../input/pseudo_label/out_features_m1.pt")

In [90]:
d.shape

torch.Size([615170, 384])

In [91]:
d[0]

tensor([-3.9392e-01, -3.6166e-01, -3.7874e-01,  1.0420e-01,  2.7154e-01,
        -2.9612e-01, -5.7274e-01,  5.7165e-01,  7.1899e-01, -5.6558e-01,
        -4.0695e-01,  9.0580e-01,  5.7599e-01,  4.0355e-01,  3.2290e-01,
        -9.8855e-02, -6.9699e-01, -3.8754e-01,  2.5195e-01,  4.9964e-01,
         7.5651e-02,  6.5373e-02,  2.4497e-01, -5.8589e-01, -9.2474e-01,
        -6.5252e-01, -7.8153e-02, -5.0431e-01,  3.4163e-01,  3.3922e-01,
         2.5057e-01,  6.3353e-01,  3.5714e-01, -2.9381e-01,  2.7550e-01,
        -3.6158e-02,  1.9560e-01,  6.8414e-02, -6.3210e-01,  2.2098e-01,
         3.1501e-01,  7.5646e-02,  3.8383e-01,  8.2166e-01, -2.8584e-01,
        -2.9226e-01, -7.1935e-01, -4.1117e-01,  4.8832e-01,  9.3107e-02,
         1.8719e-01,  9.3699e-02,  1.3574e-01,  3.4655e-01,  3.3137e-01,
        -2.9463e-01,  1.7813e-01, -3.2290e-01, -3.1104e-01, -7.0018e-01,
        -6.6927e-01, -5.0627e-01, -4.9122e-01, -1.4243e-01,  3.5948e-01,
         4.7512e-01,  2.0355e-01,  4.2848e-01,  2.6

In [75]:
import pandas as pd
df = pd.read_csv("../../../input/prep_cleaned_train_context_5fold.csv", lineterminator="\n")

In [76]:
df.columns

Index(['topics_ids', 'content_ids', 'channel', 'topic_title',
       'topic_description', 'topic_parent_title', 'topic_parent_description',
       'topic_child_title', 'topic_child_description', 'topic_category',
       'topic_language', 'content_title', 'content_description',
       'content_text', 'content_kind', 'content_language', 'target',
       'topic_fold', 'content_fold', 'text'],
      dtype='object')

In [78]:
df.target.value_counts()

0    555239
1     59931
Name: target, dtype: int64

In [22]:
x = ({'input_ids': torch.Tensor([[    0, 48962, 14602,   959, 32316,  1065,   294, 21290,   268, 16734,
         17991, 14602,   959, 32316,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]]).to(torch.int32), 'attention_mask': torch.Tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]).to(torch.int32)},
 torch.Tensor([0.]))

In [23]:
x

({'input_ids': tensor([[    0, 48962, 14602,   959, 32316,  1065,   294, 21290,   268, 16734,
           17991, 14602,   959, 32316,     2,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               

In [24]:
_ = m1.eval()

with torch.no_grad():
    f, o, z = m1(x[0])

In [None]:
f.shape, o.last_hidden_state.shape, z.shape

(torch.Size([1, 384]), torch.Size([1, 172, 384]), torch.Size([1, 1]))

In [13]:
z

tensor([[0.2489]])

In [8]:
[i for i in dir(o) if "__" not in i]

['attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'last_hidden_state',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [9]:
o[0]

tensor([[[ 0.0628, -0.6051, -0.4633,  ..., -1.6161,  0.3332, -0.7899],
         [-0.1748, -0.9188, -0.5154,  ..., -1.0968,  0.7967, -0.9400],
         [-0.0848, -1.0519, -0.4905,  ..., -1.3074,  0.2581, -0.5332],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])

In [10]:
feature = m1.pool(o[0], x[0]['attention_mask'])

In [11]:
feature.shape

torch.Size([1, 1024])

In [31]:
loss(feature)

TypeError: forward() missing 2 required positional arguments: 'embeddings_a' and 'embeddings_b'

In [57]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig
from torch.utils.checkpoint import checkpoint
from utils import get_backbone_config

def get_last_hidden_state(backbone_outputs):
    last_hidden_state = backbone_outputs[0]
    return last_hidden_state


def get_all_hidden_states(backbone_outputs):
    all_hidden_states = torch.stack(backbone_outputs[1])
    return all_hidden_states


def get_input_ids(inputs):
    return inputs['input_ids']


def get_attention_mask(inputs):
    return inputs['attention_mask']

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        self.output_dim = 1024

    def forward(self, inputs, backbone_outputs):  # x, o
        attention_mask = get_attention_mask(inputs)
        last_hidden_state = get_last_hidden_state(backbone_outputs)

        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class custom_model(nn.Module):
    def __init__(self, tokenizer, backbone_type):
        super().__init__()
        self.tokenizer = tokenizer
        
        if True:
            self.backbone_config = get_backbone_config(backbone_type)
            self.backbone = AutoModel.from_pretrained(backbone_type, config=self.backbone_config)
        else:
            self.backbone = AutoModel.from_config(self.backbone_config)

        # What is this?
        self.backbone.resize_token_embeddings(len(self.tokenizer))
        
        a = len(self.tokenizer) == self.backbone_config.vocab_size
        print(f"len tokenizer vs backbone config vocab size: {a}")
        
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.pool.output_dim, 6)

#         self._init_weights(self.fc)
        
    def forward(self, inputs):
        outputs = self.backbone(**inputs)
        
        feature = self.pool(inputs, outputs)
        output = self.fc(feature)
        return output

In [9]:
from transformers import AutoTokenizer

def get_additional_special_tokens():
    special_tokens_replacement = {
        '\n': '[BR]',
        'Generic_School': '[GENERIC_SCHOOL]',
        'Generic_school': '[GENERIC_SCHOOL]',
        'SCHOOL_NAME': '[SCHOOL_NAME]',
        'STUDENT_NAME': '[STUDENT_NAME]',
        'Generic_Name': '[GENERIC_NAME]',
        'Genric_Name': '[GENERIC_NAME]',
        'Generic_City': '[GENERIC_CITY]',
        'LOCATION_NAME': '[LOCATION_NAME]',
        'HOTEL_NAME': '[HOTEL_NAME]',
        'LANGUAGE_NAME': '[LANGUAGE_NAME]',
        'PROPER_NAME': '[PROPER_NAME]',
        'OTHER_NAME': '[OTHER_NAME]',
        'PROEPR_NAME': '[PROPER_NAME]',
        'RESTAURANT_NAME': '[RESTAURANT_NAME]',
        'STORE_NAME': '[STORE_NAME]',
        'TEACHER_NAME': '[TEACHER_NAME]',
    }
    return special_tokens_replacement

special_tokens_replacement = get_additional_special_tokens()
all_special_tokens = list(special_tokens_replacement.values())

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large",
                                      use_fast=True,
                                      additional_special_tokens=all_special_tokens)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
special_tokens_replacement = {
    '\n': '[BR]',
    'Generic_School': '[GENERIC_SCHOOL]',
    'Generic_school': '[GENERIC_SCHOOL]',
    'SCHOOL_NAME': '[SCHOOL_NAME]',
    'STUDENT_NAME': '[STUDENT_NAME]',
    'Generic_Name': '[GENERIC_NAME]',
    'Genric_Name': '[GENERIC_NAME]',
    'Generic_City': '[GENERIC_CITY]',
    'LOCATION_NAME': '[LOCATION_NAME]',
    'HOTEL_NAME': '[HOTEL_NAME]',
    'LANGUAGE_NAME': '[LANGUAGE_NAME]',
    'PROPER_NAME': '[PROPER_NAME]',
    'OTHER_NAME': '[OTHER_NAME]',
    'PROEPR_NAME': '[PROPER_NAME]',
    'RESTAURANT_NAME': '[RESTAURANT_NAME]',
    'STORE_NAME': '[STORE_NAME]',
    'TEACHER_NAME': '[TEACHER_NAME]',
}

In [3]:
len((list(special_tokens_replacement.values())))

17

In [14]:
tokenizer

DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-large', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[BR]', '[GENERIC_SCHOOL]', '[GENERIC_SCHOOL]', '[SCHOOL_NAME]', '[STUDENT_NAME]', '[GENERIC_NAME]', '[GENERIC_NAME]', '[GENERIC_CITY]', '[LOCATION_NAME]', '[HOTEL_NAME]', '[LANGUAGE_NAME]', '[PROPER_NAME]', '[OTHER_NAME]', '[PROPER_NAME]', '[RESTAURANT_NAME]', '[STORE_NAME]', '[TEACHER_NAME]']})

In [58]:
m2 = custom_model(tokenizer, "microsoft/deberta-v3-large")

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


len tokenizer vs backbone config vocab size: True


In [56]:
m2.backbone_config.vocab_size

128015

In [32]:
t = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large",
                                      use_fast=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
t

DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-large', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [34]:
len(tokenizer)

128015

In [35]:
len(t)

128001

In [23]:
m2.load_state_dict(m1.state_dict())

<All keys matched successfully>

In [24]:
_ = m2.eval()
with torch.no_grad():
    z = m2(x[0])

In [25]:
z

tensor([[ 0.0079,  0.0859, -0.3627, -1.0221,  0.6245,  0.0832]])

tensor([[ 0.0079,  0.0859, -0.3627, -1.0221,  0.6245,  0.0832]])


In [29]:
feature

tensor([[ 0.0575, -1.1382, -0.3632,  ..., -1.5897,  0.6908, -0.3918]])

m1.pool(o, x[0]['attention_mask'])

In [38]:
feature_ = m2.pool(x[0], o)

In [41]:
(feature_ == feature).sum()

tensor(1024)

In [59]:
config = AutoConfig.from_pretrained("microsoft/deberta-v3-large", output_hidden_states=True)

In [60]:
config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [68]:
m1.config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128015
}

In [69]:
m3 = AutoModel.from_pretrained("microsoft/deberta-v3-large", config=config)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
t_ = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large", config=config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [80]:
t_

DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-large', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [81]:
len(t_)

128001

In [6]:
from transformers import AutoModel, AutoTokenizer
m4 = AutoModel.from_pretrained("microsoft/deberta-v3-large")
t__ = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have 

In [83]:
m4.config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [84]:
t__

DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-large', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [85]:
len(t__)

128001

In [8]:
t__.additional_special_tokens

[]

In [10]:
tokenizer.additional_special_tokens

['[BR]',
 '[GENERIC_SCHOOL]',
 '[GENERIC_SCHOOL]',
 '[SCHOOL_NAME]',
 '[STUDENT_NAME]',
 '[GENERIC_NAME]',
 '[GENERIC_NAME]',
 '[GENERIC_CITY]',
 '[LOCATION_NAME]',
 '[HOTEL_NAME]',
 '[LANGUAGE_NAME]',
 '[PROPER_NAME]',
 '[OTHER_NAME]',
 '[PROPER_NAME]',
 '[RESTAURANT_NAME]',
 '[STORE_NAME]',
 '[TEACHER_NAME]']

In [13]:
from transformers import T5Tokenizer

tmp = T5Tokenizer.from_pretrained("t5-small", extra_ids=0, additional_special_tokens=["new_token_1"])


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
tmp

T5Tokenizer(name_or_path='t5-small', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['new_token_1']})

In [15]:
tmp.additional_special_tokens

['new_token_1']

In [16]:
tmp.added_tokens_encoder

{'new_token_1': 32000}

In [21]:
tokenizer = tmp
text = "this is a text with new_token_1, new_token_2 and new_token_3 "

print(tokenizer.additional_special_tokens)
print(tokenizer.added_tokens_encoder)
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
print("***")

tokenizer.add_special_tokens({"additional_special_tokens": ["new_token_2"]})
print(tokenizer.additional_special_tokens)
print(tokenizer.added_tokens_encoder)
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
print("***")

tokenizer.add_special_tokens({"additional_special_tokens": ["new_token_3"]})
print(tokenizer.additional_special_tokens)
print(tokenizer.added_tokens_encoder)
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))

['new_token_1']
{'new_token_1': 32000}
['▁this', '▁is', '▁', 'a', '▁text', '▁with', 'new_token_1', '▁', ',', '▁new', '_', 'to', 'ken', '_', '2', '▁and', '▁new', '_', 'to', 'ken', '_', '3', '</s>']
***
['new_token_2']
{'new_token_1': 32000, 'new_token_2': 32001}
['▁this', '▁is', '▁', 'a', '▁text', '▁with', 'new_token_1', '▁', ',', 'new_token_2', '▁and', '▁new', '_', 'to', 'ken', '_', '3', '</s>']
***
['new_token_3']
{'new_token_1': 32000, 'new_token_2': 32001, 'new_token_3': 32002}
['▁this', '▁is', '▁', 'a', '▁text', '▁with', 'new_token_1', '▁', ',', 'new_token_2', '▁and', 'new_token_3', '</s>']


In [22]:
len(tokenizer)

32003

In [23]:
tokenizer.vocab_size

32000

In [24]:
tokenizer.additional_special_tokens

['new_token_3']

In [25]:
tokenizer.added_tokens_encoder

{'new_token_1': 32000, 'new_token_2': 32001, 'new_token_3': 32002}