In [146]:
# Step up one directory before importing; presumably this makes the
# project-local `model` package (imported in the next cell) resolvable
# from the notebook's working directory - confirm against the repo layout.
%cd ../

d:\CODE\Commonsense\CSQA_dev\CODE


In [151]:
from copy import deepcopy
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AlbertPreTrainedModel, AlbertTokenizer, AlbertConfig

from model.AlbertModel import AlbertModel


In [167]:
class AlbertBurger(nn.Module):
    """Two ALBERT encoders chained back to back.

    ``albert1`` receives the token ids and is configured with the first
    ``albert1_layers`` hidden layers. ``albert2`` is configured with the
    remaining ``config.num_hidden_layers - albert1_layers`` layers and with
    ``config.without_embedding = True``; it is fed the last hidden state of
    ``albert1`` through ``inputs_embeds``.
    NOTE(review): ``without_embedding`` is interpreted by the project-local
    ``model.AlbertModel``; presumably it disables the embedding layer -
    confirm there.

    The module also owns a ``scorer`` head (dropout + linear to one logit),
    which ``forward`` currently does not use.
    """

    # Ordered (substring, overrides) pairs used by ``from_pretrained``.
    # Order matters: "xxlarge" contains "xlarge", which contains "large",
    # so the most specific marker must be tested first.
    _SIZE_PRESETS = (
        ("xxlarge", {"hidden_size": 4096, "intermediate_size": 16384,
                     "num_attention_heads": 64, "num_hidden_layers": 12}),
        ("xlarge", {"hidden_size": 2048, "intermediate_size": 8192,
                    "num_attention_heads": 16, "num_hidden_layers": 24}),
        ("large", {"hidden_size": 1024, "intermediate_size": 4096,
                   "num_attention_heads": 16, "num_hidden_layers": 24}),
        ("base", {"hidden_size": 768, "intermediate_size": 3072,
                  "num_attention_heads": 12, "num_hidden_layers": 12}),
    )

    def __init__(self, config, **kwargs):
        """Build both encoder halves and the scoring head.

        Args:
            config: ALBERT configuration; ``num_hidden_layers`` and
                ``hidden_size`` are read from it. It is deep-copied, never
                mutated.
            **kwargs: must contain ``albert1_layers``, the number of hidden
                layers assigned to the first encoder.

        Raises:
            KeyError: if ``albert1_layers`` is not supplied.
            ValueError: if ``albert1_layers`` is outside
                ``[0, config.num_hidden_layers]``.
        """
        super(AlbertBurger, self).__init__()

        if 'albert1_layers' not in kwargs:
            raise KeyError(
                "AlbertBurger requires the keyword argument 'albert1_layers'")
        albert1_layers = kwargs['albert1_layers']

        total_layers = config.num_hidden_layers
        # A split beyond the total depth would give the second encoder a
        # negative layer count, so reject it up front.
        if not 0 <= albert1_layers <= total_layers:
            raise ValueError(
                "albert1_layers must be between 0 and {} (got {})".format(
                    total_layers, albert1_layers))

        self.config1 = deepcopy(config)
        self.config1.num_hidden_layers = albert1_layers
        self.config2 = deepcopy(config)
        self.config2.num_hidden_layers = total_layers - albert1_layers
        self.config2.without_embedding = True

        self.albert1 = AlbertModel(self.config1)
        self.albert2 = AlbertModel(self.config2)

        self.scorer = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(config.hidden_size, 1)
        )

        # Visits every submodule, including those inside both encoders.
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Run the token ids through both encoders in sequence.

        Args:
            input_ids: tensor of token ids; its size and device are used to
                build default masks.
            attention_mask: optional mask; defaults to all ones with the
                size of ``input_ids``.
            token_type_ids: optional segment ids; defaults to all zeros
                (``torch.long``) with the size of ``input_ids``.
            labels: accepted for interface compatibility; not used.

        Returns:
            ``last_hidden_state`` of the second encoder.

        Raises:
            ValueError: if ``input_ids`` is ``None``.
        """
        if input_ids is None:
            raise ValueError("input_ids must be provided")

        input_shape = input_ids.size()
        device = input_ids.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.albert1(input_ids, attention_mask, token_type_ids)
        hidden_state_1 = outputs.last_hidden_state
        # Forward the same mask to the second half; otherwise it would fall
        # back to its own default mask and padding would not be masked.
        # NOTE(review): assumes model.AlbertModel.forward accepts an
        # ``attention_mask`` keyword like the upstream implementation.
        outputs = self.albert2(inputs_embeds=hidden_state_1,
                               attention_mask=attention_mask)
        return outputs.last_hidden_state

    @staticmethod
    def init_weights(module):
        """Initialise ``nn.Linear`` modules: N(0, 0.02) weights, zero bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()

    @classmethod
    def from_pretrained(cls, model_path_or_name, **kwargs):
        """Create an ``AlbertBurger`` and load checkpoint weights into both halves.

        The model size is inferred from substrings of ``model_path_or_name``
        ("xxlarge", "xlarge", "large", "base"); if none matches, the
        ``AlbertConfig()`` defaults are kept.

        Args:
            model_path_or_name: checkpoint path or name, forwarded to
                ``AlbertModel.from_pretrained``.
            **kwargs: forwarded to the constructor (needs ``albert1_layers``).

        Returns:
            The assembled model.
        """
        config = AlbertConfig()
        config.without_embedding = False

        for marker, overrides in cls._SIZE_PRESETS:
            if marker in model_path_or_name:
                for field, value in overrides.items():
                    setattr(config, field, value)
                break

        model = cls(config, **kwargs)
        # ``from_pretrained`` builds a new encoder from the checkpoint; the
        # instances created in ``__init__`` only supply the class and configs.
        model.albert1 = model.albert1.from_pretrained(model_path_or_name, config=model.config1)
        model.albert2 = model.albert2.from_pretrained(model_path_or_name, config=model.config2)

        return model


In [170]:
# Load the base checkpoint and split it 6 layers / remaining layers.
# The split must be passed as `albert1_layers`; AlbertBurger.__init__ reads
# kwargs['albert1_layers'] and no other keyword.
kwargs = {'albert1_layers': 6}
model = AlbertBurger.from_pretrained(r'D:\CODE\Python\Transformers-Models\albert-base-v2', **kwargs)

Some weights of the model checkpoint at D:\CODE\Python\Transformers-Models\albert-base-v2 were not used when initializing AlbertModel: ['albert.embeddings.word_embeddings.weight', 'albert.embeddings.position_embeddings.weight', 'albert.embeddings.token_type_embeddings.weight', 'albert.embeddings.LayerNorm.weight', 'albert.embeddings.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [165]:
# Tokenise a single smoke-test sentence into PyTorch tensors.
# The resulting dict holds input_ids, token_type_ids and attention_mask,
# which are the keyword names AlbertBurger.forward expects.
tokenizer = AlbertTokenizer.from_pretrained(r'D:\CODE\Python\Transformers-Models\albert-base-v2')
feature_dict = tokenizer.batch_encode_plus(['just have a test',], return_tensors='pt')
feature_dict

{'input_ids': tensor([[   2,  114,   57,   21, 1289,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [166]:
# Smoke test: feed the tokenised sentence through both encoder halves and
# display the last hidden state returned by AlbertBurger.forward.
# NOTE(review): the execution counts show this cell (In [166]) was run before
# the class and model cells were last re-run (In [167], In [170]), so the
# output below may be stale - re-run top to bottom to confirm.
model(**feature_dict)

tensor([[[ 1.2229,  0.7311,  0.5204,  ..., -0.1884,  0.4021,  0.2687],
         [ 0.1540,  0.5329,  0.7220,  ..., -0.5679, -0.1582,  0.1250],
         [ 0.0161, -0.2441,  0.5613,  ...,  1.0106,  0.7557, -1.1507],
         [ 0.7950,  0.5904,  2.4772,  ...,  0.1583,  1.2289, -0.1720],
         [ 0.8335, -0.4584, -0.2803,  ...,  0.4956,  0.5529, -1.5881],
         [ 0.0646,  0.1395, -0.0524,  ..., -0.0824,  0.1352,  0.2109]]],
       grad_fn=<NativeLayerNormBackward>)