In [2]:
import transformers
import torch

In [53]:
import torch

class PreTrainedModelClsPooling(torch.nn.Module):
    """Reduce a backbone output to the vector at sequence position 0.

    Position 0 is where BERT-style tokenizers place the [CLS] token, so the
    result is one summary embedding per sequence in the batch.
    """

    def forward(self, input_embeddings):
        """Return the first-token embedding of every batch element.

        Args:
            input_embeddings: Indexable backbone output whose element 0 is
                the last hidden state (presumably shaped
                (batch, seq_len, hidden) -- confirm against the backbone).

        Returns:
            ``input_embeddings[0][:, 0]``: the slice at index 0 of the
            second axis, for all entries of the first axis.
        """
        last_hidden_state, cls_position = input_embeddings[0], 0
        return last_hidden_state[:, cls_position]

class SentenceThemeTypeDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Rows are addressed by position (0 .. len-1), so a DataFrame whose index
    is not a clean 0..n-1 range (e.g. after filtering or a train/test split)
    works without needing ``reset_index`` first.

    Args:
        df: pandas DataFrame with a "sentences" column plus the label column
            selected by ``onehot``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with "input_ids" and "attention_mask" tensors whose first axis
            has size 1 when called with ``return_tensors="pt"``.
        max_len: Length every sentence is padded/truncated to.
        onehot: If True, labels are read from "one_hot_labeling"; otherwise
            from "cat_label_index".
    """

    def __init__(self, df, tokenizer, max_len, onehot=True):
        self.data_df = df
        self.data_sentences = self.data_df["sentences"]
        # Integer class indices vs. one-hot vectors; either way the network
        # still has one output node per class.
        label_column = "one_hot_labeling" if onehot else "cat_label_index"
        self.data_labels = self.data_df[label_column]
        self.tokenizer = tokenizer
        self.len = len(self.data_df)
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the DataFrame."""
        return self.len

    def __getitem__(self, index):
        """Tokenize the sentence at position ``index`` and return model inputs.

        Returns:
            dict with keys "input_ids" and "attention_mask" (1-D tensors, the
            batch axis added by ``return_tensors="pt"`` is stripped) and
            "data_label" (float tensor built from the row's label).
        """
        # .iloc is positional: plain Series[index] is a label lookup and
        # fails on a non-default index.
        sentence = self.data_sentences.iloc[index]
        encoded = self.tokenizer.encode_plus(
            sentence,
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",  # pad_to_max_length is deprecated
            return_attention_mask=True,
            truncation=True,
            add_special_tokens=True,
        )
        sentence_label = self.data_labels.iloc[index]

        return {
            # [0] drops the leading size-1 batch axis; the DataLoader adds
            # the real batch axis when collating.
            "input_ids": encoded["input_ids"][0],
            "attention_mask": encoded["attention_mask"][0].to(torch.long),
            # Labels are always emitted as float.
            # NOTE(review): losses that take integer class indices (e.g.
            # CrossEntropyLoss with index targets) expect torch.long --
            # confirm this suits the onehot=False case.
            "data_label": torch.tensor(sentence_label, dtype=torch.float),
        }
       

class SentenceThemeTypeModel(torch.nn.Module):
    """Sentence classifier: a pretrained encoder followed by an MLP head.

    Two head variants exist, selected by ``cls_only``:

    * ``cls_only=True``  -- only the position-0 ([CLS]) embedding feeds the
      head, so inputs may have any sequence length.
    * ``cls_only=False`` -- the whole last hidden state is flattened, so the
      first linear layer has ``max_len * hidden_size`` inputs and every
      input must be padded/truncated to exactly ``max_len`` tokens.

    Args:
        external_pretrained_model: Encoder module. It is called with the
            keyword arguments given to ``forward`` and must return an
            indexable output whose element 0 is the last hidden state.
        max_len: Token length used to size the flatten head.
        num_labels: Number of output classes (size of the logits).
        cls_only: Choose the CLS head instead of the flatten head.
        torch_nn_sequential_kwargs_possible: If True the encoder is stored as
            element 0 of the ``Sequential`` stack; otherwise the stack holds
            only the head. ``forward`` handles both layouts.
        load_mod: If True, skip all construction and return an empty
            placeholder module (intended to be replaced by a model restored
            with ``torch.load``). Such a placeholder cannot run ``forward``.

    Raises:
        ValueError: If ``load_mod`` is False and any of
            ``external_pretrained_model``, ``max_len`` or ``num_labels`` is
            missing.
    """

    def __init__(self, external_pretrained_model=None, max_len=0, num_labels=0, cls_only=False, torch_nn_sequential_kwargs_possible=False, load_mod=False):
        super().__init__()
        if load_mod:
            return
        if external_pretrained_model is None or max_len == 0 or num_labels == 0:
            raise ValueError(
                "external_pretrained_model, max_len and num_labels are all "
                "required unless load_mod=True"
            )

        self.cls_only = cls_only
        self.torch_nn_sequential_kwargs_possible = torch_nn_sequential_kwargs_possible

        # Width of the encoder's hidden state; falls back to 768 (the value
        # previously hard-coded) when the encoder exposes no config.
        encoder_config = getattr(external_pretrained_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", None) or 768

        # nn.Flatten() keeps axis 0 (batch) and merges axes 1..-1.
        self.flatten = torch.nn.Flatten()
        self.pre_trained_model_cls_pooling = PreTrainedModelClsPooling()

        self.external_pretrained_model = external_pretrained_model
        self.post_post_model_layer1 = torch.nn.Linear(hidden_size, hidden_size)
        self.post_post_model_layer2 = torch.nn.Linear(hidden_size, hidden_size)
        self.post_post_model_layer3 = torch.nn.Linear(hidden_size, num_labels)
        self.dropout = torch.nn.Dropout(0.2)

        # Layers shared by both head variants, after the first projection.
        shared_tail = [
            torch.nn.ReLU(),
            self.post_post_model_layer1,
            torch.nn.ReLU(),
            self.dropout,
            self.post_post_model_layer2,
            torch.nn.ReLU(),
            self.post_post_model_layer3,
        ]
        # Optionally keep the encoder as element 0 of the stack; attribute
        # names and stack indices match earlier versions of this class so
        # previously saved models/state dicts still line up.
        encoder_prefix = (
            [self.external_pretrained_model]
            if self.torch_nn_sequential_kwargs_possible
            else []
        )

        if self.cls_only:
            self.cls_post_model_layer = torch.nn.Linear(hidden_size, hidden_size)
            self.cls_NN_stack_sequence = torch.nn.Sequential(
                *encoder_prefix,
                self.pre_trained_model_cls_pooling,
                self.cls_post_model_layer,
                *shared_tail,
            )
        else:
            self.flatten_post_model_layer = torch.nn.Linear(max_len * hidden_size, hidden_size)
            self.flatten_NN_stack_sequence = torch.nn.Sequential(
                *encoder_prefix,
                self.flatten,
                self.flatten_post_model_layer,
                *shared_tail,
            )

    def forward(self, **model_input_args):
        """Run the encoder and the selected head.

        Args:
            **model_input_args: Keyword arguments forwarded unchanged to the
                encoder (e.g. ``input_ids``, ``attention_mask``). Do not pass
                the label here.

        Returns:
            The output of the head's final linear layer (unnormalised
            per-class scores).
        """
        stack = self.cls_NN_stack_sequence if self.cls_only else self.flatten_NN_stack_sequence

        # nn.Sequential.forward takes a single positional input, so the
        # encoder is always invoked directly with the keyword arguments and
        # only the remaining layers run through the Sequential.
        if self.torch_nn_sequential_kwargs_possible:
            encoder, head = stack[0], stack[1:]
        else:
            encoder, head = self.external_pretrained_model, stack

        encoder_output = encoder(**model_input_args)
        if not self.cls_only:
            # Flatten needs the hidden-state tensor itself, not the encoder's
            # output container; the CLS pooling layer does this indexing
            # internally.
            encoder_output = encoder_output[0]
        return head(encoder_output)
        

In [5]:

# Load the pretrained DistilBERT encoder and its matching tokenizer by
# checkpoint name; the encoder is passed to SentenceThemeTypeModel in later cells.
distil_model = transformers.DistilBertModel.from_pretrained('distilbert/distilbert-base-uncased')
distil_tokeniser = transformers.DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

In [7]:
# Constructor signature for reference:
# def __init__(self, external_pretrained_model=None, max_len=0, num_labels=0, cls_only=False, torch_nn_sequential_kwargs_possible=False, load_mod=False)

# Positional arguments: encoder, max_len=256, num_labels=8 (cls_only stays False).
the_model = SentenceThemeTypeModel(distil_model, 256, 8)

In [63]:
# Failed attempt, kept for reference (see the OSError recorded below).
the_model = torch.load("./model_FF_trained/pytorch_model.bin")
## AutoTokenizer.from_pretrained() rejects this path: per the error it wants a local folder or a Hub repo id.
## Only the vocabulary file was saved, with no other tokenizer config; the next cell loads it via DistilBertTokenizer instead.
the_tokenizer = transformers.AutoTokenizer.from_pretrained("./model_FF_trained/tokenizer_vocab.bin")

OSError: Incorrect path_or_model_id: './model_FF_trained/tokenizer_vocab.bin'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [16]:
# Restore the whole pickled module. weights_only=False is stated explicitly
# because the file holds a full nn.Module rather than a state dict, and the
# weights-only loader (the default from PyTorch 2.6) rejects such files.
# SECURITY: unpickling executes arbitrary code -- only load files you trust.
the_model = torch.load("./model_FF_trained/pytorch_model.bin", weights_only=False)
# Only the vocabulary file was saved, so it is loaded through the concrete
# DistilBertTokenizer class.
the_tokenizer = transformers.DistilBertTokenizer.from_pretrained("./model_FF_trained/tokenizer_vocab.bin")



In [17]:
## save_vocabulary() saves only the tokenizer's vocabulary file (the list of tokens).
## To save the entire tokenizer (vocabulary plus its configuration), use save_pretrained().
## save_pretrained() writes a whole directory, which is what from_pretrained() expects to be pointed at.

In [19]:
# Smoke test of the reloaded model on one sentence; the cell output is the
# raw tensor returned by forward (one row, one score per class).
the_model(**the_tokenizer("Hi there!!", return_tensors="pt"))

tensor([[-0.9864, -2.5324, -1.8298, -0.9164, -1.4597, -2.4646,  7.6528, -1.4592]],
       grad_fn=<AddmmBackward0>)

In [21]:
# old_dict maps class name -> label index (names listed in index order);
# new_dict is the inverse, index -> class name, used to decode predictions.
old_dict = {
    label: position
    for position, label in enumerate(
        ['positive', 'negative', 'neutral', 'chaotic', 'gibberish', 'true_random_gibberish', 'uwu', 'leet']
    )
}
new_dict = dict(zip(old_dict.values(), old_dict.keys()))

In [44]:
# Decode a prediction for the empty string: take the index of the largest
# score along dim=1 and look up its class name in new_dict.
new_dict[torch.max(the_model(**the_tokenizer("", return_tensors="pt")), dim=1).indices.item()]

'true_random_gibberish'

In [46]:
# Raw scores from the reloaded model for a second sentence; used as the
# reference output for the comparisons in the following cells.
the_model(**the_tokenizer("Radiating warmth and kindness", return_tensors="pt"))

tensor([[10.8399, -0.0347,  0.0516, -3.3037, -1.9892, -2.7872, -2.4891, -2.1345]],
       grad_fn=<AddmmBackward0>)

In [47]:
# The freshly constructed instance (note max_len=2) is discarded on the next
# line: torch.load returns a new object, so the constructor arguments have no
# effect on the restored model.
the_model_2 = SentenceThemeTypeModel(distil_model, 2, 8)
# weights_only=False is stated explicitly because the file holds a full
# pickled nn.Module, which the weights-only loader (the default from
# PyTorch 2.6) rejects.
# SECURITY: unpickling executes arbitrary code -- only load files you trust.
the_model_2 = torch.load("./model_FF_trained/pytorch_model.bin", weights_only=False)

In [48]:
# Same sentence through the_model_2; the recorded output matches the_model's,
# since both were restored from the same file.
the_model_2(**the_tokenizer("Radiating warmth and kindness", return_tensors="pt"))

tensor([[10.8399, -0.0347,  0.0516, -3.3037, -1.9892, -2.7872, -2.4891, -2.1345]],
       grad_fn=<AddmmBackward0>)

In [51]:
the_model_3 = SentenceThemeTypeModel(distil_model, 2, 8, cls_only=True)

In [52]:
# the_model_3 was built with cls_only=True and is not overwritten by a load,
# so its head layers are freshly initialised (not the saved ones).
the_model_3(**the_tokenizer("Radiating warmth and kindness", return_tensors="pt"))

tensor([[-0.0449, -0.0159, -0.0039, -0.0230, -0.0023,  0.0185,  0.0290,  0.0188]],
       grad_fn=<AddmmBackward0>)

In [None]:
## The models above were created with the original class definition, whose constructor required all arguments.
## Below, the class has the added `load_mod` keyword argument: `SentenceThemeTypeModel(load_mod=True)` builds an empty placeholder without needing the other parameters, since the instance is about to be overwritten by the loaded model anyway.

In [58]:
# load_mod=True builds an empty placeholder with no layers; it is replaced
# immediately by the object that torch.load returns.
the_model_4 = SentenceThemeTypeModel(load_mod=True)
# weights_only=False is stated explicitly because the file holds a full
# pickled nn.Module, which the weights-only loader (the default from
# PyTorch 2.6) rejects.
# SECURITY: unpickling executes arbitrary code -- only load files you trust.
the_model_4 = torch.load("./model_FF_trained/pytorch_model.bin", weights_only=False)

In [59]:
# Same sentence through the_model_4; the recorded output again matches the
# other instances restored from the same file.
the_model_4(**the_tokenizer("Radiating warmth and kindness", return_tensors="pt"))

tensor([[10.8399, -0.0347,  0.0516, -3.3037, -1.9892, -2.7872, -2.4891, -2.1345]],
       grad_fn=<AddmmBackward0>)