<a href="https://colab.research.google.com/github/bartoszkozakiewicz/DeepLearning/blob/main/trainingFunc_for_autocomplete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset

In [None]:
!pip install transformers>=4.33.1
!pip install   torch==2.0.1
!pip install  torchaudio==2.0.2
!pip install  torchvision==0.15.2
!pip install  accelerate==0.23.0
!pip install   bitsandbytes==0.41.1
!pip install   scipy==1.11.2

In [3]:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

from torch.utils.data import Dataset, DataLoader

class AutofillDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_id = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]
        label = self.labels[idx]

        return {
            "input_ids": input_id,
            "attention_mask": attention_mask,
            "labels": label
        }

def trainingProcedure(autofill_dataset,eval_dataset,model,batch_size=4):
  #Prepare dependencies for training
  training_args = TrainingArguments(
    per_device_train_batch_size=1,
    output_dir="./results",
    evaluation_strategy="steps",
    # eval_steps=500,
    save_total_limit=2,
    num_train_epochs=3,
    # fp16=True,
    # per_device_train_batch_size=8,
  )

  #Training process
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=autofill_dataset, #autofill_dataset
    eval_dataset=eval_dataset
  )

  trainer.train()

In [4]:
import transformers
from transformers import Trainer, TrainingArguments

def trainModel(model_class,tokenizer_class,pretrained_weights,data,evaluation_data=None,max_length=512,batch_size=4):
  #Prepare tokenizer
  tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

  #Prepare model
  model = model_class.from_pretrained(pretrained_weights,
                                    trust_remote_code=True,
                                    torch_dtype="auto",
                                    load_in_8bit=True)
  model.resize_token_embeddings(len(tokenizer))

  #Prepare encodings
  X_encodings = tokenizer(data,  max_length=max_length,return_tensors="pt",truncation=True,padding=True)
  print(X_encodings["input_ids"].shape)
  # Encoded data
  input_ids = X_encodings["input_ids"]
  attention_mask = X_encodings["attention_mask"]
  labels = X_encodings["input_ids"]

  #Evaluation data
  if evaluation_data is not None:
      eval_encodings = tokenizer(
          evaluation_data,
          max_length=max_length,
          return_tensors="pt",
          truncation=True,
          padding=True,
      )
      eval_input_ids = eval_encodings["input_ids"]
      eval_attention_mask = eval_encodings["attention_mask"]
      eval_labels = eval_encodings["input_ids"]

      eval_dataset = AutofillDataset(eval_input_ids, eval_attention_mask, eval_labels)
  else:
        eval_dataset = None


  #Prepare dataset for training
  autofill_dataset = AutofillDataset(input_ids, attention_mask, labels) #CLASS

  dataloader = DataLoader(autofill_dataset, batch_size=batch_size, shuffle=True)

  trainingProcedure(autofill_dataset,eval_dataset,model,batch_size) #FUNCTION


**PROCES TRENOWANIA**

- DANE

In [7]:
import transformers
import json

with open('text_chunks.json', 'r') as f:
    chunks = json.load(f)
len(chunks)

#Split data
train_data = chunks[:int(0.8*len(chunks))]
eval_data = chunks[int(0.8*len(chunks)):]
len(train_data),len(eval_data),eval_data[:10]


##
model_class = transformers.DistilBertForMaskedLM
tokenizer_class = transformers.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

- WYWOŁANIE

In [None]:
trainModel(model_class,tokenizer_class,pretrained_weights,train_data,eval_data)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 30522. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


torch.Size([104627, 257])




Step,Training Loss,Validation Loss
500,2.1055,0.017339
1000,0.0054,0.001997
1500,0.0021,0.000971




**TEST**

In [None]:
from transformers import pipeline

unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)
unmasker("Hello I'm a [MASK] model.")

