In [2]:
# import torch
from model.model import load_model, resize_WEL
from model.transfer_learning import transfer_learning
from model.adapter_config import attach_adapter

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
# Load the base model together with its tokenizer.
# Per the original author's note, the default checkpoint is llama-2-7B (confirm in model/model.py).
model, tokenizer = load_model()

In [None]:
# Resize the word embedding layer (WEL); `model` and `tokenizer` are rebound to the returned pair.
# NOTE(review): this cell consumes and overwrites its own inputs, so it is presumably meant to be
# run exactly once after a fresh load_model() call — confirm before re-running it.
model, tokenizer = resize_WEL(model, tokenizer)

In [None]:
# external_representations = ...  # load these from the link given below
# model = transfer_learning(model, external_representations, start_index, end_index)
# Use this step only if you have hidden representations for the tokens you've added.
# Download the vectors from here: https://drive.google.com/drive/folders/1bKjqa9N_AAM5PTqL4Z-5RxQHimTyCdJO?usp=sharing

In [None]:
# Attach an adapter to the model. Per the original note it takes LoRA-style settings,
# i.e. rank = alpha = 2048 and target_modules = k/q/v/o — confirm in model/adapter_config.py.
model = attach_adapter(model)

In [7]:
# Load your own datasets into `raw_dataset` here; the following cells expect this structure:

# raw_dataset
# {'train': Dataset({
#      features: ['hubert_units'],
#      num_rows: 4383
#  }),
#  'validation': Dataset({
#      features: ['hubert_units'],
#      num_rows: 342
#  })}

In [None]:
def tokenize_function(examples):
    """Tokenise the ``hubert_units`` column of a batch of examples.

    Truncation and padding are both switched off, and the tokenizer's
    output is returned as-is.
    """
    return tokenizer(
        examples['hubert_units'],
        padding=False,
        truncation=False,
    )

# Raw column(s) removed by the map call so that only the tokenizer outputs remain.
column_names = ['hubert_units']

# Tokenise each split of `raw_dataset`, passing rows to tokenize_function in batches.
tokenized_datasets = {
    split: dataset.map(tokenize_function, batched=True, remove_columns=column_names)
    for split, dataset in raw_dataset.items()
}

# No `labels` column is added at this stage; group_texts creates it from input_ids.

tokenized_datasets

In [None]:
from itertools import chain

#training a huge dataset as follows:

# Chunk length in tokens (alternative: block_size = tokenizer.model_max_length).
block_size = 4096


def group_texts(examples):
    """Pack a batch of token sequences into chunks of ``block_size`` items.

    ``examples`` maps each column name to a list of sequences. Every column is
    flattened into one long list and cut into consecutive ``block_size`` slices.

    * When the flattened length is at least ``block_size`` it is first rounded
      down to a multiple of ``block_size``, so the trailing remainder is dropped
      (8500 tokens -> 8192 -> two chunks of 4096; with a block size of 4, the
      list [1..10] would give [1, 2, 3, 4] and [5, 6, 7, 8], and [9, 10] is lost).
    * When it is shorter than ``block_size`` nothing is dropped and the column
      becomes a single short chunk.

    The length is measured on the first column only and applied to all columns.

    Returns the chunked columns plus a ``labels`` entry holding a (shallow) copy
    of the ``input_ids`` chunk list.
    """
    flattened = {}
    for column in examples.keys():
        # [[1, 2, 3], [4, 5], [6, 7, 8, 9]] -> [1, 2, 3, 4, 5, 6, 7, 8, 9]
        flattened[column] = list(chain.from_iterable(examples[column]))

    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    if usable_length >= block_size:
        # Round down to a whole number of blocks.
        usable_length -= usable_length % block_size

    chunk_starts = range(0, usable_length, block_size)
    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start:start + block_size] for start in chunk_starts]

    result["labels"] = result["input_ids"].copy()
    return result

# Pack every tokenised split into fixed-size blocks with group_texts, one batch at a time.
lm_datasets = {
    split: tokenized.map(group_texts, batched=True)
    for split, tokenized in tokenized_datasets.items()
}

lm_datasets

In [None]:
from transformers import Trainer, TrainingArguments

#all the arguments according to your use case:

# Placeholder: replace with your own run name. It is used as the run name, as the
# sub-directory for checkpoints/logs, and as the directory the tokenizer is saved to.
model_version = "__give your_name__"
model_dir = f"{model_version}"
# Single root for this run's checkpoints and logs; change it to suit your setup.
run_dir = f"speech-text/trainings/{model_dir}"

training_args = TrainingArguments(
    run_name=model_version,
    logging_dir=f"{run_dir}/logs",
    output_dir=run_dir,
    logging_steps=1,
    per_device_train_batch_size=1,   # one sequence per device per training step
    per_device_eval_batch_size=1,    # one sequence per device per evaluation step
    gradient_accumulation_steps=1,
    evaluation_strategy="steps",
    eval_steps=500,                  # evaluate every 500 steps
    learning_rate=2e-5,
    num_train_epochs=3,
    lr_scheduler_type="constant",
    save_strategy="epoch",
    # save_steps=n,  # Save checkpoints every n steps
    fp16=True,
)
# connect your wandb if needed!

# Build the trainer on the packed train / validation splits.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['validation'],
)
# Save the tokenizer files to a directory named after the run (relative to the working directory).
tokenizer.save_pretrained(model_version)

In [None]:
import os

# Request expandable segments from PyTorch's CUDA caching allocator.
# NOTE(review): this variable is presumably only read when CUDA is first initialised, so if an
# earlier cell already placed the model on the GPU it may have no effect here — confirm, and
# consider setting it before anything touches CUDA.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import wandb

# Start the Weights & Biases run before training begins.
wandb.init(
    name=model_version,          # run name
    project="Speech-text-LLM",   # W&B project this run is stored under
)

trainer.train()