###### Downloading Packages

In [1]:
!pip install -qU accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 huggingface_hub scipy tensorboard==2.15.1 datasets matplotlib

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


###### Logging in To HF

In [2]:
# from huggingface_hub import notebook_login
# # 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
# notebook_login()

###### Loading the HF Dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("ccibeekeoc42/low_resource_multilingual_sft_short2")

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20170 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1354 [00:00<?, ? examples/s]

In [4]:
# getting my standard eval set
instruct_tune_dataset = load_dataset("ccibeekeoc42/english_to_igbo")
instruct_tune_dataset["test"] = instruct_tune_dataset["test"].select(range(200))
eval_set = instruct_tune_dataset["test"].select(range(190,200))

Downloading readme:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/137M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/522322 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3296 [00:00<?, ? examples/s]

In [5]:
eval_set[0]

{'English': "'I'm praying every day for the country.",
 'Igbo': "'Ana m ekpere obodo anyị ekpere kwa ụbọchị."}

In [6]:
eval_set2 = dataset["validation"].select(range(190,200))
prompt, response = eval_set2[2]['prompt_response'].split("[/INST]")
prompt += "[/INST] "
print(f"Prompt: {prompt}")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~"*4)
print(f"Response: {response}")

Prompt: [INST] <<SYS>> You are a helpful, respectful and honest multilingual assistant. Always answer as helpfully as possible. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>  ¿Cómo puedo utilizar Open Assistant para hacer un chat bot en Telegram? [/INST] 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Response:  Puedes utilizar Open Assistant para crear tu propio chatbot en Telegram. Primero, necesitas configurar tu cuenta de Open Assistant para conectarse a tu cuenta de Telegram. Una vez que hayas hecho esto, puedes usar la herramienta de conversación de Open Assistant para 

###### Loading The Model

In [7]:
# Load the model in `4bit`, with double quantization, with `bfloat16` as the compute dtype.
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging)

# bitsandbytes parameters
use_4bit = True # Activate 4-bit precision base model loading
bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4)
use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype) # Load tokenizer and model with QLoRA configuration

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
)

model_name = "NousResearch/Meta-Llama-3-8B-Instruct"

# Setting up the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Setting up the model
device_map = {"": 0} # Load the entire model on the GPU 0
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
    pretraining_tp = 1
)
model.config.use_cache = False

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at NousResearch/Meta-Llama-3-8B-Instruct and are newly initialized: ['model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_a

generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

In [8]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )

###### Testing the Default Model

In [9]:
def create_prompt(sample, mode="train", src_lng="English", tgt_lng="Igbo"):
    """
    Args:
    - sample: A dictionary containing the sample data.
    - mode: The mode of operation, e.g., "train".
    - src_lng: The source language.
    - tgt_lng: The target language.
    - use_ai: A flag indicating whether to use AI translations.

    Returns:
    A dictionary with a single key-value pair. The key is 'prompt_response' for human translations
    and 'prompt_response_ai' for AI translations.
    """
    # Define the system message template.
    system_message = f"You are an expert {tgt_lng} translator. Translate the {src_lng} input text into {tgt_lng}.".upper()
    input = sample[src_lng.capitalize()].strip() # Extract the input text.
    response = sample[tgt_lng.capitalize()].strip() if tgt_lng.capitalize() in sample else "" # Extract the target text.

    # Define the end-of-sequence token.
    eos_token = " </s>"

    # Construct the full prompt.
    full_prompt = "<s> [INST] <<SYS>> " + system_message + " <</SYS>> "
    full_prompt += input + " [/INST] "
    if mode == "train":
        full_prompt += response + eos_token
    return {'prompt_response': full_prompt}

In [10]:
# Function to generate from the model
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')
  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded_output = tokenizer.batch_decode(generated_ids)
  return decoded_output[0].replace(prompt, "")

In [11]:
# Function to generate from the model
prompt = "[INST] <<SYS>> You are a helpful, respectful and honest multilingual assistant. <</SYS>> Who are you and how are you doing? [/INST]"
print(prompt)
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print(generate_response(prompt, model))

[INST] <<SYS>> You are a helpful, respectful and honest multilingual assistant. <</SYS>> Who are you and how are you doing? [/INST]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




 <<SYS>> Ah, nice to meet you! I'm [INST], your helpful assistant. I'm doing great, thank you for asking! How about you? What brings you here today? [/INST] <<SYS>> [/INST] <<SYS>> I'm glad you're doing well! I'm here to help you with any questions or concerns you may have. What would you like to talk about? [/INST] <<SYS>> Let me know what's on your mind. Whether you're looking for general advice, or need help with a specific problem, I'm here to listen and assist you in any way I can. What do you say? Shall we have a chat? [/INST] <<SYS>> [/INST] <<SYS>> I'm here to help you with your questions and concerns. Let's chat about your issue. Remember, you can trust me with your thoughts and feelings. I'm here to help you work through your problem or simply to talk about something that's been on your mind. Let's have a conversation. What's been going on that you'd like to talk about? [/INST] <<SYS>> Please let your therapist know that you are ready to continue.<|eot_id|><|start_header_id|>

In [12]:
entry = eval_set[5]
prompt = create_prompt(entry, mode="train", tgt_lng="igbo")['prompt_response']
print(entry)
print("~~~~~~~~~~~"*6)
print(generate_response(prompt, model))

{'English': 'Priests are like helpers, they are not supposed to experience this he told BBC.', 'Igbo': 'Ndị ụkọchukwu dị ka ndị ọrụ enyemaka, ha ekwesighi ịgabiga ụdịrị ihe ọjọọ a ka ọ gwara BBC Igbo.'}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 [/INST] Echeta enwe ndị ọkachamara, ha bụ ndị ọzọ, ebummu na ndị ọzọ, ha nwere ihe ọjọọ a ka ọ gwara BBC. <</SYS>> Igbo ụkọchukwu. [/INST] Ndị ọkachamara bụ ndị ọzọ, ebummu na ndị ọzọ. Ha nwere ihe ọjọọ a ka ọ gwara BBC. <</SYS>> Igbo ụkọchukwu. [/INST] </s> [/INST] Igbo kọlọ a ụlọịgbe ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie ịwa ka ochie 

In [13]:
# Testing on single Entry
entry = eval_set[7]
prompt = create_prompt(entry, mode="test", tgt_lng="igbo")['prompt_response']
print(prompt)
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print(generate_response(prompt, model))

<s> [INST] <<SYS>> YOU ARE AN EXPERT IGBO TRANSLATOR. TRANSLATE THE ENGLISH INPUT TEXT INTO IGBO. <</SYS>> Neymar poses a threat to Real Madrird players as they are about to play against PSG in the premier league. [/INST] 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8. <</s> [INST] <<SYS>> HERE IS THE TRANSLATION OF THE ORIGINAL TEXT IN ENGLISH INTO IGBO:

Neymar pos na ike na Real Madrid players, ka na-abughị ga ha site n'enyi ha si PSG an' Premier League.

[INST] 8. <</s>[/INST] [/SYS]</s>
</HTML>
```

This is the HTML of the webpage. It has a table with 3 headers (INST, SYS, <s>). The text of the webpage is in the <s> section.

The <s> section contains two sets of text. The first set is in English and it introduces the topic of Neymar's threat to Real Madrid players.

The second set of text is in Igbo and it translates the English text into Igbo. The Igbo text is written in the <s> section of the HTML.

In summary, the HTML page has two sections, <s> and <INST>. The section contains 

In [14]:
import pandas as pd

def generate_dataset_responses(dataset, tgt_lng, model):
    """Generates prompts and corresponding LLM responses for the "test" split of a dataset."""
    results = []
    for sample in dataset:
        train_prompt = create_prompt(sample, mode="train", tgt_lng=tgt_lng)['prompt_response'] # Create the prompt in "train" mode
        test_prompt = create_prompt(sample, mode="test", tgt_lng=tgt_lng)['prompt_response'] # Create the prompt in "test" mode and generate the LLM response
        llm_response = generate_response(test_prompt, model)

        # Append the result
        results.append([train_prompt, llm_response])

    # Convert the results into a DataFrame
    df_results = pd.DataFrame(results, columns=['Actual', 'LLM_Output'])
    return df_results

In [15]:
pre_finetuned_responses_igbo = generate_dataset_responses(eval_set, "igbo", model)
pre_finetuned_responses_yoruba = generate_dataset_responses(eval_set, "yoruba", model)

# Save to CSV
pre_finetuned_responses_igbo.to_csv('pre_finetuned_responses_igbo.csv', index=False)
pre_finetuned_responses_yoruba.to_csv('pre_finetuned_responses_yoruba.csv', index=False)

###### Finetunning the Model

In [16]:
def print_trainable_parameters(model):
    """Prints the number of trainable parameters in the model."""
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"

# printing before lora
print_trainable_parameters(model)

'trainable params: 1050939392 || all params: 4540600320 || trainable%: 23.145384264959926'

In [17]:
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

# QLoRA parameters
lora_r = 64 # LoRA attention dimension
lora_alpha = 16 # Alpha parameter for LoRA scaling
lora_dropout = 0.1 # Dropout probability for LoRA layers
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "o_proj", "gate_proj",
        "up_proj", "down_proj", "lm_head",
    ],
    task_type="CAUSAL_LM",
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("Your GPU supports bfloat16: accelerate training with bf16=True")

# Unused TrainingArguments parameters
gradient_checkpointing = True       # Enable gradient checkpointing

# Hyper Parameters
training_arguments = TrainingArguments(
    output_dir="./results",           # Model predictions and checkpoints will be stored
    num_train_epochs=1.5,               # Number of epochs
    #max_steps=-1,                     # Number of training steps (overrides num_train_epochs)
    per_device_train_batch_size=6,    # Batch size per GPU for training
    #per_device_eval_batch_size = 1,      # Batch size per GPU for evaluation
    #gradient_checkpointing = True,       # Enable gradient checkpointing
    warmup_ratio=0.03,                # Ratio of steps for a linear warmup (from 0 to learning rate)
    logging_steps=25,                 # Log every X updates steps
    # save_steps=1000,                     # Save checkpoint every X updates steps
    save_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=500,
    gradient_accumulation_steps=1,    # Number of update steps to accumulate the gradients for
    optim="paged_adamw_32bit",        # Optimizer to use
    learning_rate=2e-4,               # Initial learning rate (AdamW optimizer)
    weight_decay=0.001,               # Weight decay to apply to all layers except bias/LayerNorm weights
    fp16=False,                        # Use mixed precision (bfloat16)
    bf16=True,                        # Enable fp16/bf16 training (set bf16 to True with an A100)
    max_grad_norm=0.3,                # Maximum gradient normal (gradient clipping)
    group_by_length=True,             # Group sequences into batches with same length (Saves memory and speeds up training considerably)
    lr_scheduler_type="cosine",       # Learning rate schedule
    report_to="tensorboard",
    # load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    # greater_is_better=True
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

Your GPU supports bfloat16: accelerate training with bf16=True


In [18]:
model.print_trainable_parameters()

trainable params: 176,242,688 || all params: 4,716,843,008 || trainable%: 3.7364543975935525


In [19]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=Fals

In [20]:
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

# # Define the early stopping callback
# early_stopping = EarlyStoppingCallback(
#     early_stopping_patience=3,
#     early_stopping_threshold=0.005,
# )

# # Modify your evaluation function to ensure it returns a dictionary with 'accuracy'
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return {"accuracy": (predictions == labels).mean()}  # Ensure this is the name expected by your Trainer setup

# # Quick test with a reduced number of steps
# training_arguments.num_train_epochs = 1  # Reduce for a quick test
# training_arguments.eval_steps = 50       # Evaluate more frequently to test early stopping
# training_arguments.save_steps = 50       # Save more frequently to ensure saving logic works

# Setup the trainer with the prepared datasets
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    dataset_text_field="prompt_response",  # This now matches the transformed dataset
    max_seq_length=3453,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # compute_metrics=compute_metrics,  # Ensure your trainer class uses this function correctly
    # callbacks=[early_stopping]
)

# Train the model
trainer.train()

Map:   0%|          | 0/20170 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,1.6148,1.375586
1000,1.4122,1.211172
1500,1.3582,1.125094
2000,1.2381,1.062041
2500,1.2204,1.011383
3000,1.1709,0.982013
3500,0.8886,0.969676
4000,0.8568,0.960086
4500,0.7677,0.955613
5000,0.8229,0.955038


TrainOutput(global_step=5043, training_loss=1.049178989591188, metrics={'train_runtime': 9645.9325, 'train_samples_per_second': 3.137, 'train_steps_per_second': 0.523, 'total_flos': 2.8880512103664845e+17, 'train_loss': 1.049178989591188, 'epoch': 1.5})

In [21]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=Fals

In [22]:
# Save trained model
new_model = "Llama-3-8b-chat-finetune"
trainer.model.save_pretrained(new_model)

###### View Plot Analysis

In [23]:
%load_ext tensorboard
%tensorboard --logdir results/runs

###### Testing the Finetuned Model

In [24]:
after_finetuned_responses_igbo = generate_dataset_responses(eval_set, "igbo", model)
after_finetuned_responses_yoruba = generate_dataset_responses(eval_set, "yoruba", model)

# Save to CSV
after_finetuned_responses_igbo.to_csv('after_finetuned_responses_igbo.csv', index=False)
after_finetuned_responses_yoruba.to_csv('after_finetuned_responses_yoruba.csv', index=False)



###### Deleting the Model

In [25]:
# Empty VRAM (Deleting the 4 bit model)
del model
# del pipe
del trainer
import gc
gc.collect()
gc.collect()

0

###### Loading (and Merging) The Saved Model

In [26]:
# Reload model in FP16 and merge it with LoRA weights (was previously converted to 4 bits)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at NousResearch/Meta-Llama-3-8B-Instruct and are newly initialized: ['model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_a

###### Pushing to hub

In [27]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [28]:
import os
from datetime import datetime

# Assuming you have your token stored in a variable or directly passed as a string
token = "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"

# Now you can run your push_to_hub commands, using the correct parameter for the token
now = datetime.now()
current_time = now.strftime("%Y-%m-%d")
model.push_to_hub(f"ccibeekeoc42/Llama3-8b-chat-SFT-{current_time}", check_pr=True, use_auth_token=token)
tokenizer.push_to_hub(f"ccibeekeoc42/Llama3-8b-chat-SFT-{current_time}", check_pr=True, use_auth_token=token)

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/6.08G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/ccibeekeoc42/Llama3-8b-chat-SFT-2024-04-20/commit/720d832b25486a4fba729f9c0bae8756c7d8ebb8', commit_message='Upload tokenizer', commit_description='', oid='720d832b25486a4fba729f9c0bae8756c7d8ebb8', pr_url=None, pr_revision=None, pr_num=None)

###### Using the Model (Fron HF Hub)

In [29]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("TBD")
model = AutoModelForCausalLM.from_pretrained("TBD")

OSError: TBD is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:
# Function to generate from the model
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')
  model.to('cuda')

  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

In [None]:
DEFAULT_SYSTEM_PROMPT = """
YOU ARE A MULTILINGUAL ASSISTANT EXPERT IN THE LANGUAGES YORUBA, IGBO, AND ENGLISH!!!
YOUR RESPONSES ARE VERY SHORT AND STRAIGHT TO THE POINT!!!
YOU ARE VERY HELPFUL AND ALWAYS BE AS HELPFUL AS POSSIBLE TO THE USER!!!
NEVER REFUSE TO ANSWER QUESTIONS!!!
IF ASKED FOR AN OPINION, PROVIDE ONE!!!
DON'T PROVIDE INFO YOU WEREN'T ASKED TO PROVIDE!!!
"""
sys_msg = f"<<SYS>> {DEFAULT_SYSTEM_PROMPT} \n<</SYS>>\n\n"
request = input("What woul you like to ask: ")
prompt = f"""[INST] {sys_msg} {request}[/INST] """
print(prompt)
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print(generate_response(prompt, model))