In [None]:
# -*- coding: utf-8 -*-
"""Fine-tune Custom Large Language Model in Google Colab

Updated for best practices and readability. Last update: 25 Sep 2023
"""

# Install required packages
!pip install -q torch torchvision torchaudio
!pip install -q accelerate peft bitsandbytes transformers trl xformers peft datasets

# Import libraries
import os
import torch
import warnings
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
import logging
logging.basicConfig(filename='training_errors.log', level=logging.ERROR)


from peft import LoraConfig, get_peft_model
torch.backends.cudnn.benchmark = True


# Suppress Warnings
warnings.filterwarnings("ignore")

# Set device and environment variables
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=64'

# Model and Dataset Configuration
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"

# Load dataset
dataset = load_dataset(dataset_name, split="train")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained("outputs")


# Initialize model with 4-bit precision
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)

# Enable Gradient Checkpointing
model.gradient_checkpointing_enable()

# LoRA Configuration
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["gate_proj", "down_proj", "lm_head"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, config)

# Function to print trainable parameters
def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable Parameters: {trainable_params}")

print_trainable_parameters(model)

# Training parameters
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,  # Reduced batch size
    gradient_accumulation_steps=4,  # Gradient accumulation
    optim="paged_adamw_8bit",
    save_steps=0,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard"
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Supervised fine-tuning
trainer = SFTTrainer(
    model=model,
    max_seq_length=512,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

try:
    # Train the model
    trainer.train()
except Exception as e:
    print(f"An error occurred: {e}")
    logging.error(f"An error occurred: {e}")
    torch.cuda.empty_cache()  # Clear GPU cache



model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained("outputs")

lora_config = LoraConfig.from_pretrained('outputs')
model = get_peft_model(model, lora_config)



# Run text generation pipeline
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])





In [None]:
# -*- coding: utf-8 -*-
"""Fine-tune Custom Large Language Model in Google Colab

Updated for best practices, modularity, and readability. Last update: 25 Sep 2023
"""

# Install required packages
!pip install -q torch torchvision torchaudio
!pip install -q accelerate peft bitsandbytes transformers trl xformers peft datasets pyngrok
!pip install -q optuna  # For hyperparameter tuning

# Import libraries
import os
import torch
import warnings
import logging
import optuna  # For hyperparameter tuning
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
# Import pyngrok
from pyngrok import ngrok
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

# Initialize logging
logging.basicConfig(filename='training_errors.log', level=logging.ERROR)

# Initialize cuDNN benchmarking for performance optimization
torch.backends.cudnn.benchmark = True

# Suppress Warnings
warnings.filterwarnings("ignore")

# Set device and environment variables
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=64'


# Function to load dataset
def load_training_dataset(dataset_name, split="train"):
    return load_dataset(dataset_name, split=split, cache_dir = "./cache")


# Function to initialize tokenizer
def initialize_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,cache_dir="./cache")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.save_pretrained("outputs")
    return tokenizer


# Function to initialize model
def initialize_model(model_name, bnb_config, device_map={"": 0}):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map, cache_dir="./cache"
    )
    return model


# Function to print trainable parameters
def print_trainable_parameters(model):
    """Print the number of trainable parameters in the model."""
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


# Function for hyperparameter tuning (placeholder)
def hyperparameter_tuning():
    # Use Optuna or other libraries to perform hyperparameter tuning here
    pass


# Function to train model
def train_model(model, dataset, tokenizer, training_arguments):
    trainer = SFTTrainer(
        model=model,
        max_seq_length=512,
        train_dataset=dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )
    try:
        # Train the model
        trainer.train()
    except Exception as e:
        print(f"An error occurred: {e}")
        logging.error(f"An error occurred: {e}")
        torch.cuda.empty_cache()  # Clear GPU cache

def prepare_for_inference(trainer, model, tokenizer):
    # Save the model
    model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
    model_to_save.save_pretrained("outputs")

    # Load LoRA config and prepare model for inference
    lora_config = LoraConfig.from_pretrained('outputs')
    model = get_peft_model(model, lora_config)

    # Run text generation pipeline
    prompt = "What is a large language model?"
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    print(result[0]['generated_text'])


# Main code
if __name__ == "__main__":

    try:# Load dataset
      dataset = load_training_dataset(dataset_name="mlabonne/guanaco-llama2-1k")

      # Initialize tokenizer
      tokenizer = initialize_tokenizer(model_name="NousResearch/Llama-2-7b-chat-hf")

      # Initialize model
      bnb_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_use_double_quant=True,
          bnb_4bit_quant_type="nf4",
          bnb_4bit_compute_dtype=torch.bfloat16
      )
      model = initialize_model(model_name="NousResearch/Llama-2-7b-chat-hf", bnb_config=bnb_config)

      # Print trainable parameters
      print_trainable_parameters(model)

      # Training arguments
      training_arguments = TrainingArguments(
          output_dir="./results",
          num_train_epochs=1,
          per_device_train_batch_size=1,  # Reduced batch size
          gradient_accumulation_steps=4,  # Gradient accumulation
          optim="paged_adamw_8bit",
          save_steps=0,
          logging_steps=100,
          learning_rate=2e-4,
          weight_decay=0.001,
          fp16=False,
          bf16=False,
          max_grad_norm=0.3,
          max_steps=-1,
          warmup_ratio=0.03,
          group_by_length=True,
          lr_scheduler_type="cosine",
          report_to="tensorboard"
      )

      # Train model
      train_model(model, dataset, tokenizer, training_arguments)

      prepare_for_inference(trainer, model, tokenizer)

      # Additional code for TensorBoard in Colab
      # Start TensorBoard
      get_ipython().system_raw('tensorboard --logdir ./results --host 0.0.0.0 --port 6006 &')

      # Create a tunnel
      public_url = ngrok.connect(port=6006)

      # Print the public URL
      print(f"TensorBoard can be accessed here: {public_url}")

    except Exception as e:
        print(f"An error occurred: {e}")
        logging.error(f"An error occurred: {e}")
        torch.cuda.empty_cache()  # Clear GPU cache


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/258.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m256.0/258.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Trainable Parameters: 262410240


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [1]:
# -*- coding: utf-8 -*-
"""Fine-tune Custom Large Language Model in Google Colab

Updated for best practices, modularity, and readability. Last update: 25 Sep 2023
"""

# Install required packages
!pip install -q torch torchvision torchaudio
!pip install -q accelerate peft bitsandbytes transformers trl xformers peft datasets pyngrok
!pip install -q optuna  # For hyperparameter tuning

# Import libraries
import os
import torch
import warnings
import logging
import optuna  # For hyperparameter tuning
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from transformers import pipeline  # Added for text generation
from tqdm import tqdm  # Added for progress bar

# Import pyngrok
from pyngrok import ngrok
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

# Initialize logging
logging.basicConfig(filename='training_errors.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize cuDNN benchmarking for performance optimization
torch.backends.cudnn.benchmark = True

# Suppress Warnings
warnings.filterwarnings("ignore")

# Set device and environment variables
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=64'

# Constants
CACHE_DIR = "./cache"
OUTPUTS_DIR = "outputs"
RESULTS_DIR = "./results"

# Function to load dataset
def load_training_dataset(dataset_name, split="train"):
    """
    Load and return a training dataset split.

    Args:
        dataset_name (str): Name of the dataset.
        split (str): Split name (e.g., "train").

    Returns:
        Dataset: The loaded dataset split.
    """
    return load_dataset(dataset_name, split=split, cache_dir=CACHE_DIR)

# Function to initialize tokenizer
def initialize_tokenizer(model_name):
    """
    Initialize and return a tokenizer for a given model.

    Args:
        model_name (str): Name of the model.

    Returns:
        AutoTokenizer: The initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=CACHE_DIR)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.save_pretrained(OUTPUTS_DIR)
    return tokenizer

# Function to initialize model
def initialize_model(model_name, bnb_config, device_map={"": 0}):
    """
    Initialize and return a model for fine-tuning.

    Args:
        model_name (str): Name of the model.
        bnb_config (BitsAndBytesConfig): Configuration for quantization.
        device_map (dict): Mapping of devices for multi-GPU training.

    Returns:
        AutoModelForCausalLM: The initialized model.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        cache_dir=CACHE_DIR
    )
    return model

def shrink_model(model):
    """
    Shrink a given neural network model using Gradient Checkpointing and LoRA (Lossless Rectified Activation) compression.

    Args:
        model (nn.Module): The model to be shrunk.

    Returns:
        nn.Module: The shrunk model with reduced memory footprint.
    """
    # Enable Gradient Checkpointing to reduce memory usage during training
    model.gradient_checkpointing_enable()

    # LoRA Configuration:
    config = LoraConfig(
        r=8,  # Reduction factor for LoRA compression
        lora_alpha=32,  # Alpha parameter for LoRA compression
        target_modules=["gate_proj", "down_proj", "lm_head"],  # Modules to apply LoRA compression
        lora_dropout=0.05,  # Dropout rate for LoRA
        bias="none",  # Bias handling for LoRA
        task_type="CAUSAL_LM"  # Task type for LoRA compression
    )

    # Apply LoRA compression to the model
    return get_peft_model(model, config)


# Function to print trainable parameters

def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Args:
        model (nn.Module): The model to analyze.
    """
    trainable_params = 0  # Initialize trainable parameters count
    all_param = 0  # Initialize total parameters count

    # Iterate through all named parameters in the model
    for _, param in model.named_parameters():
        all_param += param.numel()  # Count the total parameters

        if param.requires_grad:
            trainable_params += param.numel()  # Count trainable parameters

    # Calculate the percentage of trainable parameters
    trainable_percentage = 100 * trainable_params / all_param

    # Print the results
    print(
        f"Trainable Parameters: {trainable_params} || Total Parameters: {all_param} || Trainable Percentage: {trainable_percentage:.2f}%"
    )




# Function for hyperparameter tuning (placeholder)
def hyperparameter_tuning():
    # Use Optuna or other libraries to perform hyperparameter tuning here
    pass

# Function to train model
def train_model(model, dataset, tokenizer, training_arguments):
    """
    Train a language model using the specified dataset and training arguments.

    Args:
        model (AutoModelForCausalLM): The model to train.
        dataset (Dataset): The training dataset.
        tokenizer (AutoTokenizer): The tokenizer for text encoding.
        training_arguments (TrainingArguments): Training configuration.
    """
    trainer = SFTTrainer(
        model=model,
        max_seq_length=512,
        train_dataset=dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )
    try:
        # Train the model with a progress bar
        for epoch in tqdm(range(training_arguments.num_train_epochs), desc="Epochs"):
            trainer.train()
    except Exception as e:
        print(f"An error occurred: {e}, Type: {type(e).__name__}")
        logging.error(f"An error occurred: {e}, Type: {type(e).__name__}")
        torch.cuda.empty_cache()  # Clear GPU cache
    return trainer

# Function to prepare for inference
def prepare_for_inference(trainer, model, tokenizer):
    """
    Prepare the model for inference and demonstrate text generation.

    Args:
        trainer (SFTTrainer): The model trainer.
        model (AutoModelForCausalLM): The trained model.
        tokenizer (AutoTokenizer): The tokenizer for text encoding.
    """
    # Save the model
    model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
    model_to_save.save_pretrained(OUTPUTS_DIR)

    # Load LoRA config and prepare model for inference
    lora_config = LoraConfig.from_pretrained(OUTPUTS_DIR)
    model = get_peft_model(model, lora_config)

    # Run text generation pipeline
    text = "What is a large language model? "
    device = "cuda:0"
    inputs = tokenizer(f"<s>[INST] {text} [/INST]", return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=100)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Main code
if __name__ == "__main__":
    try:
        # Load dataset
        dataset = load_training_dataset(dataset_name="mlabonne/guanaco-llama2-1k")

        # Initialize tokenizer
        tokenizer = initialize_tokenizer(model_name="NousResearch/Llama-2-7b-chat-hf")

        # Initialize model
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        model = initialize_model(model_name="NousResearch/Llama-2-7b-chat-hf", bnb_config=bnb_config)

        # Enable Gradient Checkpointing
        model.gradient_checkpointing_enable()

        #shrink the model
        model = shrink_model(model)

        # Print trainable parameters
        print_trainable_parameters(model)

        # Training arguments
        training_arguments = TrainingArguments(
            output_dir=RESULTS_DIR,
            num_train_epochs=1,
            per_device_train_batch_size=1,  # Reduced batch size
            gradient_accumulation_steps=4,  # Gradient accumulation
            optim="paged_adamw_8bit",
            save_steps=0,
            logging_steps=100,
            learning_rate=2e-4,
            weight_decay=0.001,
            fp16=False,
            bf16=False,
            max_grad_norm=0.3,
            max_steps=-1,
            warmup_ratio=0.03,
            group_by_length=True,
            lr_scheduler_type="cosine",
            report_to="tensorboard"
        )

        # Train model
        trainer = train_model(model, dataset, tokenizer, training_arguments)

        prepare_for_inference(trainer, model, tokenizer)

    except Exception as e:
        print(f"An error occurred: {e}, Type: {type(e).__name__}")
        logging.error(f"An error occurred: {e} , Type: {type(e).__name__}")
        torch.cuda.empty_cache()  # Clear GPU cache


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/258.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.0/167.0 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Trainable Parameters: 8022016 || Total Parameters: 3508434944 || Trainable Percentage: 0.23%


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
100,2.0928
200,1.9752


Epochs: 100%|██████████| 1/1 [42:16<00:00, 2536.72s/it]
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM

<s>[INST] What is a large language model? [/INST]  A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody. These models are typically trained on vast amounts of text data, such as books, articles, and websites, and are designed to learn the patterns and structures of language.

Large language models are often used in natural language processing (NLP) tasks such as language translation, text summarization, and language generation. They are also used in chatbots, virtual assistants, and other applications where language understanding and generation is required.

Some of the key features of large language models include:

1. Deep learning architecture: Large language models are typically built using deep learning architectures such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks,


ERROR:root:An error occurred: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}
 , Type: PyngrokNgrokHTTPError
Exception in thread Thread-13 (_monitor_process):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pyngrok/process.py", line 140, in _monitor_process
    self._log_line(self.proc.stdout.readline())
  File "/usr/lib/python3.10/encodings/ascii.py", line 26, in decode
    return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 184: ordinal not in range(128)


An error occurred: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}
, Type: PyngrokNgrokHTTPError


In [10]:
    text = "What is a large language model? "
    device = "cuda:0"
    inputs = tokenizer(f"<s>[INST] {text} [/INST]", return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=100)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

[INST] What is a large language model?  [/INST]  A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody has heard of large language models, but few people know what they are or how they work. In this article, we will explore the definition of a large language model, how it works, and some of its applications.

A large language model is a type of neural network that is trained on
