In [None]:
!pip install -q accelerate==0.21.0 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.40.2 --progress-bar off
!pip install -q transformers==4.31.0 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated training file (columns: pk, label, text), drop the key,
# map the numeric labels to class names and lay the frame out as
# instruction / input / output before writing it to CSV.
# The instruction string is kept exactly as-is (including its spacing) because it
# becomes part of every training prompt.
df = (
    pd.read_table("output_training.txt", header=None, dtype={0: str})
    .rename(columns={0: "pk", 1: "label", 2: "text"})
    .iloc[:, ::-1]
    .drop(columns="pk")
    .set_axis(["input", "output"], axis=1)
    .assign(
        output=lambda frame: frame["output"].replace([0, 1], ["non-canonical", "canonical"]),
        instruction="Categorize this text into non-canonical  or canonical ",
    )
    .loc[:, ["instruction", "input", "output"]]
)
df.to_csv("final_data.csv", index=False)
df.head()

Unnamed: 0,instruction,input,output
0,Categorize this text into non-canonical or ca...,Le infrastrutture come fattore di competitivit...,non-canonical
1,Categorize this text into non-canonical or ca...,Negli ultimi anni la dinamica dei polo di attr...,non-canonical
2,Categorize this text into non-canonical or ca...,Il raggiungimento e il mantenimento di posizio...,non-canonical
3,Categorize this text into non-canonical or ca...,Quest'ultimo è funzione di variabili struttura...,non-canonical
4,Categorize this text into non-canonical or ca...,"Il contesto milanese, se da un lato è stato te...",canonical


In [None]:
df.shape

(9758, 3)

In [None]:
import os
from random import randrange
from functools import partial
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer

In [None]:
# Authenticate with the Hugging Face Hub; later cells load the model/tokenizer and push with use_auth_token.
# YOUR_TOKEN_HERE is a placeholder: never save a real token in the notebook source or its outputs.
!huggingface-cli login --token YOUR_TOKEN_HERE

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Creating Bitsandbytes Configuration

In [None]:
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Builds a BitsAndBytesConfig from the given 4-bit quantization options

    :param load_in_4bit: Load model in 4-bit precision mode
    :param bnb_4bit_use_double_quant: Nested quantization for 4-bit model
    :param bnb_4bit_quant_type: Quantization data type for 4-bit model
    :param bnb_4bit_compute_dtype: Computation data type for 4-bit model
    :return: BitsAndBytesConfig carrying exactly these four options
    """

    # Collect the options first so the constructor call stays a one-liner
    quantization_options = {
        "load_in_4bit": load_in_4bit,
        "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
        "bnb_4bit_quant_type": bnb_4bit_quant_type,
        "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
    }

    return BitsAndBytesConfig(**quantization_options)

### Loading Hugging Face Model and Tokenizer

In [None]:
def load_model(model_name, bnb_config, max_memory_mb = 40960):
    """
    Loads model and model tokenizer

    :param model_name: Hugging Face model name
    :param bnb_config: Bitsandbytes configuration
    :param max_memory_mb: Memory budget in MB assigned to each visible GPU
        (defaults to 40960, the value that used to be hard-coded)
    :return: Tuple of (model, tokenizer)
    """

    # Give every visible GPU the same memory budget
    n_gpus = torch.cuda.device_count()
    max_memory = {i: f'{max_memory_mb}MB' for i in range(n_gpus)}

    # Load model; the authentication token is passed here as well so that the
    # model and tokenizer downloads are authenticated consistently
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config = bnb_config,
        device_map = "auto", # dispatch the model efficiently on the available resources
        max_memory = max_memory,
        use_auth_token = True,
    )

    # Load model tokenizer with the user authentication token
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = True)

    # Set padding token as EOS token
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

### Initializing Transformers and Bitsandbytes Parameters

We will now initialize input parameters for the `transformers` and `bitsandbytes` modules.

In [None]:
################################################################################
# transformers parameters
################################################################################

# The pre-trained model from the Hugging Face Hub to load and fine-tune
model_name = "meta-llama/Llama-2-7b-hf"

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

Finally, we will call the above functions to get `model` and `tokenizer` objects.

In [None]:
# Load model from Hugging Face Hub with model name and bitsandbytes configuration

bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

model, tokenizer = load_model(model_name, bnb_config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]



Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

### Loading Dataset

In [None]:
# The instruction dataset to use. The data-preparation cell writes
# 'final_data.csv' relative to the working directory, so read it back the same
# way instead of assuming the working directory is /content.
dataset_name = "final_data.csv"

In [None]:
# Load dataset
dataset = load_dataset("csv", data_files = dataset_name, split = "train")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

Number of prompts: 9758
Column names are: ['instruction', 'input', 'output']


The `load_dataset` function loads the CSV file into a `Dataset` in which each row is a prompt dictionary. We can look at a random prompt in the dataset using a random index.

In [None]:
dataset[randrange(len(dataset))]

{'instruction': 'Categorize this text into non-canonical  or canonical ',
 'input': "La mobilitazione antirazzista e democratica ha avuto altri due momenti importanti: un dibattito al quale ha preso parte anche il filosofo Bernard Henry Levy e un concerto in concomitanza ed in opposizione ai fuochi d'artificio, ai quali ha assistito Le Pen.",
 'output': 'canonical'}

### Creating Prompt Template

After loading the instruction dataset, we will define the `create_prompt_formats` function to create a prompt template against each prompt in our dataset and save it in a new dictionary key `text` for further data preprocessing and fine-tuning.

In [None]:
def create_prompt_formats(sample):
    """
    Builds the training prompt for one sample and stores it under the "text" key

    The prompt consists of an intro blurb, the instruction, the input (only when
    it is non-empty), the response and an end marker, separated by blank lines.

    :param sample: Dictionary with "instruction", "input" and "output" keys
    :return: The same dictionary, with the formatted prompt added as "text"
    """

    # Sections are appended in the order they appear in the final prompt
    sections = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        f"### Instruction:\n{sample['instruction']}",
    ]

    # The input section is optional: skip it for empty / missing input text
    if sample["input"]:
        sections.append(f"Input:\n{sample['input']}")

    sections.append(f"### Response:\n{sample['output']}")
    sections.append("### End")

    # A blank line separates consecutive sections
    sample["text"] = "\n\n".join(sections)

    return sample

In [None]:
create_prompt_formats(dataset[randrange(len(dataset))])

{'instruction': 'Categorize this text into non-canonical  or canonical ',
 'input': "Migliore al Nord dove resta nell'indigenza un nucleo ogni venti.",
 'output': 'canonical',
 'text': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCategorize this text into non-canonical  or canonical \n\nInput:\nMigliore al Nord dove resta nell'indigenza un nucleo ogni venti.\n\n### Response:\ncanonical\n\n### End"}

In [None]:
def get_max_length(model):
    """
    Looks up the maximum sequence length in the model configuration

    The first truthy value among "n_positions", "max_position_embeddings" and
    "seq_length" is used; when none is set, 1024 is returned.

    :param model: Hugging Face model (anything exposing a "config" attribute)
    :return: Maximum sequence length
    """

    config = model.config

    # Return as soon as one of the known length settings holds a usable value
    for setting_name in ("n_positions", "max_position_embeddings", "seq_length"):
        configured_length = getattr(config, setting_name, None)
        if configured_length:
            print(f"Found max lenth: {configured_length}")
            return configured_length

    # None of the settings was present (or all were falsy): use the default
    default_length = 1024
    print(f"Using default max length: {default_length}")
    return default_length

### Tokenizing Dataset Batch


In [None]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Runs the tokenizer over the "text" field of a dataset batch

    :param batch: Dataset batch containing a "text" entry
    :param tokenizer: Model tokenizer (called directly on the texts)
    :param max_length: Maximum number of tokens to emit from the tokenizer
    :return: Whatever the tokenizer returns for the batch
    """

    texts = batch["text"]

    # Sequences longer than "max_length" are truncated by the tokenizer
    tokenizer_options = {"max_length": max_length, "truncation": True}

    return tokenizer(texts, **tokenizer_options)

### Preprocessing Dataset

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed: int, dataset: "datasets.Dataset"):
    """
    Formats, tokenizes, filters and shuffles the instruction dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed (int): Random seed used when shuffling, for reproducibility
    :param dataset (datasets.Dataset): Instruction dataset with "instruction", "input" and "output" columns
    :return: Shuffled dataset holding only the tokenizer output columns
    """

    # Add prompt to each sample (stored under the "text" key)
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch by batch and remove the "instruction", "input", "output", and "text" fields
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["instruction", "input", "output", "text"],
    )

    # Keep only samples strictly shorter than "max_length". Tokenization already
    # truncates to "max_length", so this drops samples that reached the limit.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed = seed)

    return dataset

In [None]:
# Random seed
seed = 33

max_length = get_max_length(model)
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

Found max lenth: 4096
Preprocessing dataset...


Map:   0%|          | 0/9758 [00:00<?, ? examples/s]

Map:   0%|          | 0/9758 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9758 [00:00<?, ? examples/s]

We can now look at the preprocessed dataset, which contains tokens or IDs.

In [None]:
print(preprocessed_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 9758
})


In [None]:
print(preprocessed_dataset[0])

{'input_ids': [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 29907, 20440, 675, 445, 1426, 964, 1661, 29899, 3068, 265, 936, 29871, 470, 24420, 29871, 13, 13, 4290, 29901, 13, 296, 25983, 2032, 18081, 3771, 29871, 29955, 20299, 29874, 443, 378, 1341, 10268, 5187, 724, 29878, 1219, 378, 5857, 7987, 3300, 10100, 12615, 2148, 13536, 3572, 29915, 29872, 4599, 423, 321, 454, 9905, 899, 280, 20752, 8312, 639, 15592, 598, 980, 5161, 273, 4487, 29892, 611, 1661, 907, 1867, 1354, 534, 11556, 652, 443, 3438, 29877, 2702, 29877, 1146, 10203, 598, 270, 1219, 923, 11352, 704, 29875, 1803, 4243, 1302, 2173, 311, 378, 439, 5481, 4884, 8478, 3943, 4698, 16677, 447, 1439, 517, 394, 12125, 29877, 12718, 9858, 360, 2172, 29892, 27410, 487, 21221, 2005, 10765, 1113, 360, 29915, 20988, 29892, 376, 29881, 1219, 923, 3152, 20394, 599, 29915, 536, 457, 628, 26333, 263, 2225, 29883, 513, 406,

### Creating PEFT Configuration

In [None]:
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Builds the LoRA (Parameter-Efficient Fine-Tuning) configuration for the model

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param target_modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout Probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: Task type forwarded to the LoRA configuration
    :return: LoraConfig carrying exactly these settings
    """
    lora_settings = {
        "r": r,
        "lora_alpha": lora_alpha,
        "target_modules": target_modules,
        "lora_dropout": lora_dropout,
        "bias": bias,
        "task_type": task_type,
    }

    return LoraConfig(**lora_settings)

In [None]:
def find_all_linear_names(model):
    """
    Collects the names of the 4-bit linear modules that LoRA should be applied to.

    :param model: Model whose modules are scanned for bitsandbytes Linear4bit layers
    :return: List of unique module names (last component of the dotted path), without 'lm_head'
    """

    linear_cls = bnb.nn.Linear4bit

    # Only the last path component is kept, e.g. "a.b.q_proj" -> "q_proj"
    leaf_names = {
        module_path.split('.')[-1]
        for module_path, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is never a LoRA target
    leaf_names.discard('lm_head')

    print(f"LoRA module names: {list(leaf_names)}")
    return list(leaf_names)

In [None]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the total and trainable parameter counts of the model.

    :param model: PEFT model (anything exposing "named_parameters()")
    :param use_4bit: If True, the trainable parameter count is halved before it is reported
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Fall back to "ds_numel" when the parameter reports zero elements but exposes that attribute
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int: a float here makes the ",d"
        # format specifier below raise ValueError
        trainable_params //= 2

    # Avoid ZeroDivisionError for a model without parameters
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_pct}"
    )

### Fine-tuning the Pre-trained Model

In [None]:
def fine_tune(model,
          tokenizer,
          dataset,
          lora_r,
          lora_alpha,
          lora_dropout,
          bias,
          task_type,
          per_device_train_batch_size,
          gradient_accumulation_steps,
          warmup_steps,
          max_steps,
          learning_rate,
          fp16,
          logging_steps,
          output_dir,
          optim):
    """
    Prepares and fine-tune the pre-trained model, then saves the result to "output_dir".

    :param model: Pre-trained Hugging Face model
    :param tokenizer: Model tokenizer
    :param dataset: Preprocessed training dataset
    :param lora_r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param lora_dropout: Dropout probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: Task type forwarded to the LoRA configuration
    :param per_device_train_batch_size: Forwarded to TrainingArguments
    :param gradient_accumulation_steps: Forwarded to TrainingArguments
    :param warmup_steps: Forwarded to TrainingArguments
    :param max_steps: Forwarded to TrainingArguments
    :param learning_rate: Forwarded to TrainingArguments
    :param fp16: Forwarded to TrainingArguments
    :param logging_steps: Forwarded to TrainingArguments
    :param output_dir: Forwarded to TrainingArguments; also the directory the trained model is saved to
    :param optim: Forwarded to TrainingArguments
    """

    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Get LoRA module names
    target_modules = find_all_linear_names(model)

    # Create PEFT configuration for these modules and wrap the model to PEFT
    peft_config = create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    training_args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        max_steps = max_steps,
        learning_rate = learning_rate,
        fp16 = fp16,
        logging_steps = logging_steps,
        output_dir = output_dir,
        optim = optim,
    )
    trainer = Trainer(
        model = model,
        train_dataset = dataset,
        args = training_args,
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
    )

    # Turn the generation cache off for training
    model.config.use_cache = False

    # Launch training and log metrics
    print("Training...")

    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Save model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok = True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights. This only drops this function's own
    # references; objects the caller still references (e.g. the model passed in)
    # stay alive until the caller releases them too.
    del model
    del trainer
    torch.cuda.empty_cache()

Initializing QLoRA and TrainingArguments parameters below for training.

In [None]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 64

# Dropout probability for LoRA layers
lora_dropout = 0.1

# Bias
bias = "none"

# Task type
task_type = "CAUSAL_LM"

In [None]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Batch size per GPU for training
per_device_train_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Optimizer to use
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs)
max_steps = 20

# Linear warmup steps from 0 to learning_rate
warmup_steps = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
# NOTE(review): this value is passed as TrainingArguments(fp16=...), while the 4-bit
# compute dtype configured earlier (bnb_4bit_compute_dtype) is torch.bfloat16 —
# confirm that mixing the two is intended.
fp16 = True

# Log every X updates steps
logging_steps = 1

Calling the `fine_tune` function below to fine-tune or instruction-tune the pre-trained model on our preprocessed canonical/non-canonical classification instruction dataset.

In [None]:
# Keyword arguments make it explicit which setting feeds which parameter
fine_tune(
    model = model,
    tokenizer = tokenizer,
    dataset = preprocessed_dataset,
    lora_r = lora_r,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = bias,
    task_type = task_type,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    warmup_steps = warmup_steps,
    max_steps = max_steps,
    learning_rate = learning_rate,
    fp16 = fp16,
    logging_steps = logging_steps,
    output_dir = output_dir,
    optim = optim,
)

LoRA module names: ['down_proj', 'k_proj', 'q_proj', 'up_proj', 'gate_proj', 'v_proj', 'o_proj']
All Parameters: 3,540,389,888 || Trainable Parameters: 39,976,960 || Trainable Parameters %: 1.1291682911958425
Training...


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.8699
2,2.9094
3,2.8309
4,2.2501
5,1.7151
6,1.473
7,1.107
8,1.5224
9,1.1861
10,1.1539


***** train metrics *****
  epoch                    =       0.01
  total_flos               =   164772GF
  train_loss               =     1.4653
  train_runtime            = 0:02:04.16
  train_samples_per_second =      0.644
  train_steps_per_second   =      0.161
{'train_runtime': 124.1648, 'train_samples_per_second': 0.644, 'train_steps_per_second': 0.161, 'total_flos': 176923142479872.0, 'train_loss': 1.4652810275554657, 'epoch': 0.01}
Saving last checkpoint of the model...


With these steps, we have fine-tuned a popular open-source pre-trained model, Llama-2-7B, on an instruction dataset that we created for classifying Italian sentences as canonical or non-canonical!

We can see from the log that there are 3,540,389,888 parameters in the model, out of which 39,976,960 are trainable. That's approximately 1% of the total parameters. The model trained for only 20 steps (about 0.01 epoch) and the reported `train_loss` of about 1.47 is the average over those steps, so the model has not necessarily converged and the final weights may not be the best weights. We can address this by training for longer and adding `EarlyStoppingCallback` to the `trainer`, which would regularly evaluate the model on a validation dataset, stop training once the evaluation metric stops improving, and keep only the best weights.

### Merging Weights & Pushing to Hugging Face

After saving the fine-tuned weights, we can create our fine-tuned model by merging the fine-tuned weights and saving it to a new directory with its tokenizer. By performing this step, we can have a memory-efficient, fine-tuned model and tokenizer for inference. We will also push the fine-tuned model and its associated tokenizer to Hugging Face Hub for public usage.


In [None]:
# Load fine-tuned weights from the directory written by fine_tune
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map = "auto", torch_dtype = torch.bfloat16)
# Merge the LoRA layers with the base model
model = model.merge_and_unload()

# Saving the merged model locally is currently disabled (kept for reference):
# output_merged_dir = "results/news_classification_llama2_7b/final_merged_checkpoint"
# os.makedirs(output_merged_dir, exist_ok = True)
# model.save_pretrained(output_merged_dir, safe_serialization = True)

# Reload the tokenizer of the base model. This is a fresh instance, so the
# pad_token assignment done in load_model is not applied to it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Saving the tokenizer locally is disabled as well:
# tokenizer.save_pretrained(output_merged_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [None]:
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False)}, clean_up_tokenization_spaces=False)

In [None]:
# Fine-tuned model name on Hugging Face Hub
new_model = "avinasht/Non_Canonical-llama-2-7b"

In [None]:
# Push fine-tuned model and tokenizer to Hugging Face Hub
model.push_to_hub(new_model, use_auth_token = True)
tokenizer.push_to_hub(new_model, use_auth_token = True)

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/avinasht/Non_Canonical-llama-2-7b/commit/7b4a6860ddfffb1f3af8ddf49cec08f76d277366', commit_message='Upload tokenizer', commit_description='', oid='7b4a6860ddfffb1f3af8ddf49cec08f76d277366', pr_url=None, pr_revision=None, pr_num=None)

# Prediction

In [None]:
!pip install --quiet  torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade

!pip install --quiet langchain einops accelerate transformers bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

Check out the fine-tuned model on Hugging Face: https://huggingface.co/avinasht/Non_Canonical-llama-2-7b

In [None]:
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import os
import torch
# Model ID of the fine-tuned model pushed to the Hub earlier in this notebook
model_id = 'avinasht/Non_Canonical-llama-2-7b' #"tiiuae/falcon-7b-instruct"

# Load the tokenizer that was pushed alongside the model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model in bfloat16 and let device_map="auto" place it on the available devices.
# NOTE(review): cache_dir is an absolute path and trust_remote_code=True is set;
# both look carried over from another setup — confirm they are needed here.
model = AutoModelForCausalLM.from_pretrained(model_id,
    cache_dir='/opt/workspace/',torch_dtype=torch.bfloat16, trust_remote_code=True,
    device_map="auto", offload_folder="offload")
# Set PT model to inference mode
model.eval()

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

In [None]:
from langchain import PromptTemplate,  LLMChain


# Inference prompt; the `{}` placeholder is filled with the sentence via str.format.
# NOTE(review): this prompt differs from the training prompt built by
# create_prompt_formats (no intro blurb, different instruction wording, and the
# input placed on the same line as "Input:") — confirm the mismatch is intended.
template = """### Instruction:
Given the following text, your job is to Classify the text into canonical or non-canonical. Reply with only one word: canonical or non-canonical
Input: {}

### Response:
"""

In [None]:
# Read one sentence per line, stripped of surrounding whitespace. The text
# contains accented characters, so decode explicitly as UTF-8 instead of relying
# on the platform's default encoding.
with open('input.txt', encoding='utf-8') as fp:
    data = [new_sentence.strip() for new_sentence in fp]

In [None]:
# Build the generation pipeline once: it does not depend on the sentence, so
# re-creating it on every iteration was wasted work.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

for indx, new_sentence in enumerate(data):
    input_sentence = template.format(new_sentence.strip())

    prediction = pipe(input_sentence)
    generated_text = prediction[0]['generated_text']

    # Keep the text after the first response marker and before the end marker.
    # The training prompts end with "### End" (no colon); splitting on that
    # prefix also handles a generated "### End:".
    pred = generated_text.split('### Response:')[1].split('### End')[0].strip()

    print(f'Input Sentence {indx+1}: ', new_sentence)
    print('Predicted Class: ', pred)

Input Sentence 1:  Io, per me, amo le strade che riescono agli erbosi fossi dove in pozzanghere mezzo seccate agguantano i ragazzi qualche sparuta anguilla.
Predicted Class:  non-canonical
Input Sentence 2:  Sono i silenzi in cui si vede in ogni ombra umana che si allontana qualche disturbata Divinità.
Predicted Class:  non-canonical
Input Sentence 3:  s'affolta il tedio dell'inverno sulle case, la luce si fa avara - amara l'anima.
Predicted Class:  non-canonical
Input Sentence 4:  Nuvole in viaggio, chiari reami di lassù! D'alti Eldoradi malchiuse porte! e il mare che scaglia a scaglia, livido, muta colore lancia a terra una tromba di schiume intorte;
Predicted Class:  non-canonical
Input Sentence 5:  il vento che nasce e muore nell'ora che lenta s'annera suonasse te pure stasera scordato strumento, cuore.
Predicted Class:  non-canonical
Input Sentence 6:  Raggiorna, lo presento da un albore di frusto argento alle pareti: lista un barlume le finestre chiuse.
Predicted Class:  canonica