In [None]:
import torch
# Use the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Stop right away when no GPU was selected, pointing at the runtime setting to change.
assert device.type == 'cuda', "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"

In [None]:
%%capture
# Install the notebook's Python dependencies; the cell magic above captures
# the cell's output so pip's log is not shown.
# NOTE(review): no versions are pinned here, so a re-run may pull different
# releases than the ones this notebook was written against.
import torch
!pip install bitsandbytes
!pip install datasets
# Compute capability of the current CUDA device, unpacked as (major, minor).
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Capability 8.x and newer: also install flash-attn together with
    # packaging, ninja and einops.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Lower capability: same package list without the flash-attn related entries.
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# No-op statement; it has no effect on the installation.
pass

In [None]:
import torch, os, json, random, bitsandbytes as bnb, torch.nn as nn, psutil
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
import re, gc
from pprint import pprint
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig
import pandas as pd
import requests, shutil
from tqdm import tqdm

In [None]:
# Hugging Face access token used to authenticate the downloads below.
# A real token must never be stored in the notebook, so it is read from the
# HF_TOKEN environment variable (export it, or set it from a notebook secret,
# before running this cell). The token that was previously hard-coded on this
# line has been exposed and should be revoked in the Hugging Face settings.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN is not set; authenticated downloads are expected to be rejected.")

# Repository to fetch and the URL prefix under which its files are resolved.
model_name = "meta-llama/Meta-Llama-3-8B"
base_url = f"https://huggingface.co/{model_name}/resolve/main/"

# Every file of the repository that gets downloaded: the four weight shards
# and their index, the tokenizer and configuration files, and the
# accompanying licence/documentation files.
files = [
    "model-00001-of-00004.safetensors",
    "model-00002-of-00004.safetensors",
    "model-00003-of-00004.safetensors",
    "model-00004-of-00004.safetensors",
    "model.safetensors.index.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "config.json",
    "generation_config.json",
    "LICENSE",
    "README.md",
    "USE_POLICY.md",
    ".gitattributes"
]

# Function to download files
def download_file(url, local_filename, chunk_size=1024 * 1024, timeout=60):
    """Stream ``url`` into ``local_filename`` while showing a progress bar.

    Args:
        url: Address of the file to fetch.
        local_filename: Path the response body is written to. The file is
            opened in binary write mode, so existing content is replaced.
        chunk_size: Number of bytes requested from the response per
            iteration. The default of 1 MiB keeps the Python-level loop short
            for multi-gigabyte files.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` so a
            stalled connection fails instead of blocking indefinitely.

    Raises:
        requests.HTTPError: When the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Send the bearer token from the module-level HF_TOKEN when one is set.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

    # Using the response as a context manager releases the connection even
    # when the download is interrupted by an exception.
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # A missing Content-Length header yields 0; hand tqdm None in that
        # case so it shows a running counter instead of a bar with total 0.
        total_size = int(response.headers.get('content-length', 0)) or None

        with open(local_filename, 'wb') as file, tqdm(
            desc=local_filename,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size):
                file.write(data)
                bar.update(len(data))

# Folder that receives every downloaded file; created on demand.
target_dir = "/content/model"
os.makedirs(target_dir, exist_ok=True)

# Fetch the listed files one after another, announcing each before and after.
for file in files:
    print(f"Downloading {file}...")
    local_path = os.path.join(target_dir, file)  # destination inside the target folder
    url = base_url + file
    download_file(url, local_path)
    print(f"{file} downloaded.")


Downloading model-00001-of-00004.safetensors...


/content/model/model-00001-of-00004.safetensors: 100%|██████████| 4.63G/4.63G [00:55<00:00, 89.3MiB/s]


model-00001-of-00004.safetensors downloaded.
Downloading model-00002-of-00004.safetensors...


/content/model/model-00002-of-00004.safetensors: 100%|██████████| 4.66G/4.66G [00:59<00:00, 84.5MiB/s]


model-00002-of-00004.safetensors downloaded.
Downloading model-00003-of-00004.safetensors...


/content/model/model-00003-of-00004.safetensors: 100%|██████████| 4.58G/4.58G [00:55<00:00, 88.8MiB/s]


model-00003-of-00004.safetensors downloaded.
Downloading model-00004-of-00004.safetensors...


/content/model/model-00004-of-00004.safetensors: 100%|██████████| 1.09G/1.09G [00:13<00:00, 89.8MiB/s]


model-00004-of-00004.safetensors downloaded.
Downloading model.safetensors.index.json...


/content/model/model.safetensors.index.json: 100%|██████████| 23.4k/23.4k [00:00<00:00, 18.8MiB/s]


model.safetensors.index.json downloaded.
Downloading tokenizer.json...


/content/model/tokenizer.json: 100%|██████████| 8.66M/8.66M [00:00<00:00, 12.7MiB/s]


tokenizer.json downloaded.
Downloading tokenizer_config.json...


/content/model/tokenizer_config.json: 100%|██████████| 49.4k/49.4k [00:00<00:00, 5.65MiB/s]


tokenizer_config.json downloaded.
Downloading special_tokens_map.json...


/content/model/special_tokens_map.json: 100%|██████████| 73.0/73.0 [00:00<00:00, 326kiB/s]


special_tokens_map.json downloaded.
Downloading config.json...


/content/model/config.json: 100%|██████████| 654/654 [00:00<00:00, 3.17MiB/s]


config.json downloaded.
Downloading generation_config.json...


/content/model/generation_config.json: 100%|██████████| 177/177 [00:00<00:00, 923kiB/s]


generation_config.json downloaded.
Downloading LICENSE...


/content/model/LICENSE: 100%|██████████| 7.62k/7.62k [00:00<00:00, 20.6MiB/s]


LICENSE downloaded.
Downloading README.md...


/content/model/README.md: 100%|██████████| 35.7k/35.7k [00:00<00:00, 4.55MiB/s]


README.md downloaded.
Downloading USE_POLICY.md...


/content/model/USE_POLICY.md: 100%|██████████| 4.59k/4.59k [00:00<00:00, 20.2MiB/s]


USE_POLICY.md downloaded.
Downloading .gitattributes...


/content/model/.gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 6.08MiB/s]

.gitattributes downloaded.





In [None]:
# Target directory
model_dir = "/content/model"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# This tokenizer defines only BOS and EOS special tokens, so any call that
# asks for padding raises "the tokenizer does not have a padding token".
# Reuse the EOS token for padding unless a pad token is already configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_dir)

print("Tokenizer and model successfully loaded.")

# Retrieve special tokens map and EOS token details
special_tokens = tokenizer.special_tokens_map_extended
eos_token = tokenizer.eos_token
eos_token_id = tokenizer.eos_token_id

print("Special Tokens Map:", special_tokens)
print("EOS Token:", eos_token)
print("EOS Token ID:", eos_token_id)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Tokenizer and model successfully loaded.
Special Tokens Map: {'bos_token': AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'eos_token': AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)}
EOS Token: <|end_of_text|>
EOS Token ID: 128001


In [None]:
def apply_lora_config(model):
    """Attach LoRA adapters to ``model`` and return the result.

    A ``LoraConfig`` for causal language modelling (r=18, lora_alpha=8,
    lora_dropout=0.1, bias="none") that targets the ``q_proj`` and ``v_proj``
    modules is built and passed to ``get_peft_model`` together with the model.

    Any exception raised along the way is caught and printed; the model is
    then returned as it stands at that point instead of the error propagating.
    """
    try:
        print("Applying LoRA configuration...")
        adapter_settings = {
            "r": 18,
            "lora_alpha": 8,
            "target_modules": ["q_proj", "v_proj"],
            "lora_dropout": 0.1,
            "bias": "none",
            "task_type": "CAUSAL_LM",
        }
        model = get_peft_model(model, LoraConfig(**adapter_settings))
        print("LoRA configuration applied successfully.")
    except Exception as error:
        print("An error occurred while applying LoRA configuration:", error)
    return model


# Wrap the model only when both the model and the tokenizer evaluate as truthy.
if model and tokenizer:
    model = apply_lora_config(model)

Applying LoRA configuration...
LoRA configuration applied successfully.


In [None]:
# Where the files come from and where the copies go.
source_dir = '/content/model/'
target_dir = '/content/quantized/'
os.makedirs(target_dir, exist_ok=True)

# Names of the files duplicated into the target folder, in copy order.
files_to_copy = [
    'LICENSE', 'README.md', 'USE_POLICY.md',
    'tokenizer_config.json', '.gitattributes',
    'config.json', 'generation_config.json',
    'special_tokens_map.json', 'tokenizer.json',
    'model.safetensors.index.json',
]

# Copy each file that exists in the source folder and report the ones that do not.
for file_name in files_to_copy:
    source_file = os.path.join(source_dir, file_name)
    target_file = os.path.join(target_dir, file_name)

    if not os.path.exists(source_file):
        print(f"File {file_name} does not exist in {source_dir}")
        continue

    shutil.copy(source_file, target_file)
    print(f"Copied {file_name} to {target_file}")

Copied LICENSE to /content/quantized/LICENSE
Copied README.md to /content/quantized/README.md
Copied USE_POLICY.md to /content/quantized/USE_POLICY.md
Copied tokenizer_config.json to /content/quantized/tokenizer_config.json
Copied .gitattributes to /content/quantized/.gitattributes
Copied config.json to /content/quantized/config.json
Copied generation_config.json to /content/quantized/generation_config.json
Copied special_tokens_map.json to /content/quantized/special_tokens_map.json
Copied tokenizer.json to /content/quantized/tokenizer.json
Copied model.safetensors.index.json to /content/quantized/model.safetensors.index.json


In [None]:
# Raw question/answer files that are read as input.
train_input_file = '/content/train.json'
test_input_file = '/content/test.json'

# Files that receive the same records wrapped under a "questions" key.
train_output_file = '/content/train_dataset.json'
test_output_file = '/content/test_dataset.json'

def add_questions_key(input_file, output_file):
    """Wrap the JSON content of ``input_file`` under a ``"questions"`` key.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path the wrapped document is written to, as JSON
            indented by four spaces. An existing file is overwritten.

    Both files are opened as UTF-8 explicitly, so reading non-ASCII text does
    not depend on the platform's default encoding.
    """
    # Read JSON file
    with open(input_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # Format data with 'questions' key
    formatted_data = {"questions": data}

    # Write formatted data to a new file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, indent=4)

# Convert training and test datasets to the appropriate format
add_questions_key(train_input_file, train_output_file)
add_questions_key(test_input_file, test_output_file)

print("JSON files have been formatted and saved successfully.")

# Read the wrapped files back; the encoding is given explicitly so the result
# does not depend on the platform default.
with open(train_output_file, encoding='utf-8') as json_file:
    train = json.load(json_file)
with open(test_output_file, encoding='utf-8') as json_file:
    test = json.load(json_file)

# Convert data to DataFrame and check.
# Only the last expression of a cell is rendered automatically, so these
# previews have to go through display() to actually appear in the output.
display(pd.DataFrame(train["questions"]).head())
display(pd.DataFrame(test["questions"]).head())
pprint(train["questions"][0], sort_dicts=False)
pprint(test["questions"][0], sort_dicts=False)

# Validate the top-level layout of a loaded dataset
def check_data_format(data):
    """Ensure ``data`` has a ``"questions"`` entry that is a list.

    Returns ``None`` when the layout is as expected.

    Raises:
        ValueError: If the ``"questions"`` key is absent or its value is not
            a list.
    """
    has_key = "questions" in data
    if has_key and isinstance(data["questions"], list):
        return
    raise ValueError("Data does not contain 'questions' key or it is not a list.")

# Both loaded splits must expose a list under "questions" before continuing.
check_data_format(train)
check_data_format(test)

# Template for one training example: the first placeholder is filled with the
# question and the second with the answer.
prompt = """Below is a question paired with an answer. Please write a response that appropriately completes the request.

### Question:
{}

### Answer:
{}"""

# Look up the tokenizer's special-token map and its end-of-sequence token.
special_tokens = tokenizer.special_tokens_map_extended
eos_token, eos_token_id = tokenizer.eos_token, tokenizer.eos_token_id

def formatting_prompts_func(examples):
    """Render every question/answer pair of a batch into one training string.

    Args:
        examples: Batch holding parallel ``"question"`` and ``"answer"``
            sequences.

    Returns:
        ``{"text": [...]}`` with one string per pair: the module-level
        ``prompt`` template filled with the question and the answer, followed
        by ``eos_token``.
    """
    questions = examples["question"]
    answers = examples["answer"]

    # The template has exactly two placeholders, so exactly two values are
    # supplied (a stray third argument used to be passed and silently ignored).
    return {
        "text": [
            prompt.format(question, answer) + eos_token
            for question, answer in zip(questions, answers)
        ]
    }

def create_and_format_dataset(data):
    """Turn the records under ``data["questions"]`` into a formatted ``Dataset``.

    The ``"question"`` and ``"answer"`` fields of every record are collected
    into two columns, a ``Dataset`` is built from them, and
    ``formatting_prompts_func`` is mapped over it in batched mode.

    Returns:
        The mapped dataset.
    """
    records = data["questions"]
    # Column order matches the record fields: questions first, then answers.
    columns = {
        field: [record[field] for record in records]
        for field in ("question", "answer")
    }
    return Dataset.from_dict(columns).map(formatting_prompts_func, batched=True)

def preprocess_function(examples):
    """Tokenize a batch of formatted prompts and attach training labels.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``; only
            its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch (padded or truncated to 128
        tokens, as PyTorch tensors) with an added ``labels`` tensor: a copy of
        ``input_ids`` in which every padded position is set to -100.
    """
    # Padding requires a pad token; fall back to the EOS token when the
    # tokenizer does not define one, otherwise the call below raises.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize once; the labels are derived from this result rather than from
    # a second, identical tokenizer call.
    inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')

    # -100 is the index the cross-entropy loss ignores by default. Masking by
    # attention_mask rather than by token id keeps the real end-of-text token
    # in the labels even when it doubles as the padding token.
    labels = inputs['input_ids'].clone()
    labels[inputs['attention_mask'] == 0] = -100
    inputs['labels'] = labels

    return inputs

# Build one prompt-formatted dataset per split.
train_dataset = create_and_format_dataset(train)
test_dataset = create_and_format_dataset(test)

# Group the two splits under their usual names.
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Replace each split with its tokenized version, training split first.
dataset['train'] = train_dataset.map(preprocess_function, batched=True)
dataset['test'] = test_dataset.map(preprocess_function, batched=True)

print(dataset)


JSON files have been formatted and saved successfully.
{'question': 'What does Site Access Security Clearance refer to?',
 'answer': 'Site Access Security Clearance refers to the level of '
           'authorization required for individuals to access a specific site, '
           'typically to ensure that only authorized personnel are allowed '
           'entry for security reasons.'}
{'question': 'What becomes the safety case for the reactor facility?',
 'answer': 'The licence application and the documents needed to support it, '
           'including the documents the application references, become the '
           'safety case for the reactor facility.'}


Map:   0%|          | 0/5448 [00:00<?, ? examples/s]

Map:   0%|          | 0/1363 [00:00<?, ? examples/s]

Map:   0%|          | 0/5448 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.