In [1]:
%%bash
pip install accelerate -Uqqq --progress-bar off
pip install transformers -Uqqq --progress-bar off
pip install einops -Uqqq --progress-bar off
pip install datasets -Uqqq --progress-bar off
pip install peft -Uqqq --progress-bar off
pip install bitsandbytes -Uqqq --progress-bar off
pip install huggingface_hub -Uqqq --progress-bar off
pip install scipy



[0m

In [2]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
import os
import random
import json
import re

from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

In [4]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
print(tokenizer.eos_token)

</s>


In [6]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 23,068,672 || all params: 7,264,800,768 || trainable%: 0.3175403254224521


In [7]:
def tokenize(sample):
    model_inps = tokenizer(sample["text"], padding=True, truncation=True, max_length=512)
    return model_inps

with open("./dataset_eng.json", 'r') as fp:
    data = json.load(fp)

    # Randomly select keys from objects, "distance is mandatory"
    new_tracks = []

    for entry in data["tracks"]:
        random_keys = random.sample([key for key in entry.keys() if key != "distance"], 3)

        new_entry = {
            "distance": entry["distance"],
            "ascent": entry["ascent"]
        }
        new_entry.update({key: entry[key] for key in random_keys})

        new_tracks.append(new_entry)

    data["tracks"] = new_tracks

    dataset = Dataset.from_dict(data).to_pandas()
    display(dataset)

Unnamed: 0,tracks,descriptions
0,"{'ascent': '88', 'descent': None, 'distance': ...",Beginner-friendly hike managed by Società degl...
1,"{'ascent': '405', 'descent': '231', 'distance'...","Challenge yourself on Track 421, a 12.04-kilom..."
2,"{'ascent': '153', 'descent': '0', 'distance': ...",Quick and easy hike on Track 421A covering 880...
3,"{'ascent': '270', 'descent': '0', 'distance': ...",Società degli Alpinisti Tridentini manages thi...
4,"{'ascent': '553', 'descent': '0', 'distance': ...","For intermediate hikers, Track 403 offers a 4...."
...,...,...
95,"{'ascent': '314', 'descent': '0', 'distance': ...","Track 175 offers a short and easy hike, coveri..."
96,"{'ascent': None, 'descent': None, 'distance': ...",Limited information available for Track 149. I...
97,"{'ascent': '883', 'descent': None, 'distance':...",Track 439 provides a moderately challenging hi...
98,"{'ascent': '565', 'descent': None, 'distance':...","Track 195, operated by C.A.I., offers a 2-kilo..."


In [8]:
dataset["text"] = dataset[["tracks", "descriptions"]].apply(lambda x:
                                                            f'''
                                                            #INPUT
                                                            {json.dumps(x["tracks"])}
                                                            #OUTPUT
                                                            {x["descriptions"]}''', axis=1)
display(dataset)

Unnamed: 0,tracks,descriptions,text
0,"{'ascent': '88', 'descent': None, 'distance': ...",Beginner-friendly hike managed by Società degl...,\n ...
1,"{'ascent': '405', 'descent': '231', 'distance'...","Challenge yourself on Track 421, a 12.04-kilom...",\n ...
2,"{'ascent': '153', 'descent': '0', 'distance': ...",Quick and easy hike on Track 421A covering 880...,\n ...
3,"{'ascent': '270', 'descent': '0', 'distance': ...",Società degli Alpinisti Tridentini manages thi...,\n ...
4,"{'ascent': '553', 'descent': '0', 'distance': ...","For intermediate hikers, Track 403 offers a 4....",\n ...
...,...,...,...
95,"{'ascent': '314', 'descent': '0', 'distance': ...","Track 175 offers a short and easy hike, coveri...",\n ...
96,"{'ascent': None, 'descent': None, 'distance': ...",Limited information available for Track 149. I...,\n ...
97,"{'ascent': '883', 'descent': None, 'distance':...",Track 439 provides a moderately challenging hi...,\n ...
98,"{'ascent': '565', 'descent': None, 'distance':...","Track 195, operated by C.A.I., offers a 2-kilo...",\n ...


In [9]:
data = Dataset.from_pandas(dataset)
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)
tokenized_data

Tokenizing data:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 100
})

In [10]:
training_args = TrainingArguments(
    output_dir="mistral-7b-geodata-finetuning-eng-1500",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    max_steps=1500,
    num_train_epochs=1,
    push_to_hub=True
)



trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()
trainer.push_to_hub()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.1423
20,0.5033
30,0.4328
40,0.3863
50,0.4079
60,0.3473
70,0.3442
80,0.3334
90,0.2977
100,0.2872


CommitInfo(commit_url='https://huggingface.co/andrea-coppari/mistral-7b-geodata-finetuning-eng-1500/commit/3b9967b7243af607ab4dbf9af46cef06899a5ebd', commit_message='End of training', commit_description='', oid='3b9967b7243af607ab4dbf9af46cef06899a5ebd', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
import torch

from peft import PeftModel
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True,
                                             torch_dtype=torch.float32)
peft_model = PeftModel.from_pretrained(model, "andrea-coppari/mistral-7b-geodata-finetuning-eng-1500", from_transformers=True)
model = peft_model.merge_and_unload()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/92.3M [00:00<?, ?B/s]

In [12]:
### INFERENCE

In [1]:
import torch
import re

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained("andrea-coppari/mistral-7b-geodata-finetuning-eng-1500", trust_remote_code=True, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Esempio fuori dal dataset
inputs = tokenizer(
    '''
#INSTRUCTIONS
Describe a mountain track taking exact numbers from the input. Estimate track's difficulty based on track duration, distance (<1500m is easy, >10000m is hard) and ascent (<250m is easy).
#INPUT
{
"id":"relation/129319"
"ascent":"88",
"descent":null,
"distance": "10200"
"ref":"420A"
"route":"hiking"
}
#OUTPUT
''',
    return_tensors="pt",
    return_attention_mask=False
).to(device)

from time import time
t = time()
outputs = model.generate(**inputs, max_length=250).to(device)
# print(f'Time to generate a message of 256 tokens: {time()-t:.2f} seconds')
text = tokenizer.batch_decode(outputs)[0]

print(text)
# pattern = re.compile(r'#OUTPUT(.*?)#', re.DOTALL)

# match = pattern.search(text)
# if match:
#     re_text = match.group(1).strip()
#     print(re_text)
# else:
#     print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> 
#INSTRUCTIONS
Describe a mountain track taking exact numbers from the input. Estimate track's difficulty based on track duration, distance (<1500m is easy, >10000m is hard) and ascent (<250m is easy).
#INPUT
{
"id":"relation/129319"
"ascent":"88",
"descent":null,
"distance": "10200"
"ref":"420A"
"route":"hiking"
}
#OUTPUT
For a substantial hike, choose Track 420A covering 10.2 kilometers with an 88-meter ascent and no descent. The estimated difficulty is moderate. Operated by Società degli Alpinisti Tridentini.</s>
