### Modules installation

In [None]:
%%bash
# Install (or upgrade to the newest release of) each library the notebook uses.
# Every package gets its own quiet pip invocation, run one after another.
for pkg in accelerate transformers einops datasets peft bitsandbytes huggingface_hub; do
    pip install "$pkg" -Uqqq --progress-bar off
done

In [None]:
from google.colab import drive

# Expose Google Drive under /content/drive inside the Colab runtime.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Authenticate with the Hugging Face Hub without embedding an access token in the notebook.
# SECURITY: a write-scoped token was previously hardcoded on this line. Any token that has
# been saved or shared that way must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens before a new one is issued.
from huggingface_hub import notebook_login

# Prompts for the token at runtime, so it never ends up in the saved .ipynb source.
notebook_login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Code

### Imports

In [None]:
import torch
import os
import random
import json
import re

from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

### Finetuning

In [None]:
# Quantisation settings: 4-bit NF4 weights with double quantisation, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Base model, quantised while loading; the device map assigns the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0},
)

# Matching tokenizer; the end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Show the tokenizer's end-of-sequence token (it is also used as the padding token).
print(tokenizer.eos_token)

<|endoftext|>


In [None]:
# LoRA adapter settings for causal language-model fine-tuning.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "out_proj"],  # module names that receive adapters
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters, then report how many parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 1,422,989,312 || trainable%: 0.3315971497613047


In [None]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        sample: Mapping with a ``"text"`` entry (a string or a list of strings).

    Returns:
        The tokenizer output for that text, padded to the longest sequence in the
        batch and truncated to at most 512 tokens.
    """
    return tokenizer(sample["text"], padding=True, truncation=True, max_length=512)


# Dedicated, seeded generator so the random attribute selection below is reproducible
# on every run of this cell, independent of the global `random` state.
rng = random.Random(42)

# Explicit encoding: the descriptions contain non-ASCII characters.
with open("/content/drive/My Drive/dataset_eng.json", 'r', encoding="utf-8") as fp:
    data = json.load(fp)

# Reduce every track to "distance" and "ascent" (always kept) plus up to three
# randomly chosen keys drawn from everything except "distance".
new_tracks = []

for entry in data["tracks"]:
    optional_keys = [key for key in entry.keys() if key != "distance"]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size for sparse entries.
    random_keys = rng.sample(optional_keys, min(3, len(optional_keys)))

    new_entry = {
        "distance": entry["distance"],
        "ascent": entry["ascent"],
    }
    new_entry.update({key: entry[key] for key in random_keys})

    new_tracks.append(new_entry)

data["tracks"] = new_tracks

# Build the working DataFrame from the reduced tracks and the remaining JSON fields.
dataset = Dataset.from_dict(data).to_pandas()
display(dataset)

Unnamed: 0,tracks,descriptions
0,"{'ascent': '88', 'descent': '9', 'distance': '...",Beginner-friendly hike managed by Società degl...
1,"{'ascent': '405', 'descent': '231', 'distance'...","Challenge yourself on Track 421, a 12.04-kilom..."
2,"{'ascent': '153', 'descent': '0', 'distance': ...",Quick and easy hike on Track 421A covering 880...
3,"{'ascent': '270', 'descent': '0', 'distance': ...",Società degli Alpinisti Tridentini manages thi...
4,"{'ascent': '553', 'descent': '0', 'distance': ...","For intermediate hikers, Track 403 offers a 4...."
...,...,...
95,"{'ascent': '314', 'descent': '0', 'distance': ...","Track 175 offers a short and easy hike, coveri..."
96,"{'ascent': None, 'descent': None, 'distance': ...",Limited information available for Track 149. I...
97,"{'ascent': '883', 'descent': '0', 'distance': ...",Track 439 provides a moderately challenging hi...
98,"{'ascent': '565', 'descent': None, 'distance':...","Track 195, operated by C.A.I., offers a 2-kilo..."


In [None]:
def build_training_text(row):
    """Render one dataset row as a single training string.

    The layout is a leading newline, ``#INPUT``, the track serialised with
    ``json.dumps``, ``#OUTPUT`` and the description, each starting at column 0.
    The markers are flush-left on purpose: the previous triple-quoted template
    carried its source-code indentation into the text, so every training line
    began with a long run of spaces that the inference prompts do not contain.

    Args:
        row: A row exposing ``"tracks"`` and ``"descriptions"``.

    Returns:
        The formatted training text for that row.
    """
    return f'\n#INPUT\n{json.dumps(row["tracks"])}\n#OUTPUT\n{row["descriptions"]}'


dataset["text"] = dataset[["tracks", "descriptions"]].apply(build_training_text, axis=1)
display(dataset)

Unnamed: 0,tracks,descriptions,text
0,"{'ascent': '88', 'descent': '9', 'distance': '...",Beginner-friendly hike managed by Società degl...,\n ...
1,"{'ascent': '405', 'descent': '231', 'distance'...","Challenge yourself on Track 421, a 12.04-kilom...",\n ...
2,"{'ascent': '153', 'descent': '0', 'distance': ...",Quick and easy hike on Track 421A covering 880...,\n ...
3,"{'ascent': '270', 'descent': '0', 'distance': ...",Società degli Alpinisti Tridentini manages thi...,\n ...
4,"{'ascent': '553', 'descent': '0', 'distance': ...","For intermediate hikers, Track 403 offers a 4....",\n ...
...,...,...,...
95,"{'ascent': '314', 'descent': '0', 'distance': ...","Track 175 offers a short and easy hike, coveri...",\n ...
96,"{'ascent': None, 'descent': None, 'distance': ...",Limited information available for Track 149. I...,\n ...
97,"{'ascent': '883', 'descent': '0', 'distance': ...",Track 439 provides a moderately challenging hi...,\n ...
98,"{'ascent': '565', 'descent': None, 'distance':...","Track 195, operated by C.A.I., offers a 2-kilo...",\n ...


In [None]:
# Turn the DataFrame back into a `datasets.Dataset` and tokenize it in batches.
# All original columns are removed, so only the tokenizer's outputs remain.
data = Dataset.from_pandas(dataset)
tokenized_data = data.map(
    tokenize,
    batched=True,
    remove_columns=data.column_names,
    desc="Tokenizing data",
)
tokenized_data

Tokenizing data:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 100
})

In [None]:
# Training configuration. The run length is set by `max_steps` alone: a positive
# `max_steps` overrides `num_train_epochs`, so the former `num_train_epochs=1`
# had no effect and was removed to avoid suggesting a one-epoch run.
training_args = TrainingArguments(
    output_dir="phi-1_5-geodata-finetuning-eng-1000",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=25,
    max_steps=1000,
    push_to_hub=True
)

# Causal-LM collator (mlm=False): builds the labels from the input ids.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Fine-tune, then upload the result to the Hub repository named by `output_dir`.
trainer.train()
trainer.push_to_hub()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.3048
50,0.8699
75,0.6228
100,0.5418
125,0.4949
150,0.4562
175,0.4286
200,0.4005
225,0.3795
250,0.3531


'https://huggingface.co/andrea-coppari/phi-1_5-geodata-finetuning-eng-1000/tree/main/'

### Saving and Uploading the model

In [None]:
import torch

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the base model in full precision (not quantised) so the adapter can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)

# Attach the fine-tuned LoRA adapter from the Hub.
# The former `from_transformers=True` keyword was dropped: it is not an argument of
# `PeftModel.from_pretrained` and had no effect on loading.
peft_model = PeftModel.from_pretrained(base_model, "andrea-coppari/phi-1_5-geodata-finetuning-eng-1000")

# Fold the adapter weights into the base weights; `model` is now a plain model without PEFT wrappers.
model = peft_model.merge_and_unload()

adapter_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

### Inference

In [None]:
import re
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fine-tuned model from the Hub, loaded in float32 and moved to the selected device.
model = AutoModelForCausalLM.from_pretrained(
    "andrea-coppari/phi-1_5-geodata-finetuning-eng-1000",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
model = model.to(device)

# The tokenizer is taken from the base model repository.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

In [None]:
# Example outside the dataset: explicit instructions plus a track with a null descent.
# The #INPUT payload is valid JSON (the previous `"descent"; null` typo is fixed).
inputs = tokenizer(
    '''
#INSTRUCTIONS
Describe a mountain track taking exact numbers from the input. Estimate track's difficulty based on track duration, distance (<1500m is easy, >10000m is hard) and ascent (<1500m is easy, >10000m is hard). Mention every possible information
#INPUT
{
        "id": "relation/10",
        "ascent": "6000",
        "descent": null,
        "distance": "13898",
        "duration:forward": "12:00",
        "ref": "2000B"
    }
#OUTPUT
''',
    return_tensors="pt",
    return_attention_mask=False
).to(device)

# `max_length` bounds prompt plus generated tokens. The result already lives on the
# model's device, so no extra `.to(device)` is needed before decoding.
outputs = model.generate(**inputs, max_length=300)
text = tokenizer.batch_decode(outputs)[0]

# Keep only what the model wrote between "#OUTPUT" and the next "#".
pattern = re.compile(r'#OUTPUT(.*?)#', re.DOTALL)

match = pattern.search(text)
if match:
    re_text = match.group(1).strip()
    print(re_text)
else:
    # No "#" after the answer: fall back to the full decoded text.
    print(text)

Track 2000B is a moderate 10,000-meter hike with a short duration of 12 hours. Managed by Società degli Alpinisti Tridentini, it involves a moderate ascent of 6000 meters. Recommended for intermediate hikers.


In [None]:
# Example outside the dataset: bare #INPUT/#OUTPUT prompt without instructions.
# The #INPUT payload is valid JSON (the trailing comma before the closing brace is removed).
inputs = tokenizer(
    '''
#INPUT
{
        "id": "relation/10",
        "ascent": "6000",
        "distance": "13898",
        "duration:forward": "12:00"
    }
#OUTPUT
''',
    return_tensors="pt",
    return_attention_mask=False
).to(device)

# `max_length` bounds prompt plus generated tokens. The result already lives on the
# model's device, so no extra `.to(device)` is needed before decoding.
outputs = model.generate(**inputs, max_length=256)
text = tokenizer.batch_decode(outputs)[0]

# Keep only what the model wrote between "#OUTPUT" and the next "#".
pattern = re.compile(r'#OUTPUT(.*?)#', re.DOTALL)

match = pattern.search(text)
if match:
    re_text = match.group(1).strip()
    print(re_text)
else:
    # No "#" after the answer: fall back to the full decoded text.
    print(text)

You should plan a TrackSEE 6+ adventure with Società degli Alpinisti Tridentini. The specified track has a 6.08-meter long route with a 6000-meter distance. The estimated duration is 1 hour and 12 minutes.
}


In [None]:
# Example outside the dataset: alternative instruction wording with a short hiking track.
# The #INPUT payload is valid JSON (the missing comma after the "ascent" entry is added).
inputs = tokenizer(
    '''
#INSTRUCTIONS
Produce an accurate description of a mountain track, taking exact numbers from the given json object in input. The output should also contain an evaluation on track's difficulty, based on track duration, distance and ascent
#INPUT
{
        "id": "relation/7428977",
        "distance": "1710",
        "ascent": "220",
        "route": "hiking"
    }
#OUTPUT
''',
    return_tensors="pt",
    return_attention_mask=False
).to(device)

# `max_length` bounds prompt plus generated tokens. The result already lives on the
# model's device, so no extra `.to(device)` is needed before decoding.
outputs = model.generate(**inputs, max_length=256)
text = tokenizer.batch_decode(outputs)[0]

# Keep only what the model wrote between "#OUTPUT" and the next "#".
pattern = re.compile(r'#OUTPUT(.*?)#', re.DOTALL)

match = pattern.search(text)
if match:
    re_text = match.group(1).strip()
    print(re_text)
else:
    # No "#" after the answer: fall back to the full decoded text.
    print(text)

The Track 47228977 is a moderately difficult hike covering 17.1 kilometers with a 220-meter ascent. It is recommended for experienced hikers seeking a balanced challenge.
