<a href="https://colab.research.google.com/github/ashmibanerjee/cities-llm-project/blob/main/SFT_Trainer_FT_Gemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets trl

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.9.4-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata

In [3]:
HF_TOKEN = userdata.get("HF_TOKEN")

In [4]:

def get_dataset(dataset_name:str, data_files=None, is_public=True):
    if is_public:
        dataset = load_dataset(dataset_name, split="train")
    else:
        dataset = load_dataset(dataset_name, token=True, data_files=data_files)
    return dataset

In [5]:
def convert_pandas(dataset):
    df = dataset
    df.set_format(type='pandas')  ## converting it into pandas
    try:
        df = df[:]
    except KeyError:
        df = df["train"][:]
    return df

In [6]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['prompt'])):
        text = f"### Question: {example['prompt'][i]}\n ### Answer: {example['answer'][i]}"
        output_texts.append(text)
    return output_texts


In [7]:
dataset = get_dataset(dataset_name="RecSysTUM/europeancities-wikivoyage-tripadvisor", data_files="wikivoyage/q_a/cities_qa_merged.csv", is_public=False)
print(dataset)
df = convert_pandas(dataset)
df.head()

Downloading readme:   0%|          | 0.00/325 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['city', 'prompt', 'answer'],
        num_rows: 5285
    })
})


Unnamed: 0,city,prompt,answer
0,Belgrade,What is Belgrade known for?,"Belgrade is known for its vibrant nightlife, h..."
1,Belgrade,What is the history of Belgrade?,Belgrade has a long history dating back to the...
2,Belgrade,What is the climate like in Belgrade?,Belgrade has a temperate continental climate w...
3,Belgrade,What are the people like in Belgrade?,Belgradians are known for their friendliness a...
4,Belgrade,How can I get to Belgrade by plane?,Belgrade Nikola Tesla Airport (BEG) is the mai...


In [8]:
model_name = "google/gemma-2b"
# model_name = "facebook/opt-350m"

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [11]:
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
training_data = dataset["train"].train_test_split(test_size=0.3, shuffle=True)

trainer = SFTTrainer(
    model,
    train_dataset=training_data["train"],
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)




Map:   0%|          | 0/3699 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# trainer.train()
# trainer.save_model()

In [None]:
#    sft_config = SFTConfig(
#         max_seq_length=512,
#         output_dir="/tmp",
#     )