In [None]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [None]:
class RiddleDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=128):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data['text'] = self.data.apply(lambda row: f"Riddle: {row['Riddle']}", axis=1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors="pt")
        input_ids = encoding.input_ids.squeeze()
        attention_mask = encoding.attention_mask.squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

In [None]:
def main():
    model_name = "TinyLlama/TinyLlama_v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    dataset = RiddleDataset("riddlefactory.csv", tokenizer)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # training_args = TrainingArguments(
    #     output_dir="./results",
    #     num_train_epochs=5,
    #     per_device_train_batch_size=4,
    #     per_device_eval_batch_size=4,
    #     evaluation_strategy="epoch",
    #     logging_steps=1,
    #     save_strategy="epoch",
    #     weight_decay=0.01,
    #     learning_rate=5e-5,
    #     optim="adamw_torch",
    #     fp16=True if torch.cuda.is_available() else False,
    #     load_best_model_at_end=True,
    #     metric_for_best_model="loss",
    #     greater_is_better=False,
    #     seed=42,
    # )
    training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    logging_steps=1,
    save_strategy="epoch",
    weight_decay=0.1,
    learning_rate=3e-5,
    warmup_steps=50,
    fp16=True if torch.cuda.is_available() else False,
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    seed=42,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    model.save_pretrained("./fine_tuned_tinyllama")
    tokenizer.save_pretrained("./fine_tuned_tinyllama")



In [None]:
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[34m[1mwandb[0m: Currently logged in as: [33mbuzzgrewal[0m ([33mbuzzgrewal-fast[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,11.1542,11.143553
2,11.1884,11.143553
3,11.2723,11.143553
4,11.1687,11.143553
5,10.9717,11.143553


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:
def test_model_on_riddle_prompts(model_path="./fine_tuned_tinyllama"):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model.eval()

    prompts = [
        "Riddle:",
        "Generate a math riddle:",
        "Math Puzzle:",
        "Write a new math riddle:",
        "Create a math riddle with its answer:"
    ]

    for i, prompt in enumerate(prompts):
        encoding = tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=50)
        input_ids = encoding.input_ids
        attention_mask = encoding.attention_mask

        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"Prompt {i+1}:\n{prompt}")
        print(f"Generated Riddle:\n{generated_text}\n{'-'*50}")

if __name__ == "__main__":
    test_model_on_riddle_prompts()


Prompt 1:
Riddle:
Generated Riddle:
Riddle: F  Y   I   I  I  I  I  I  I  I  I  I  I  I  I  I  I  I  I  I  I  I  I  I 
--------------------------------------------------
Prompt 2:
Generate a math riddle:
Generated Riddle:
Generate a math riddle: R Dll."" R.""." R."." "." R."."" R " ."  " r." ." r"." r"." r" R R " R R." R " R R"." R r
--------------------------------------------------
Prompt 3:
Math Puzzle:
Generated Riddle:
Math Puzzle: 3.132.313.22 00:12:31:16 2009-12-03
00017820 20:1
--------------------------------------------------
Prompt 4:
Write a new math riddle:
Generated Riddle:
Write a new math riddle: 2nd writing 2nd going writing 3 nd writing 2nd going writing 2ndgoing writing 2nd reading 2nd going 3nd nd 2nd going nd 2ndgoing nd 
--------------------------------------------------
Prompt 5:
Create a math riddle with its answer:
Generated Riddle:
Create a math riddle with its answer: The young m being.
The book itself is well done, I did find the cover to be very odd to me, b

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def test_model(model_path="./fine_tuned_tinyllama", prompt="Riddle:", num_riddles=5):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model.eval()

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    outputs = model.generate(
        input_ids=input_ids,
        max_length=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_riddles,
        pad_token_id=tokenizer.eos_token_id
    )


    for i, output in enumerate(outputs):
        generated_text = tokenizer.decode(output, skip_special_tokens=True)
        print(f"Generated Riddle {i+1}:\n{generated_text}\n{'-'*50}")

if __name__ == "__main__":
    test_model()


Generated Riddle 1:
Riddle: J Row R R Row RR R R R R R R R RR R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
--------------------------------------------------
Generated Riddle 2:
Riddle: Rangerle Danger Danger Danger D A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A
--------------------------------------------------
Generated Riddle 3:
Riddle: Ranger.
At the first-time meeting of the new 150th Assembly District, which includes all of the Town of Haverstraw, the district's new chairwoman said her goal for the
--------------------------------------------------
Generated Riddle 4:
Riddle:  Iam.
Amazon.com : BLUE TAILS FOR BOYS BLUE AND GREY SWEATSHIRT BOYS CLOTHING.
--------------------------------------------------
Generated Riddle 5:
Riddle:
All the above information is correct. However, you can contact your doctor if you have any concerns.
I've just had a coughing fit. What should I do now?
If you've just had
---------------------------------------

As the dataset is very small and the model is not fine tuned enough to generate any senseable riddles with a Training loss of around 10. So the riddles are not making any sense and even they are not riddles.

In [None]:
!zip -r Numenigma_tinyLlama.zip ./fine_tuned_tinyllama

  adding: fine_tuned_tinyllama/ (stored 0%)
  adding: fine_tuned_tinyllama/generation_config.json (deflated 29%)
  adding: fine_tuned_tinyllama/tokenizer_config.json (deflated 68%)
  adding: fine_tuned_tinyllama/tokenizer.json (deflated 85%)
  adding: fine_tuned_tinyllama/model.safetensors (deflated 7%)
  adding: fine_tuned_tinyllama/config.json (deflated 49%)
  adding: fine_tuned_tinyllama/tokenizer.model (deflated 55%)
  adding: fine_tuned_tinyllama/special_tokens_map.json (deflated 73%)


In [1]:
from IPython.display import FileLink
FileLink(r'Numenigma_tinyLlama.zip')