In [1]:
import os
os.environ["HF_HUB_CACHE"] = "/opt/dlami/nvme"
import pickle
import torch
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, DatasetDict
from trl import SFTConfig, SFTTrainer
import multiprocessing

[2024-07-12 11:38:43,235] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
ds = load_dataset("teknium/OpenHermes-2.5", split="train")
ds = ds.train_test_split(test_size=0.2)
ds

DatasetDict({
    train: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source'],
        num_rows: 801240
    })
    test: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source'],
        num_rows: 200311
    })
})

In [3]:
def sample_dataset(dataset_dict, percentage=0.05):
    sampled_dataset = DatasetDict()
    for split in dataset_dict.keys():
        # Calculate the number of samples to select
        sample_size = int(len(dataset_dict[split]) * percentage)
        sampled_dataset[split] = dataset_dict[split].shuffle(seed=42).select(range(sample_size))
    return sampled_dataset


In [4]:
sampled_dataset = sample_dataset(ds, 0.05)
sampled_dataset

DatasetDict({
    train: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source'],
        num_rows: 40062
    })
    test: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source'],
        num_rows: 10015
    })
})

In [5]:
def apply_list_clean(example):
    for conversation in example["conversations"]:
        if conversation.get("from") == "human":
            conversation["role"] = "user"
        elif conversation.get("from") == "gpt":
            conversation["role"] = "assistant"
        if "from" in conversation:
            del conversation["from"]
        conversation["content"] = conversation.pop("value")
    return example


In [6]:
def apply_template(example):
    example["text"] = tokenizer.apply_chat_template(example['conversations'], tokenize=False, add_generation_prompt=True)
    return example

In [7]:
num_cpus = multiprocessing.cpu_count()

In [8]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
sampled_dataset = sampled_dataset.map(apply_list_clean, num_proc=num_cpus)
sampled_dataset

Map (num_proc=192):   0%|          | 0/40062 [00:00<?, ? examples/s]

Map (num_proc=192):   0%|          | 0/10015 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source'],
        num_rows: 40062
    })
    test: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source'],
        num_rows: 10015
    })
})

In [10]:
sampled_dataset["train"][20]

{'custom_instruction': None,
 'topic': None,
 'model_name': None,
 'model': None,
 'skip_prompt_formatting': None,
 'category': None,
 'conversations': [{'content': 'You are an AI assistant. You will be given a task. You must generate a detailed and long answer.',
   'role': None,
   'weight': None},
  {'content': "Aucune référence à une base de données externe en rapport avec la sécurité n'est actuellement disponible.\n\nCould you please translate this to English?",
   'role': 'user',
   'weight': 0.0},
  {'content': 'No reference to an external database related to security is currently available.\n\nThe given sentence in French is translated to English as the statement mentioned above. It states that there is no available reference to any external database that deals with security matters at the moment.',
   'role': 'assistant',
   'weight': 1.0}],
 'views': None,
 'language': None,
 'id': None,
 'title': None,
 'idx': None,
 'hash': None,
 'avatarUrl': None,
 'system_prompt': None,


In [11]:
sampled_dataset = sampled_dataset.map(apply_template, num_proc=num_cpus)
sampled_dataset

Map (num_proc=192):   0%|          | 0/40062 [00:00<?, ? examples/s]

Map (num_proc=192):   0%|          | 0/10015 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source', 'text'],
        num_rows: 40062
    })
    test: Dataset({
        features: ['custom_instruction', 'topic', 'model_name', 'model', 'skip_prompt_formatting', 'category', 'conversations', 'views', 'language', 'id', 'title', 'idx', 'hash', 'avatarUrl', 'system_prompt', 'source', 'text'],
        num_rows: 10015
    })
})

In [12]:
sampled_dataset["train"][20]

{'custom_instruction': None,
 'topic': None,
 'model_name': None,
 'model': None,
 'skip_prompt_formatting': None,
 'category': None,
 'conversations': [{'content': 'You are an AI assistant. You will be given a task. You must generate a detailed and long answer.',
   'role': None,
   'weight': None},
  {'content': "Aucune référence à une base de données externe en rapport avec la sécurité n'est actuellement disponible.\n\nCould you please translate this to English?",
   'role': 'user',
   'weight': 0.0},
  {'content': 'No reference to an external database related to security is currently available.\n\nThe given sentence in French is translated to English as the statement mentioned above. It states that there is no available reference to any external database that deals with security matters at the moment.',
   'role': 'assistant',
   'weight': 1.0}],
 'views': None,
 'language': None,
 'id': None,
 'title': None,
 'idx': None,
 'hash': None,
 'avatarUrl': None,
 'system_prompt': None,
