## Setup

In [1]:
# Make sure transformers is up to date!

!pip install --upgrade transformers



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset,concatenate_datasets,DatasetDict,load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


## Load model

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('stabilityai/stablelm-zephyr-3b')
model = AutoModelForCausalLM.from_pretrained(
    'stabilityai/stablelm-zephyr-3b',
    device_map="auto",
    trust_remote_code=True
)

# Test from huggingface's website:
prompt = [{'role': 'user', 'content': 'List 3 synonyms for the word "tiny"'}]
inputs = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    return_tensors='pt'
)

tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=1024,
    temperature=0.8,
    do_sample=True
)

print(tokenizer.decode(tokens[0], skip_special_tokens=False))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|user|>
List 3 synonyms for the word "tiny"<|endoftext|>
<|assistant|>
1. tomity 
2. miniaturity 
3. petiteness<|endoftext|>


In [84]:
tokenizer.model_max_length

2048

## Load datasets

### 1. math_dataset from huggingface

In [2]:
math_dataset_options = ['algebra__linear_1d', 'algebra__linear_1d_composed', 'algebra__linear_2d', 'algebra__linear_2d_composed', 'algebra__polynomial_roots', 'algebra__polynomial_roots_composed', 'algebra__sequence_next_term', 'algebra__sequence_nth_term', 'arithmetic__add_or_sub','arithmetic__add_or_sub_in_base', 'arithmetic__add_sub_multiple', 'arithmetic__div', 'arithmetic__mixed', 'arithmetic__mul', 'arithmetic__mul_div_multiple', 'arithmetic__nearest_integer_root', 'arithmetic__simplify_surd', 'calculus__differentiate', 'calculus__differentiate_composed', 'comparison__closest', 'comparison__closest_composed', 'comparison__kth_biggest', 'comparison__kth_biggest_composed', 'comparison__pair', 'comparison__pair_composed', 'comparison__sort', 'comparison__sort_composed', 'measurement__conversion', 'measurement__time', 'numbers__base_conversion', 'numbers__div_remainder', 'numbers__div_remainder_composed', 'numbers__gcd', 'numbers__gcd_composed', 'numbers__is_factor', 'numbers__is_factor_composed', 'numbers__is_prime', 'numbers__is_prime_composed', 'numbers__lcm', 'numbers__lcm_composed', 'numbers__list_prime_factors', 'numbers__list_prime_factors_composed', 'numbers__place_value', 'numbers__place_value_composed', 'numbers__round_number', 'numbers__round_number_composed', 'polynomials__add', 'polynomials__coefficient_named', 'polynomials__collect', 'polynomials__compose', 'polynomials__evaluate', 'polynomials__evaluate_composed', 'polynomials__expand', 'polynomials__simplify_power', 'probability__swr_p_level_set', 'probability__swr_p_sequence']
len(math_dataset_options)

56

In [29]:
# math_dataset_options = ['algebra__linear_1d', 'algebra__linear_1d_composed', 'algebra__linear_2d', 'algebra__linear_2d_composed', 'algebra__polynomial_roots', 'algebra__polynomial_roots_composed', 'algebra__sequence_next_term', 'algebra__sequence_nth_term', 'arithmetic__add_or_sub','arithmetic__add_or_sub_in_base', 'arithmetic__add_sub_multiple', 'arithmetic__div', 'arithmetic__mixed', 'arithmetic__mul', 'arithmetic__mul_div_multiple', 'arithmetic__nearest_integer_root', 'arithmetic__simplify_surd', 'calculus__differentiate', 'calculus__differentiate_composed', 'comparison__closest', 'comparison__closest_composed', 'comparison__kth_biggest', 'comparison__kth_biggest_composed', 'comparison__pair', 'comparison__pair_composed', 'comparison__sort', 'comparison__sort_composed', 'measurement__conversion', 'measurement__time', 'numbers__base_conversion', 'numbers__div_remainder', 'numbers__div_remainder_composed', 'numbers__gcd', 'numbers__gcd_composed', 'numbers__is_factor', 'numbers__is_factor_composed', 'numbers__is_prime', 'numbers__is_prime_composed', 'numbers__lcm', 'numbers__lcm_composed', 'numbers__list_prime_factors', 'numbers__list_prime_factors_composed', 'numbers__place_value', 'numbers__place_value_composed', 'numbers__round_number', 'numbers__round_number_composed', 'polynomials__add', 'polynomials__coefficient_named', 'polynomials__collect', 'polynomials__compose', 'polynomials__evaluate', 'polynomials__evaluate_composed', 'polynomials__expand', 'polynomials__simplify_power', 'probability__swr_p_level_set', 'probability__swr_p_sequence']
math_dataset_options=['algebra__linear_1d','algebra__linear_2d']
math_dataset={}

for option in math_dataset_options:
    math_dataset[option] = load_dataset("math_dataset", option, split='train')
    # if "validation" not in math_dataset[option].keys():
    #     num_examples = math_dataset[option]['train'].num_rows
    #     tmp = math_dataset[option]['train'].train_test_split(test_size=(int(num_examples*0.1))/num_examples)
    #     math_dataset[option]['train'] = tmp['train']
    #     math_dataset[option]['validation'] = tmp['test']
    #     math_dataset[option].pop('test')

In [5]:
math_dataset['algebra__linear_1d']

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1799999
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 199999
    })
})

In [6]:
math_dataset['algebra__linear_1d']['train'][3]

{'question': "b'Solve -2054 - 13213 = -345*c + 18888 for c.\\n'",
 'answer': "b'99\\n'"}

### 2. MATH dataset

https://huggingface.co/datasets/hendrycks/competition_math

In [10]:
competition_math_dataset = load_dataset("competition_math")
competition_math_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['problem', 'level', 'type', 'solution'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['problem', 'level', 'type', 'solution'],
        num_rows: 5000
    })
})

In [11]:
tmp=competition_math_dataset['test'].train_test_split(test_size=0.5)
competition_math_dataset['validation']=tmp['test']
competition_math_dataset['test']=tmp['test']

In [12]:
competition_math_dataset['train'][3]

{'problem': 'Evaluate $\\left\\lceil3\\left(6-\\frac12\\right)\\right\\rceil$.',
 'level': 'Level 3',
 'type': 'Algebra',
 'solution': 'Firstly, $3\\left(6-\\frac12\\right)=18-1-\\frac12=17-\\frac12$.  Because $0\\le\\frac12<1$, we have $\\left\\lceil17-\\frac12\\right\\rceil=\\boxed{17}$.'}

In [13]:
# for competition_math_dataset, change key "problem" to "question" and "solution" to "answer"
competition_math_dataset = competition_math_dataset.rename_column("problem", "question")
competition_math_dataset = competition_math_dataset.rename_column("solution", "answer")
# if "validation" not in competition_math_dataset.keys():
#         num_examples = competition_math_dataset['train'].num_rows
#         tmp = competition_math_dataset['train'].train_test_split(test_size=(int(num_examples*0.1))/num_examples)
#         competition_math_dataset['train'] = tmp['train']
#         competition_math_dataset['validation'] = tmp['test']
#         competition_math_dataset.pop('test')

In [14]:
competition_math_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'level', 'type', 'answer'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['question', 'level', 'type', 'answer'],
        num_rows: 2500
    })
    validation: Dataset({
        features: ['question', 'level', 'type', 'answer'],
        num_rows: 2500
    })
})

In [15]:
competition_math_dataset['train'][3]

{'question': 'Evaluate $\\left\\lceil3\\left(6-\\frac12\\right)\\right\\rceil$.',
 'level': 'Level 3',
 'type': 'Algebra',
 'answer': 'Firstly, $3\\left(6-\\frac12\\right)=18-1-\\frac12=17-\\frac12$.  Because $0\\le\\frac12<1$, we have $\\left\\lceil17-\\frac12\\right\\rceil=\\boxed{17}$.'}

### 3. Stanford Question Answering Dataset (SQuAD)

https://huggingface.co/datasets/rajpurkar/squad

https://huggingface.co/datasets/rajpurkar/squad_v2

In [85]:
squad_dataset = load_dataset("squad")

In [86]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [88]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5285
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5285
    })
})

In [89]:
squad_dataset['train'][3]

{'id': '5733be284776f41900661181',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'What is the Grotto at Notre Dame?',
 'answers': {'text': ['a Marian place of prayer and reflection'],
  'answer_start': [381]}}

In [90]:
squad_dataset=squad_dataset.map(lambda x: {"question": x["question"]+"\n"+x["context"], "answer": x["answers"]["text"][0]})
# num_examples = squad_dataset['train'].num_rows
# tmp = squad_dataset['train'].train_test_split(test_size=(int(num_examples*0.1))/num_examples)
# squad_dataset['train'] = tmp['train']
# squad_dataset['validation'] = tmp['test']

Map: 100%|██████████| 5285/5285 [00:00<00:00, 6428.81 examples/s]


In [14]:
squad_dataset['train'][3]

{'id': '56dfe78e7aa994140058e24a',
 'title': 'Pub',
 'context': "Most British pubs still have decorated signs hanging over their doors, and these retain their original function of enabling the identification of the pub. Today's pub signs almost always bear the name of the pub, both in words and in pictorial representation. The more remote country pubs often have stand-alone signs directing potential customers to their door.",
 'question': "What piece of information is almost always listed on a pub sign?\nMost British pubs still have decorated signs hanging over their doors, and these retain their original function of enabling the identification of the pub. Today's pub signs almost always bear the name of the pub, both in words and in pictorial representation. The more remote country pubs often have stand-alone signs directing potential customers to their door.",
 'answers': {'text': ['the name of the pub'], 'answer_start': [192]},
 'answer': 'the name of the pub'}

In [16]:
squad_v2_dataset = load_dataset("squad_v2")

In [17]:
tmp=squad_v2_dataset['validation'].train_test_split(test_size=0.5)
squad_v2_dataset['validation']=tmp['test']
squad_v2_dataset['test']=tmp['test']

In [18]:
squad_v2_dataset['train'][2075]

{'id': '5a8d7bf7df8bba001a0f9ab1',
 'title': 'The_Legend_of_Zelda:_Twilight_Princess',
 'context': 'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]',
 'question': 'What category of game is Legend of Zelda: Australia Twilight?',
 'answers': {'text': [], 'answer_start': []}}

In [19]:
# clean all data whihc do not have answer(answer[text] is empty)
squad_v2_dataset = squad_v2_dataset.filter(lambda x: x["answers"]["text"] != [])
squad_v2_dataset=squad_v2_dataset.map(lambda x: {"question": x["question"]+"\n"+x["context"], "answer": x["answers"]["text"][0]})
# num_examples = squad_v2_dataset['train'].num_rows
# tmp = squad_v2_dataset['train'].train_test_split(test_size=(int(num_examples*0.1))/num_examples)
# squad_v2_dataset['train'] = tmp['train']
# squad_v2_dataset['validation'] = tmp['test']

Filter:   0%|          | 0/5937 [00:00<?, ? examples/s]

Filter: 100%|██████████| 5937/5937 [00:00<00:00, 17452.71 examples/s]
Map: 100%|██████████| 2950/2950 [00:00<00:00, 6843.42 examples/s]


In [20]:
squad_v2_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'answer'],
        num_rows: 86821
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'answer'],
        num_rows: 2950
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'answer'],
        num_rows: 2950
    })
})

In [21]:
squad_v2_dataset['train'][3]

{'id': '56bf6b0f3aeaaa14008c9601',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'In what city and state did Beyonce  grow up? \nBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Hous

### 4. Science Questions Answering (SciQ)

https://huggingface.co/datasets/allenai/sciq

In [22]:
sciq_dataset = load_dataset("sciq")

In [23]:
sciq_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})

In [24]:
sciq_dataset['train'][3]

{'question': 'What is the least dangerous radioactive decay?',
 'distractor3': 'zeta decay',
 'distractor1': 'beta decay',
 'distractor2': 'gamma decay',
 'correct_answer': 'alpha decay',
 'support': 'All radioactive decay is dangerous to living things, but alpha decay is the least dangerous.'}

In [25]:
# for sciq_dataset, combine "correct answer" and "support" in a single key "answer"
sciq_dataset = sciq_dataset.map(lambda x: {"question": x["question"], "answer": x["correct_answer"] + ". " + x["support"]})

In [26]:
sciq_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer'],
        num_rows: 1000
    })
})

In [22]:
sciq_dataset['train'][3]

{'question': 'What is the least dangerous radioactive decay?',
 'distractor3': 'zeta decay',
 'distractor1': 'beta decay',
 'distractor2': 'gamma decay',
 'correct_answer': 'alpha decay',
 'support': 'All radioactive decay is dangerous to living things, but alpha decay is the least dangerous.',
 'answer': 'alpha decay. All radioactive decay is dangerous to living things, but alpha decay is the least dangerous.'}

### 5. AI2 Reasoning Challenge (ARC)

https://huggingface.co/datasets/allenai/ai2_arc

In [27]:
arc_subset=['ARC-Challenge','ARC-Easy']
arc_dataset={}
for subset in arc_subset:
    arc_dataset[subset] = load_dataset("ai2_arc", subset)

In [28]:
arc_dataset['ARC-Challenge']['train'][3]

{'id': 'Mercury_7041615',
 'question': 'Which of these do scientists offer as the most recent explanation as to why many plants and animals died out at the end of the Mesozoic era?',
 'choices': {'text': ['worldwide disease',
   'global mountain building',
   'rise of mammals that preyed upon plants and animals',
   'impact of an asteroid created dust that blocked the sunlight'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'D'}

In [29]:
def reformat_item(item):
    question = item['question']
    choices = item['choices']['text']
    labels = item['choices']['label']
    formatted_question = f"{question}\n"
    for label, choice in zip(labels, choices):
        formatted_question += f"{label}. {choice}\n"
    item['question'] = formatted_question.strip()  # Update the question field
    item['answer'] = item['answerKey'] + ". " + choices[labels.index(item['answerKey'])]
    return item

for subset in arc_subset:
    arc_dataset[subset] = arc_dataset[subset].map(reformat_item)


In [30]:
arc_dataset['ARC-Challenge']

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'answer'],
        num_rows: 1119
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'answer'],
        num_rows: 1172
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'answer'],
        num_rows: 299
    })
})

In [31]:
arc_dataset['ARC-Easy']

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'answer'],
        num_rows: 2251
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'answer'],
        num_rows: 2376
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'answer'],
        num_rows: 570
    })
})

In [32]:
arc_dataset['ARC-Challenge']['train'][3]

{'id': 'Mercury_7041615',
 'question': 'Which of these do scientists offer as the most recent explanation as to why many plants and animals died out at the end of the Mesozoic era?\nA. worldwide disease\nB. global mountain building\nC. rise of mammals that preyed upon plants and animals\nD. impact of an asteroid created dust that blocked the sunlight',
 'choices': {'text': ['worldwide disease',
   'global mountain building',
   'rise of mammals that preyed upon plants and animals',
   'impact of an asteroid created dust that blocked the sunlight'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'D',
 'answer': 'D. impact of an asteroid created dust that blocked the sunlight'}

### 6. wikipedia dataset (for RAG)

### 7. EPFL preference pairs dataset (for DPO)

## Preprocessing for MCQ

We want to put our datasets in this format:

In [3]:
example_instance = {"subject": "machine_learning", "question": "Question: MLE estimates are often undesirable because?\n\nOptions:\nA. they are biased\nB. they have high variance\nC. they are not consistent estimators\nD. None of the above\n\nAnswer:", "answer": "B"}
print(example_instance)
print(example_instance['question'])

{'subject': 'machine_learning', 'question': 'Question: MLE estimates are often undesirable because?\n\nOptions:\nA. they are biased\nB. they have high variance\nC. they are not consistent estimators\nD. None of the above\n\nAnswer:', 'answer': 'B'}
Question: MLE estimates are often undesirable because?

Options:
A. they are biased
B. they have high variance
C. they are not consistent estimators
D. None of the above

Answer:


In [21]:
def preprocess_function(examples):
    questions = examples['question']
    answers = examples['answer']
    inputs = tokenizer(questions, truncation=True, padding=True, return_tensors="pt")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(answers, truncation=True, padding=True, return_tensors="pt")
    inputs['labels'] = labels['input_ids']
    return inputs


In [None]:
tokenized_datasets = math_dataset.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

## Aggregate into single MCQ dataset

## Preprocess for SFT

In [34]:
def create_conversation(sample):
  return {
    "messages": [
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

In [35]:
competition_math_dataset_chatformate = competition_math_dataset.map(create_conversation, remove_columns=competition_math_dataset['train'].features,batched=False)
final_dataset = competition_math_dataset_chatformate.shuffle()
print(final_dataset)

datasets_list = []
sciq_dataset_chatformate = sciq_dataset.map(create_conversation, remove_columns=sciq_dataset['train'].features,batched=False)
datasets_list.append(sciq_dataset_chatformate)

for key in ["train","validation","test"]:
    final_dataset[key] = concatenate_datasets([final_dataset[key], datasets_list[0][key]]).shuffle()

print(final_dataset)

datasets_list = []
datasets_list.append(final_dataset)

arc_dataset_chatformate={}
for subset in arc_subset:
    arc_dataset_chatformate[subset] = arc_dataset[subset].map(create_conversation, remove_columns=arc_dataset[subset]['train'].features,batched=False)
datasets_list.extend(list(arc_dataset_chatformate.values()))
for key in ["train","validation","test"]:
    final_dataset[key] = concatenate_datasets([dataset[key] for dataset in datasets_list]).shuffle()
print(final_dataset)

datasets_list = []
squad_v2_dataset_chatformate = squad_v2_dataset.map(create_conversation, remove_columns=squad_v2_dataset['train'].features,batched=False)
datasets_list.append(squad_v2_dataset_chatformate)

for key in ["train","validation","test"]:
    final_dataset[key] = concatenate_datasets([final_dataset[key], datasets_list[0][key]]).shuffle()
print(final_dataset)


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map: 100%|██████████| 2500/2500 [00:00<00:00, 9239.77 examples/s]


DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 2500
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 2500
    })
})
DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 19179
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 3500
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 3500
    })
})
DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 22549
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 7048
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 4369
    })
})


Map: 100%|██████████| 2950/2950 [00:00<00:00, 14230.36 examples/s]

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 109370
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 9998
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 7319
    })
})





In [37]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 109370
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 9998
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 7319
    })
})

In [36]:
final_dataset['train'].to_json('sft_train_dataset.json', orient="records")
final_dataset['validation'].to_json('sft_validation_dataset.json', orient="records")
final_dataset['test'].to_json('sft_test_dataset.json', orient="records")

Creating json from Arrow format:   0%|          | 0/110 [00:00<?, ?ba/s]

Creating json from Arrow format: 100%|██████████| 110/110 [00:02<00:00, 52.80ba/s]
Creating json from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 61.60ba/s]
Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 53.81ba/s]


6748124

In [None]:
i=0
for option in math_dataset_options:
    i+=1
    print("sub datset:",str(i)+" "+option)
    train_file_path = f"sft_train_dataset_{option}.json"
    validation_file_path = f"sft_validation_dataset_{option}.json"

    math_dataset_chatformate={}
    #take only 10% of the dataset
    math_dataset[option] = math_dataset[option].train_test_split(test_size=0.1)
    

    math_dataset_chatformate[option] = math_dataset[option].map(create_conversation, remove_columns=math_dataset[option]['train'].features,batched=False)
    math_dataset_chatformate[option]["train"].to_json(train_file_path, orient="records")
    math_dataset_chatformate[option]["validation"].to_json(validation_file_path, orient="records")

print("finished!")

In [5]:
i=0
for option in math_dataset_options:
    i+=1
    print("sub datset:",str(i)+" "+option)
    train_file_path = f"sft_train_dataset_{option}.json"
    validation_file_path = f"sft_validation_dataset_{option}.json"

    math_dataset_chatformate={}
    math_dataset_chatformate[option] = math_dataset[option].map(create_conversation, remove_columns=math_dataset[option]['train'].features,batched=False)
    math_dataset_chatformate[option]["train"].to_json(train_file_path, orient="records")
    math_dataset_chatformate[option]["validation"].to_json(validation_file_path, orient="records")

print("finished!")

sub datset: 1 algebra__linear_1d


Map:  13%|█▎        | 228689/1799999 [00:21<02:26, 10702.49 examples/s]


KeyboardInterrupt: 

In [38]:
final_dataset=load_dataset("json", data_files={"train": "SFT_data/sft_train_dataset.json", "validation": "SFT_data/sft_validation_dataset.json"})
final_dataset

Generating train split: 109370 examples [00:00, 392844.35 examples/s]
Generating validation split: 7319 examples [00:00, 399221.16 examples/s]


DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 109370
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 7319
    })
})

In [4]:
final_dataset=load_dataset("json", data_files={"train": "sft_train_dataset.json", "validation": "sft_validation_dataset.json"})
i=0
for option in math_dataset_options:
    i+=1
    print("sub datset:",str(i)+" "+option+"/"+str(len(math_dataset_options)))
    train_file_path = f"sft_train_dataset_{option}.json"
    validation_file_path = f"sft_validation_dataset_{option}.json"
    math_dataset_chatformate_sub = load_dataset("json", data_files={"train": train_file_path, "validation": validation_file_path})
    for key in ["train","validation"]:
        final_dataset[key] = concatenate_datasets([final_dataset[key], math_dataset_chatformate_sub[key]]).shuffle()

final_dataset["train"].to_json("sft_train_dataset_all.json", orient="records")
final_dataset["validation"].to_json("sft_validation_dataset_all.json", orient="records")

sub datset: 1 algebra__linear_1d/56
sub datset: 2 algebra__linear_1d_composed/56
sub datset: 3 algebra__linear_2d/56


Generating train split: 1799999 examples [00:01, 1564835.52 examples/s]
Generating validation split: 199999 examples [00:00, 1517221.52 examples/s]


sub datset: 4 algebra__linear_2d_composed/56


Generating train split: 1799999 examples [00:01, 1175372.11 examples/s]
Generating validation split: 199999 examples [00:00, 1189261.93 examples/s]


sub datset: 5 algebra__polynomial_roots/56


Generating train split: 1799999 examples [00:01, 1390340.65 examples/s]
Generating validation split: 199999 examples [00:00, 1330019.51 examples/s]


sub datset: 6 algebra__polynomial_roots_composed/56


Generating train split: 1799999 examples [00:01, 1289591.24 examples/s]
Generating validation split: 199999 examples [00:00, 1110450.62 examples/s]


sub datset: 7 algebra__sequence_next_term/56


Generating train split: 1799999 examples [00:01, 1567706.03 examples/s]
Generating validation split: 199999 examples [00:00, 1550217.15 examples/s]


sub datset: 8 algebra__sequence_nth_term/56


Generating train split: 1799999 examples [00:01, 1252343.33 examples/s]
Generating validation split: 199999 examples [00:00, 1021385.31 examples/s]


sub datset: 9 arithmetic__add_or_sub/56


Generating train split: 1799999 examples [00:01, 1734000.94 examples/s]
Generating validation split: 199999 examples [00:00, 1595260.57 examples/s]


sub datset: 10 arithmetic__add_or_sub_in_base/56
sub datset: 11 arithmetic__add_sub_multiple/56
sub datset: 12 arithmetic__div/56
sub datset: 13 arithmetic__mixed/56
sub datset: 14 arithmetic__mul/56
sub datset: 15 arithmetic__mul_div_multiple/56
sub datset: 16 arithmetic__nearest_integer_root/56
sub datset: 17 arithmetic__simplify_surd/56
sub datset: 18 calculus__differentiate/56
sub datset: 19 calculus__differentiate_composed/56
sub datset: 20 comparison__closest/56
sub datset: 21 comparison__closest_composed/56
sub datset: 22 comparison__kth_biggest/56
sub datset: 23 comparison__kth_biggest_composed/56
sub datset: 24 comparison__pair/56
sub datset: 25 comparison__pair_composed/56
sub datset: 26 comparison__sort/56
sub datset: 27 comparison__sort_composed/56
sub datset: 28 measurement__conversion/56
sub datset: 29 measurement__time/56
sub datset: 30 numbers__base_conversion/56
sub datset: 31 numbers__div_remainder/56
sub datset: 32 numbers__div_remainder_composed/56
sub datset: 33 nu

Creating json from Arrow format: 100%|██████████| 101058/101058 [33:09<00:00, 50.80ba/s] 
Creating json from Arrow format: 100%|██████████| 11229/11229 [03:14<00:00, 57.62ba/s]


1852263434