# RAFT

In [14]:
! pip install -r ../requirements.txt

[0m

## Synthetic data generation phase

### Select the documents

In [15]:
import pandas as pd

# Dataset name; the clean-up cell only wipes folders whose path ends in "DEMO".
ds_name = "ucb-DEMO"
# Source document handed to the RAFT generation script.
doc_path = "../sample_data/UC_Berkeley_short.pdf"
# Output folder of the generated dataset, relative to the working directory.
ds_path = f"dataset/{ds_name}"

print("Creating dataset: ", ds_name, sep="")

Creating dataset: ucb-DEMO


### Clean up the DEMO folder

In [16]:
# Clean up demo folder only if it's a DEMO dataset, so that a real dataset is
# never deleted by re-running this cell.
if ds_path.endswith("DEMO"):
    import shutil

    # Sibling folder that holds the generation checkpoints for this dataset.
    checkpoints_path = ds_path + "-checkpoints"

    print(f"Cleaning demo folder {ds_path}")
    # ignore_errors=True: the folder may not exist on a first run.
    shutil.rmtree(ds_path, ignore_errors=True)

    # Log the path that is actually removed (the checkpoints folder), not the
    # dataset folder again.
    print(f"Cleaning demo checkpoints folder {checkpoints_path}")
    shutil.rmtree(checkpoints_path, ignore_errors=True)

Cleaning demo folder dataset/ucb-DEMO
Cleaning demo checkpoints folder dataset/ucb-DEMO


### Generate Q/A/CoT fine-tuning dataset using RAFT from the domain-specific documents

In [17]:
# Run ../raft.py on the source document and write the generated dataset to ds_path.
# IPython substitutes $doc_path and $ds_path with the Python variables of the same
# name before the shell runs; both are quoted so paths containing spaces survive.
! python3 ../raft.py \
    --datapath "$doc_path" \
    --output "$ds_path" \
    --distractors 3 \
    --doctype pdf \
    --chunk_size 512 \
    --questions 1 \
    --workers 2 \
    --system-prompt-key llama \
    --completion_model Meta-Llama-3-70B-Instruct \
    --embedding_model text-embedding-ada-002

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[32m2024-05-25 20:43:29[0m [1;30m INFO[0m [  0%] [34mraft[0m Using checkpoint chunks /workspaces/gorilla/raft/azure-ai-studio-ft/dataset/ucb-DEMO-checkpoints/chunks
[32m2024-05-25 20:43:29[0m [1;30m INFO[0m [  0%] [34mraft[0m Retrieving chunks from ../sample_data/UC_Berkeley_short.pdf of type pdf using the text-embedding-ada-002 model.
Chunking: 100%|██████████████████████| 1/1 [00:01<00:00,  1.09s/file, chunks=30]
[32m2024-05-25 20:43:30[0m [1;30m INFO[0m [  0%] [34mraft[0m Using system prompt key llama
[32m2024-05-25 20:43:30[0m [1;30m INFO[0m [  0%] [34mraft[0m Using 2 worker threads
Generating: 100%|█| 30/30 [03:20<00:00,  6.68s/chunk, qa=30, last tok/s=5.24e+4,
[32m2024-05-25 20:46:51[0m [1;30m INFO[0m [  0%] [34mraft[0m Consumed 24977 prompt tokens, 5491 completion tokens, 30468 total tokens
Sa

## Prepare training, validation and evaluation splits

In [18]:
# Arrow file read by the export step below, located inside the dataset folder.
raft_arrow_file = f"{ds_path}/data-00000-of-00001.arrow"

# Every exported file lives in "<ds_path>-files/" and is named "<ds_name>-<suffix>".
# The prefix is defined once so the folder/naming scheme cannot drift between paths.
dataset_files_prefix = f"{ds_path}-files/{ds_name}"

dataset_path = f"{dataset_files_prefix}-full.jsonl"
dataset_path_hf = f"{dataset_files_prefix}-hf.full.jsonl"

# "hf" format splits (train / validation / evaluation).
dataset_path_hf_train = f"{dataset_files_prefix}-hf.train.jsonl"
dataset_path_hf_valid = f"{dataset_files_prefix}-hf.valid.jsonl"
dataset_path_hf_eval = f"{dataset_files_prefix}-hf.eval.jsonl"

# Fine-tuning ("ft") exports derived from the corresponding "hf" splits.
dataset_path_ft_train = f"{dataset_files_prefix}-ft.train.jsonl"
dataset_path_ft_valid = f"{dataset_files_prefix}-ft.valid.jsonl"
dataset_path_ft_eval = f"{dataset_files_prefix}-ft.eval.jsonl"

print(f"Reading arrow file {raft_arrow_file}")

Reading arrow file dataset/ucb-DEMO/data-00000-of-00001.arrow


### Export dataset to JSONL

In [19]:
# Convert the generated arrow file into a single JSONL file in the "hf" output format.
# Paths are quoted so values containing spaces are passed as one argument.
! python ../format.py \
    --input "$raft_arrow_file" \
    --output "$dataset_path_hf" \
    --output-format hf

Generating train split: 30 examples [00:00, 6341.87 examples/s]
[32m2024-05-25 20:46:53[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 30 rows
[32m2024-05-25 20:46:53[0m [1;30m INFO[0m [    ] [34mraft[0m Converting arrow file dataset/ucb-DEMO/data-00000-of-00001.arrow to jsonl hf file dataset/ucb-DEMO-files/ucb-DEMO-hf.full.jsonl
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 110.36ba/s]


In [20]:
# Load the exported JSONL (one JSON record per line) and preview the first two rows.
hf_full_df = pd.read_json(path_or_buf=dataset_path_hf, lines=True)
hf_full_df.head(n=2)

Unnamed: 0,id,type,question,context,oracle_context,cot_answer,instruction
0,933dc0d8-a33c-4630-808b-1737620eeed6,general,In what year was the Mathematical Sciences Res...,"{'sentences': [['Berdahl 2004–2013 Robert J.',...","In 1964, the Free\nSpeech Movement organized s...","To answer the question, we need to identify th...",<DOCUMENT>Berdahl\n2004–2013 Robert J.</DOCUME...
1,7bc23389-399c-4c45-9904-3caaeefd8e12,general,What was Berkeley's reported alumni giving rat...,"{'sentences': [['In 1952, the university reorg...",Berkeley had originally reported that its two-...,"To answer the question, we need to determine B...","<DOCUMENT>In 1952, the university\nreorganized..."


### Do the splitting

In [21]:
# split dataset into 80%/10%/10% (train / validation / evaluation), in row order
import numpy as np  # no longer used by this cell; kept in case later cells rely on `np`

samples_count = len(hf_full_df)
train_end = int(.8*samples_count)
valid_end = int(.9*samples_count)

# Positional slicing yields the same three consecutive row ranges as
# np.split(hf_full_df, [train_end, valid_end]) without going through the
# deprecated DataFrame.swapaxes, which made np.split emit a FutureWarning.
hf_train_df = hf_full_df.iloc[:train_end]
hf_valid_df = hf_full_df.iloc[train_end:valid_end]
hf_eval_df = hf_full_df.iloc[valid_end:]

# The three splits must partition the full dataset exactly.
assert len(hf_train_df) + len(hf_valid_df) + len(hf_eval_df) == samples_count

hf_train_df.to_json(dataset_path_hf_train, orient="records", lines=True)
hf_valid_df.to_json(dataset_path_hf_valid, orient="records", lines=True)
hf_eval_df.to_json(dataset_path_hf_eval, orient="records", lines=True)

  return bound(*args, **kwds)


### Export training and validation datasets into JSONL format

In [22]:
# Convert the "hf" training split (JSONL) into the "completion" output format,
# naming the prompt column `text` and the completion column `ground_truth`.
# Each continuation backslash is preceded by a space so arguments cannot fuse
# together if the indentation of the following line ever changes.
! python ../format.py \
    --input "$dataset_path_hf_train" \
    --input-type jsonl \
    --output "$dataset_path_ft_train" \
    --output-format completion \
    --output-completion-prompt-column text \
    --output-completion-completion-column ground_truth

Generating train split: 24 examples [00:00, 3691.63 examples/s]
[32m2024-05-25 20:46:55[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 24 rows
[32m2024-05-25 20:46:55[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/ucb-DEMO-files/ucb-DEMO-hf.train.jsonl to jsonl completion file dataset/ucb-DEMO-files/ucb-DEMO-ft.train.jsonl
Filter out empty examples: 100%|███████| 24/24 [00:00<00:00, 4299.46 examples/s]
Rename fields and add <STOP> token: 100%|█| 24/24 [00:00<00:00, 7705.40 examples
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 394.39ba/s]


In [23]:
# Convert the "hf" validation split (JSONL) into the "completion" output format,
# naming the prompt column `text` and the completion column `ground_truth`.
# Each continuation backslash is preceded by a space so arguments cannot fuse
# together if the indentation of the following line ever changes.
! python ../format.py \
    --input "$dataset_path_hf_valid" \
    --input-type jsonl \
    --output "$dataset_path_ft_valid" \
    --output-format completion \
    --output-completion-prompt-column text \
    --output-completion-completion-column ground_truth

Generating train split: 3 examples [00:00, 693.39 examples/s]
[32m2024-05-25 20:46:57[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 3 rows
[32m2024-05-25 20:46:57[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/ucb-DEMO-files/ucb-DEMO-hf.valid.jsonl to jsonl completion file dataset/ucb-DEMO-files/ucb-DEMO-ft.valid.jsonl
Filter out empty examples: 100%|██████████| 3/3 [00:00<00:00, 673.13 examples/s]
Rename fields and add <STOP> token: 100%|█| 3/3 [00:00<00:00, 1232.17 examples/s
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 468.43ba/s]


### Export evaluation dataset into JSONL format

In [24]:
# Convert the "hf" evaluation split (JSONL) into the "eval" output format.
# Paths are quoted so values containing spaces are passed as one argument.
! python ../format.py \
    --input "$dataset_path_hf_eval" \
    --input-type jsonl \
    --output "$dataset_path_ft_eval" \
    --output-format eval

Generating train split: 3 examples [00:00, 725.41 examples/s]
[32m2024-05-25 20:46:59[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 3 rows
[32m2024-05-25 20:46:59[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/ucb-DEMO-files/ucb-DEMO-hf.eval.jsonl to jsonl eval file dataset/ucb-DEMO-files/ucb-DEMO-ft.eval.jsonl
Filter out empty examples: 100%|██████████| 3/3 [00:00<00:00, 670.55 examples/s]
Map: 100%|███████████████████████████████| 3/3 [00:00<00:00, 1280.18 examples/s]
Map: 100%|███████████████████████████████| 3/3 [00:00<00:00, 1422.76 examples/s]
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 434.15ba/s]
