# RAFT

In [12]:
! pip install -r ../requirements.txt

[0m

## Synthetic data generation phase

### Select the documents

In [13]:
import pandas as pd

# Dataset identity and output location.
# NOTE: the clean-up cell of this notebook only wipes the output folder when
# its path ends with "DEMO".
ds_name = "vampire-bat-small-DEMO"
ds_path = f"dataset/{ds_name}"

# Source document handed to raft.py (the path contains spaces, so it must be
# quoted wherever it is passed to a shell command).
doc_path = "../sample_data/vampire-bats/bats/Desmodus draculae - Wikipedia.pdf"

print("Creating dataset: " + ds_name)

Creating dataset: vampire-bat-small-DEMO


### Clean up the DEMO folder

In [14]:
# Remove previous results, but only for datasets whose path ends with "DEMO",
# so that a non-demo dataset is never deleted by this cell.
if ds_path.endswith("DEMO"):
    import shutil

    checkpoints_path = ds_path + "-checkpoints"

    print(f"Cleaning demo folder {ds_path}")
    # ignore_errors=True turns the call into a no-op when the folder is missing.
    shutil.rmtree(ds_path, ignore_errors=True)

    # Report the folder that is actually removed; the message used to repeat
    # ds_path instead of the checkpoints path.
    print(f"Cleaning demo checkpoints folder {checkpoints_path}")
    shutil.rmtree(checkpoints_path, ignore_errors=True)

Cleaning demo folder dataset/vampire-bat-small-DEMO
Cleaning demo checkpoints folder dataset/vampire-bat-small-DEMO


### Generate Q/A/CoT fine-tuning dataset using RAFT from the domain-specific documents

In [15]:
! python3 ../raft.py \
    --datapath "$doc_path" \
    --output $ds_path \
    --distractors 3 \
    --doctype pdf \
    --chunk_size 512 \
    --questions 1 \
    --workers 2 \
    --system-prompt-key llama \
    --completion_model Meta-Llama-3-70B-Instruct \
    --embedding_model text-embedding-ada-002

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[32m2024-05-23 19:06:48[0m [1;30m INFO[0m [  0%] [34mraft[0m Using checkpoint chunks /workspaces/gorilla/raft/azure-ai-studio-ft/dataset/vampire-bat-small-DEMO-checkpoints/chunks
[32m2024-05-23 19:06:48[0m [1;30m INFO[0m [  0%] [34mraft[0m Retrieving chunks from ../sample_data/vampire-bats/bats/Desmodus draculae - Wikipedia.pdf of type pdf using the text-embedding-ada-002 model.
Chunking: 100%|██████████████████████| 1/1 [00:01<00:00,  1.81s/file, chunks=10]
[32m2024-05-23 19:06:50[0m [1;30m INFO[0m [  0%] [34mraft[0m Using system prompt key llama
[32m2024-05-23 19:06:50[0m [1;30m INFO[0m [  0%] [34mraft[0m Using 2 worker threads
Generating: 100%|█| 10/10 [02:01<00:00, 12.20s/chunk, qa=10, last tok/s=5.07e+4,
[32m2024-05-23 19:08:52[0m [1;30m INFO[0m [  0%] [34mraft[0m Consumed 8530 prompt tokens, 21

## Prepare training, validation and evaluation splits

In [16]:
# Arrow file expected inside the RAFT output folder.
raft_arrow_file = f"{ds_path}/data-00000-of-00001.arrow"

# All exported JSONL files share the "<ds_path>-files/<ds_name>" prefix.
files_prefix = f"{ds_path}-files/{ds_name}"

dataset_path = f"{files_prefix}-full.jsonl"
dataset_path_hf = f"{files_prefix}-hf.full.jsonl"

# Per-split files in the "hf" format.
dataset_path_hf_train, dataset_path_hf_valid, dataset_path_hf_eval = (
    f"{files_prefix}-hf.{split}.jsonl" for split in ("train", "valid", "eval")
)

# Per-split files in the "ft" format.
dataset_path_ft_train, dataset_path_ft_valid, dataset_path_ft_eval = (
    f"{files_prefix}-ft.{split}.jsonl" for split in ("train", "valid", "eval")
)

print(f"Reading arrow file {raft_arrow_file}")

Reading arrow file dataset/vampire-bat-small-DEMO/data-00000-of-00001.arrow


### Export dataset to JSONL

In [17]:
! python ../format.py \
    --input $raft_arrow_file \
    --output $dataset_path_hf \
    --output-format hf

Generating train split: 10 examples [00:00, 2609.21 examples/s]
[32m2024-05-23 19:08:56[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 10 rows
[32m2024-05-23 19:08:56[0m [1;30m INFO[0m [    ] [34mraft[0m Converting arrow file dataset/vampire-bat-small-DEMO/data-00000-of-00001.arrow to jsonl hf file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-hf.full.jsonl
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 154.59ba/s]


In [18]:
# Load the exported JSONL file (one JSON record per line) into a DataFrame.
hf_full_df = pd.read_json(path_or_buf=dataset_path_hf, lines=True)

# Preview the first two records.
hf_full_df.head(n=2)

Unnamed: 0,id,type,question,context,oracle_context,cot_answer,instruction
0,de00ffcb-35b7-4eb0-9411-a1e575f24b96,general,Who is S. Turvey?,{'sentences': [['The date and reason for its e...,"Turvey, S.","To answer the question, we need to identify wh...",<DOCUMENT>The date and reason for its extincti...
1,d52a5ed2-58d4-494c-aca9-73889120a952,general,What was the estimated wingspan of D. draculae?,{'sentences': [['Its skull was long and narrow...,"Its skull was long and narrow, and its face ha...","To answer the question, we need to identify th...","<DOCUMENT>Its skull was long and narrow, and i..."


### Do the splitting

In [19]:
# Split the dataset into 80% train / 10% validation / 10% evaluation,
# keeping the original row order (no shuffling).
import numpy as np  # no longer used by the split; kept so `np` stays defined for other cells

samples_count = len(hf_full_df)
train_end = int(.8 * samples_count)
valid_end = int(.9 * samples_count)

# Positional slices produce the same three frames as
# np.split(hf_full_df, [train_end, valid_end]) without passing a DataFrame
# through NumPy, which triggers a FutureWarning on recent pandas versions.
hf_train_df = hf_full_df.iloc[:train_end]
hf_valid_df = hf_full_df.iloc[train_end:valid_end]
hf_eval_df = hf_full_df.iloc[valid_end:]

# Every row must land in exactly one split.
assert len(hf_train_df) + len(hf_valid_df) + len(hf_eval_df) == samples_count

# Write each split as JSONL (one record per line).
hf_train_df.to_json(dataset_path_hf_train, orient="records", lines=True)
hf_valid_df.to_json(dataset_path_hf_valid, orient="records", lines=True)
hf_eval_df.to_json(dataset_path_hf_eval, orient="records", lines=True)

  return bound(*args, **kwds)


### Export training and validation datasets into JSONL format

In [20]:
! python ../format.py \
    --input $dataset_path_hf_train \
    --input-type jsonl \
    --output $dataset_path_ft_train \
    --output-format completion \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

Generating train split: 8 examples [00:00, 1536.02 examples/s]
[32m2024-05-23 19:08:58[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 8 rows
[32m2024-05-23 19:08:58[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-hf.train.jsonl to jsonl completion file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-ft.train.jsonl
Filter out empty examples: 100%|█████████| 8/8 [00:00<00:00, 1398.16 examples/s]
Rename fields and add <STOP> token: 100%|█| 8/8 [00:00<00:00, 2559.26 examples/s
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 418.43ba/s]


In [21]:
! python ../format.py \
    --input $dataset_path_hf_valid \
    --input-type jsonl \
    --output $dataset_path_ft_valid \
    --output-format completion \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

Generating train split: 1 examples [00:00, 232.05 examples/s]
[32m2024-05-23 19:09:00[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 1 rows
[32m2024-05-23 19:09:00[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-hf.valid.jsonl to jsonl completion file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-ft.valid.jsonl
Filter out empty examples: 100%|██████████| 1/1 [00:00<00:00, 202.24 examples/s]
Rename fields and add <STOP> token: 100%|█| 1/1 [00:00<00:00, 390.57 examples/s]
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 483.33ba/s]


### Export evaluation datasets into JSONL format

In [22]:
! python ../format.py \
    --input $dataset_path_hf_eval \
    --input-type jsonl \
    --output $dataset_path_ft_eval \
    --output-format eval

Generating train split: 1 examples [00:00, 235.08 examples/s]
[32m2024-05-23 19:09:02[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 1 rows
[32m2024-05-23 19:09:02[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-hf.eval.jsonl to jsonl eval file dataset/vampire-bat-small-DEMO-files/vampire-bat-small-DEMO-ft.eval.jsonl
Filter out empty examples: 100%|██████████| 1/1 [00:00<00:00, 225.62 examples/s]
Map: 100%|████████████████████████████████| 1/1 [00:00<00:00, 418.89 examples/s]
Map: 100%|████████████████████████████████| 1/1 [00:00<00:00, 515.21 examples/s]
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 445.63ba/s]
