# RAFT

In [1]:
! pip install -r ../requirements.txt

[0m

## Synthetic data generation phase

### Select the documents

In [2]:
import pandas as pd
ds_name = "ucb-DEMO"
doc_path = "../sample_data/UC_Berkeley_short.pdf"
ds_path = f"dataset/{ds_name}"
print("Creating dataset: " + ds_name)

Creating dataset: ucb-DEMO


### Clean up the DEMO folder

In [3]:
# Clean up demo folder only if it's a DEMO dataset
if ds_path.endswith("DEMO"):
    import shutil
    print(f"Cleaning demo folder {ds_path}")
    shutil.rmtree(ds_path, ignore_errors=True)
    print(f"Cleaning demo checkpoints folder {ds_path}")
    shutil.rmtree(ds_path + "-checkpoints", ignore_errors=True)

Cleaning demo folder dataset/ucb-DEMO
Cleaning demo checkpoints folder dataset/ucb-DEMO


### Generate Q/A/CoT fine-tuning dataset using RAFT from the domain specific documents

In [4]:
! python3 ../raft.py \
    --datapath "$doc_path" \
    --output $ds_path \
    --distractors 3 \
    --doctype pdf \
    --chunk_size 512 \
    --questions 1 \
    --workers 2 \
    --system-prompt-key llama \
    --completion_model Meta-Llama-3-70B-Instruct \
    --embedding_model text-embedding-ada-002

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[32m2024-05-29 01:09:41[0m [1;30m INFO[0m [  0%] [34mraft[0m Using checkpoint chunks /workspaces/gorilla/raft/azure-ai-studio-ft/dataset/ucb-DEMO-checkpoints/chunks
[32m2024-05-29 01:09:41[0m [1;30m INFO[0m [  0%] [34mraft[0m Retrieving chunks from ../sample_data/UC_Berkeley_short.pdf of type pdf using the text-embedding-ada-002 model.
Chunking: 100%|██████████████████████| 1/1 [00:01<00:00,  1.48s/file, chunks=30]
[32m2024-05-29 01:09:43[0m [1;30m INFO[0m [  0%] [34mraft[0m Using system prompt key llama
[32m2024-05-29 01:09:43[0m [1;30m INFO[0m [  0%] [34mraft[0m Using 2 worker threads
Generating: 100%|█| 30/30 [03:05<00:00,  6.18s/chunk, qa=30, last tok/s=1.05e+4,
[32m2024-05-29 01:12:50[0m [1;30m INFO[0m [  0%] [34mraft[0m Consumed 24964 prompt tokens, 5776 completion tokens, 30740 total tokens
Sa

## Prepare training, validation and evaluation splits

In [3]:
raft_arrow_file = f"{ds_path}/data-00000-of-00001.arrow"
dataset_path = f"{ds_path}-files/{ds_name}-full.jsonl"
dataset_path_hf = f"{ds_path}-files/{ds_name}-hf.full.jsonl"

dataset_path_hf_train = f"{ds_path}-files/{ds_name}-hf.train.jsonl"
dataset_path_hf_valid = f"{ds_path}-files/{ds_name}-hf.valid.jsonl"
dataset_path_hf_eval = f"{ds_path}-files/{ds_name}-hf.eval.jsonl"

dataset_path_ft_train = f"{ds_path}-files/{ds_name}-ft.train.jsonl"
dataset_path_ft_valid = f"{ds_path}-files/{ds_name}-ft.valid.jsonl"
dataset_path_ft_eval = f"{ds_path}-files/{ds_name}-ft.eval.jsonl"

print(f"Reading arrow file {raft_arrow_file}")

Reading arrow file dataset/ucb-DEMO/data-00000-of-00001.arrow


### Export dataset to JSONL

In [4]:
! python ../format.py \
    --input $raft_arrow_file \
    --output $dataset_path_hf \
    --output-format hf

[32m2024-05-29 01:44:48[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 30 rows
[32m2024-05-29 01:44:48[0m [1;30m INFO[0m [    ] [34mraft[0m Converting arrow file dataset/ucb-DEMO/data-00000-of-00001.arrow to jsonl hf file dataset/ucb-DEMO-files/ucb-DEMO-hf.full.jsonl
Creating json from Arrow format: 100%|████████████| 1/1 [00:00<00:00, 37.33ba/s]


In [5]:
hf_full_df = pd.read_json(dataset_path_hf, lines=True)
hf_full_df.head(2)

Unnamed: 0,id,type,question,context,oracle_context,cot_answer,instruction
0,daec9f54-28e1-4c1a-80ca-5de787542796,general,What was the reason for Jack Weinberg's arrest...,{'sentences': [['The school later told U.S. Ne...,"In 1964, the Free\nSpeech Movement organized s...","To answer the question, we need to identify th...",<DOCUMENT>The school\nlater told U.S. News the...
1,f3dbd73d-ef4f-4abd-9446-efcec12712ec,general,What was Berkeley's originally reported two-ye...,"{'sentences': [['Physics professor J.', 'Unive...",Berkeley had originally reported that its two-...,"To answer the question, we need to identify Be...",<DOCUMENT>Physics professor J.</DOCUMENT>\n<DO...


### Do the splitting

In [6]:
# split dataset into 80%/10%/10%
import numpy as np
samples_count = len(hf_full_df)
hf_train_df, hf_valid_df, hf_eval_df = np.split(hf_full_df, [int(.8*samples_count), int(.9*samples_count)])
hf_train_df.to_json(dataset_path_hf_train, orient="records", lines=True)
hf_valid_df.to_json(dataset_path_hf_valid, orient="records", lines=True)
hf_eval_df.to_json(dataset_path_hf_eval, orient="records", lines=True)

  return bound(*args, **kwds)


### Export training and validation datasets into JSONL format

In [7]:
! python ../format.py \
    --input $dataset_path_hf_train \
    --input-type jsonl \
    --output $dataset_path_ft_train \
    --output-format completion \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

Generating train split: 24 examples [00:00, 4515.87 examples/s]
[32m2024-05-29 01:44:56[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 24 rows
[32m2024-05-29 01:44:56[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/ucb-DEMO-files/ucb-DEMO-hf.train.jsonl to jsonl completion file dataset/ucb-DEMO-files/ucb-DEMO-ft.train.jsonl
Filter out empty examples: 100%|███████| 24/24 [00:00<00:00, 4229.73 examples/s]
Rename fields and add <STOP> token: 100%|█| 24/24 [00:00<00:00, 8546.72 examples
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 368.99ba/s]


In [8]:
! python ../format.py \
    --input $dataset_path_hf_valid \
    --input-type jsonl \
    --output $dataset_path_ft_valid \
    --output-format completion \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

Generating train split: 3 examples [00:00, 656.01 examples/s]
[32m2024-05-29 01:44:59[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 3 rows
[32m2024-05-29 01:44:59[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/ucb-DEMO-files/ucb-DEMO-hf.valid.jsonl to jsonl completion file dataset/ucb-DEMO-files/ucb-DEMO-ft.valid.jsonl
Filter out empty examples: 100%|██████████| 3/3 [00:00<00:00, 641.36 examples/s]
Rename fields and add <STOP> token: 100%|█| 3/3 [00:00<00:00, 1126.69 examples/s
Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 302.75ba/s]


In [9]:
dataset_path_ft_valid_df = pd.read_json(dataset_path_ft_valid, lines=True)
dataset_path_ft_valid_df.head(2)

Unnamed: 0,text,ground_truth
0,<DOCUMENT>Holden\n1888–1890 Horace Davis\n1890...,"To answer the question, we need to identify th..."
1,<DOCUMENT>Strong\n1965–1965 Martin E.</DOCUMEN...,"To answer the question, we need to identify th..."


### Export evaluation dataset into JSONL format

In [10]:
! python ../format.py \
    --input $dataset_path_hf_eval \
    --input-type jsonl \
    --output $dataset_path_ft_eval \
    --output-format eval

Generating train split: 3 examples [00:00, 651.90 examples/s]
[32m2024-05-29 01:45:09[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 3 rows
[32m2024-05-29 01:45:09[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/ucb-DEMO-files/ucb-DEMO-hf.eval.jsonl to jsonl eval file dataset/ucb-DEMO-files/ucb-DEMO-ft.eval.jsonl
Filter out empty examples: 100%|██████████| 3/3 [00:00<00:00, 471.08 examples/s]
Map: 100%|████████████████████████████████| 3/3 [00:00<00:00, 974.21 examples/s]
Map: 100%|███████████████████████████████| 3/3 [00:00<00:00, 1309.49 examples/s]
Creating json from Arrow format: 100%|████████████| 1/1 [00:00<00:00, 50.96ba/s]


In [11]:
dataset_path_ft_eval_df = pd.read_json(dataset_path_ft_eval, lines=True)
dataset_path_ft_eval_df.head(2)

Unnamed: 0,question,gold_final_answer,context
0,What is considered the unofficial flagship of ...,Berkeley,<DOCUMENT>Although the University of Californi...
1,In what year was the Simons Institute for the ...,2012,<DOCUMENT>Birgeneau\n2013–2017 Nicholas B.</DO...
