In [15]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell, so edits to imported
# source files take effect without restarting the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Install SDG
 - git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research
 - git checkout sdg_demo

In [None]:
!pip install -r ../SDG-Research/requirements.txt
!pip install -e ../SDG-Research/.
!pip install rich datasets tabulate transformers
!pip install docling==1.20.0

In [1]:
# Standard
import os

# Third Party
from datasets import load_dataset
from openai import OpenAI
import click

# First Party
from instructlab.sdg.flow import Flow
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from instructlab.sdg.utils.docprocessor import DocProcessor
from utils.data import postprocess_and_save, pretty_print_dict

### Create Seed Data

In [22]:
!OMP_NUM_THREADS=32 mamba run -n docling python /workspace/home/lab/abhi/sdg_demo/SDG-Research/scripts/docparser.py --input-dir {data_dir} --output-dir {data_dir}

Not a conda environment: /workspace/home/lab/.conda/envs/docling

EnvironmentLocationNotFound: Not a conda environment: /workspace/home/lab/.conda/envs/docling



In [5]:
# Run the document parser over document_collection/RBC and write its output back
# into the same directory (presumably this converts the PDFs into the form that
# DocProcessor reads later — verify against scripts/docparser.py).
!python scripts/docparser.py --input-dir document_collection/RBC --output-dir document_collection/RBC

Fetching 10 files:   0%|                                 | 0/10 [00:00<?, ?it/s]
README.md: 100%|███████████████████████████| 3.49k/3.49k [00:00<00:00, 52.6MB/s][A

model.pt:   0%|                                      | 0.00/201M [00:00<?, ?B/s][A

tm_config.json: 100%|██████████████████████| 7.09k/7.09k [00:00<00:00, 81.9MB/s][A[A


.gitattributes: 100%|██████████████████████| 1.71k/1.71k [00:00<00:00, 22.7MB/s][A[A
Fetching 10 files:  10%|██▌                      | 1/10 [00:00<00:03,  2.37it/s]

otslp_all_standard_094_clean.check:   0%|            | 0.00/213M [00:00<?, ?B/s][A[A


config.json: 100%|████████████████████████████| 41.0/41.0 [00:00<00:00, 732kB/s][A[A[A



.gitignore: 100%|██████████████████████████| 5.18k/5.18k [00:00<00:00, 54.5MB/s][A[A[A



model.pt:   0%|                                      | 0.00/169M [00:00<?, ?B/s][A[A[A
model.pt:  10%|███▏                          | 21.0M/201M [00:00<00:01, 147MB/s][A



otslp_all_fast.check:   0%|            

In [6]:
# Directory that receives every artifact produced by this notebook.
output_dir = "sdg_demo_output/"
# Create it up front so a fresh checkout can write seed_data.jsonl without
# relying on the dataset writer to create missing parent directories.
os.makedirs(output_dir, exist_ok=True)

# This is where your PDFs are stored; it also holds your QNA yaml file.
data_dir = 'document_collection/RBC'
dp = DocProcessor(data_dir, user_config_path=f'{data_dir}/qna.yaml')
seed_data = dp.get_processed_dataset()

# os.path.join copes with the trailing slash of output_dir, so the path no
# longer contains a doubled separator.
seed_data_path = os.path.join(output_dir, 'seed_data.jsonl')
seed_data.to_json(seed_data_path, orient='records', lines=True)
pretty_print_dict(seed_data_path)


tokenizer_config.json:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1404 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Generating train split: 0 examples [00:00, ? examples/s]

### Setup OpenAI Client for interacting with the model

In [None]:
# Endpoint and key are read from the environment so that credentials never have
# to be pasted into (and committed with) the notebook. The fallbacks reproduce
# the previous hard-coded placeholders.
endpoint = os.environ.get("OPENAI_BASE_URL", "https://xxx.com:443/v1")
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
openai_api_base = endpoint

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The first model advertised by the endpoint is used as the teacher model.
served_models = client.models.list().data
if not served_models:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError(f"No models are served at {openai_api_base}")
teacher_model = served_models[0].id
print(teacher_model)

mistralai/Mixtral-8x7B-Instruct-v0.1


### Run SDG
- This will create the knowledge flow from the provided yaml file
- We will run this on a small dataset for demo purposes
- For large-scale generation, please use the python command provided in the "Run SDG through python command" section below
- You can analyze the generated data to ensure the quality is similar to the provided QnA pairs

In [8]:
# YAML file describing the knowledge-generation flow.
knowledge_agentic_pipeline = "src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"

# Build the flow configuration against the OpenAI client created above.
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)

# Wrap the flow in a single pipeline and hand it to the SDG driver.
knowledge_pipeline = Pipeline(flow_cfg)
sdg = SDG([knowledge_pipeline], num_workers=1, batch_size=1, save_freq=1000)

In [9]:
# Size of the demo subset drawn from the seed data.
number_of_samples = 5

# Load the seed data, shuffle it with a fixed seed and keep the first
# `number_of_samples` rows.
ds = (
    load_dataset('json', data_files=f'{output_dir}/seed_data.jsonl', split='train')
    .shuffle(seed=42)
    .select(range(number_of_samples))
)

In [10]:
# Intermediate datasets are saved under the checkpoint directory ("Tmp")
# while generation runs.
generated_data = sdg.generate(
    ds,
    checkpoint_dir="Tmp",
)

100%|██████████| 5/5 [00:00<00:00, 119156.36it/s]


  0%|          | 0/5 [00:00<?, ?it/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

 20%|██        | 1/5 [01:48<07:13, 108.27s/it]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

 40%|████      | 2/5 [03:22<05:00, 100.12s/it]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

 60%|██████    | 3/5 [04:48<03:06, 93.36s/it] 

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/27 [00:00<?, ? examples/s]

Filter:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25 [00:00<?, ? examples/s]

 80%|████████  | 4/5 [05:56<01:23, 83.65s/it]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Filter:   0%|          | 0/34 [00:00<?, ? examples/s]

Filter:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Filter:   0%|          | 0/31 [00:00<?, ? examples/s]

Filter:   0%|          | 0/31 [00:00<?, ? examples/s]

100%|██████████| 5/5 [07:30<00:00, 90.12s/it]


### Run SDG through python command (For large scale generation)

```bash
python /home/lab/sdg/scripts/generate.py --ds_path {output_dir}/seed_data.jsonl --bs 8 --num_workers 8 --save_path {output_dir}/gen.jsonl --flow SynthKnowledgeFlow1.5 --endpoint {teacher_endpoint_url} --checkpoint_dir {output_dir}/data_checkpoints --save_freq 2
```

### Save the generated data into training format

In [11]:
# Write the generated samples as JSON Lines (one record per line). The call is
# left as the cell's last expression so its return value is still displayed.
generated_data.to_json(
    f"{output_dir}/gen.jsonl",
    orient='records',
    lines=True,
)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

741628

In [12]:
# Precomputed RHELAI 1.4.1 skills data. Download the archive from
#   https://drive.google.com/file/d/1q8Rxcat5dZxXP-LqgPSCUsyttyAn6aLJ/view?usp=sharing
# unzip it, and point precomputed_skills_path at the extracted skills.jsonl.
precomputed_skills_path = "1.4.1/skills.jsonl"

# System prompt of RHELAI 1.4.1.
system_prompt_rhelai_1_4_1 = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.1-8b-base model. My primary role is to serve as a chat assistant."

# Post-process the generated data and save the result under output_dir.
postprocess_and_save(
    f"{output_dir}/gen.jsonl",
    dataset_save_path=f'{output_dir}',
    precomputed_skills_path=precomputed_skills_path,
    sys_prompt=system_prompt_rhelai_1_4_1,
)

Generating train split: 0 examples [00:00, ? examples/s]

num_proc must be <= 51. Reducing num_proc to 51 for dataset of size 51.


Filter (num_proc=51):   0%|          | 0/51 [00:00<?, ? examples/s]

num_proc must be <= 51. Reducing num_proc to 51 for dataset of size 51.


Map (num_proc=51):   0%|          | 0/51 [00:00<?, ? examples/s]

num_proc must be <= 51. Reducing num_proc to 51 for dataset of size 51.


Filter (num_proc=51):   0%|          | 0/51 [00:00<?, ? examples/s]

num_proc must be <= 51. Reducing num_proc to 51 for dataset of size 51.


Filter (num_proc=51):   0%|          | 0/51 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

num_proc must be <= 62. Reducing num_proc to 62 for dataset of size 62.


Map (num_proc=62):   0%|          | 0/62 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=72):   0%|          | 0/346544 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/347 [00:00<?, ?ba/s]