In [None]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: automatically reload imported modules before running each cell,
# so edits to local .py files are picked up without restarting the kernel.
%autoreload 2

### Install SDG
From the terminal
 - cd /opt/app-root/src
 - git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research
 - git checkout sdg_demo

In [None]:
!pip install -r ../SDG-Research/requirements.txt
!pip install -e ../SDG-Research/.
!pip install rich datasets tabulate transformers
!pip install docling==1.20.0

<b>NB: Restart the Python kernel to ensure libraries are loaded</b>

In [None]:
# Third Party
from datasets import load_dataset
from openai import OpenAI

# First Party
from instructlab.sdg.flow import Flow
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from instructlab.sdg.utils.docprocessor import DocProcessor
from utils.data import postprocess_and_save, pretty_print_dict
import os

### Create Seed Data

In [None]:
output_dir = "sdg_demo_output/"
# Create the output directory up front so that writing the seed file below
# does not fail when the directory does not exist yet.
os.makedirs(output_dir, exist_ok=True)
# Directory containing the markdown documents to process
data_dir = 'document_collection/md'
# The same directory also holds the qna.yaml file
dp = DocProcessor(data_dir, user_config_path=f'{data_dir}/qna.yaml')
seed_data = dp.get_processed_markdown_dataset([f'{data_dir}/adv-banking.md'])
# Persist the seed data as JSON Lines, then display the saved file
seed_data.to_json(f'{output_dir}/md-seed_data.jsonl', orient='records', lines=True)
pretty_print_dict(f'{output_dir}/md-seed_data.jsonl')



### Setup OpenAI Client for interacting with the model

In [None]:
# Endpoint URL and API token are read from the environment; a missing
# variable raises KeyError here.
endpoint = os.environ['MIXTRAL_URL']
openai_api_key = os.environ['MIXTRAL_TOKEN']
openai_api_base = endpoint

# OpenAI client configured with that base URL and token.
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Take the id of the first model the endpoint lists and show it.
teacher_model = client.models.list().data[0].id
print(teacher_model)

### Run SDG
- This will create a knowledge flow from the provided YAML file
- We will run this on a small dataset for demo purposes
- For large-scale generation, please use the Python command provided in the next cell
- You can analyze the generated data to ensure its quality is similar to the provided QnA pairs

In [None]:
# Path to the flow definition YAML inside the SDG-Research checkout.
knowledge_agentic_pipeline = "../SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"
# Build the flow configuration from that file; Flow is constructed with the OpenAI client.
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)
# SDG runner wrapping a single Pipeline built from the flow configuration.
# NOTE(review): worker count, batch size and save frequency are kept small/default
# for the demo — confirm their exact semantics against the SDG class.
sdg = SDG(
    [Pipeline(flow_cfg)],
    num_workers=1,
    batch_size=1,
    save_freq=1000,
)

In [None]:
# Number of seed records to run through the pipeline for the demo.
number_of_samples = 5
# Load the seed data written earlier as a single 'train' split.
ds = load_dataset('json', data_files=f'{output_dir}/md-seed_data.jsonl', split='train')
# Shuffle deterministically, then keep at most `number_of_samples` rows.
# The count is clamped to the dataset size so that a seed file with fewer
# rows does not make `select` index out of range.
ds = ds.shuffle(seed=42).select(range(min(number_of_samples, len(ds))))

In [None]:
# Run the SDG object over the sampled seed dataset.
# Checkpoint directory is used to save the intermediate datasets
generated_data = sdg.generate(ds, checkpoint_dir="Tmp")

### Save the generated data into training format

In [None]:
# Write the generated data to gen.jsonl in the output directory,
# one JSON record per line.
generated_data.to_json(f"{output_dir}/gen.jsonl", orient='records', lines=True)

In [None]:
# System prompt for RHELAI 1.4.1
system_prompt_rhelai_1_4_1 = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.1-8b-base model. My primary role is to serve as a chat assistant."

# Download the RHELAI 1.4.1 data here: https://drive.google.com/file/d/1q8Rxcat5dZxXP-LqgPSCUsyttyAn6aLJ/view?usp=sharing
# Unzip the folder and put the path to skills.jsonl in precomputed_skills_path
precomputed_skills_path = "1.4.1/skills.jsonl"

# Post-process the generated file, passing the precomputed skills file and the
# system prompt; results are saved under the output directory.
postprocess_and_save(
    f"{output_dir}/gen.jsonl",
    dataset_save_path=f'{output_dir}',
    precomputed_skills_path=precomputed_skills_path,
    sys_prompt=system_prompt_rhelai_1_4_1,
)

Exercise complete. The training data should be located in `sdg_demo_output/phase10_train.jsonl`.