In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the locally installed package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install SDG
```bash
git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research
pip install -e .
pip install rich datasets
```

In [2]:
# Third Party
from datasets import load_dataset, Dataset
from openai import OpenAI
import click

# First Party
from instructlab.sdg.flow import Flow
from instructlab.sdg.logger_config import setup_logger
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from tqdm import tqdm
import json
import itertools
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Connection settings for the OpenAI-compatible inference server on localhost.
# NOTE(review): "EMPTY" is a placeholder key — presumably the server does not
# validate it; confirm before pointing this at a secured endpoint.
endpoint = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
openai_api_base = endpoint

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The first model the server advertises is used as the teacher. Fail with an
# explicit message instead of a bare IndexError when nothing is being served.
served_models = client.models.list().data
if not served_models:
    raise RuntimeError(f"No models are being served at {endpoint}")
teacher_model = served_models[0].id
print(teacher_model)

meta-llama/Llama-3.3-70B-Instruct


In [4]:
# Flow definition for the Llama 3.3 teacher (per the file name), loaded through
# the client created above and wrapped in a single-pipeline SDG runner.
llama_flow_path = "/home/lab/abhi/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5_llama3.3.yaml"
flow_cfg = Flow(client).get_flow_from_file(llama_flow_path)

llama_pipeline = Pipeline(flow_cfg)
sdg = SDG([llama_pipeline], num_workers=1, batch_size=1, save_freq=1000)

In [27]:
# Read the seed JSONL file as the 'train' split and keep only its first record.
seed_data_path = '/new_data/knowledge/BMO/documents/seed_data.jsonl'
ds = load_dataset('json', data_files=seed_data_path, split='train').select(range(1))

In [28]:
# Run the Llama 3.3 pipeline on the seed dataset.
# Checkpoint directory is used to save the intermediate datasets.
# NOTE(review): the Mistral run further down passes the same checkpoint_dir
# ("Tmp") — confirm that intermediate files written by one model's run cannot be
# picked up or overwritten by the other.
generated_data = sdg.generate(ds, checkpoint_dir="Tmp")

100%|██████████| 1/1 [00:00<00:00, 24966.10it/s]


  0%|          | 0/1 [00:00<?, ?it/s]

Filter: 100%|██████████| 59/59 [00:00<00:00, 15414.47 examples/s]
Filter: 100%|██████████| 59/59 [00:00<00:00, 13580.50 examples/s]


Map: 100%|██████████| 59/59 [00:00<00:00, 6628.02 examples/s]
Filter: 100%|██████████| 59/59 [00:00<00:00, 18929.39 examples/s]
Filter: 100%|██████████| 59/59 [00:00<00:00, 14606.54 examples/s]


Map: 100%|██████████| 56/56 [00:00<00:00, 6346.59 examples/s]
Filter: 100%|██████████| 56/56 [00:00<00:00, 18718.60 examples/s]
Filter: 100%|██████████| 56/56 [00:00<00:00, 14252.49 examples/s]


100%|██████████| 1/1 [01:03<00:00, 63.86s/it]


### How to add a new prompt template

In [None]:
# Fetch the model's chat template so it can be registered as a prompt template:
#   1. Add the new prompt template to src/instructlab/sdg/prompts.py
#   2. Add the model name to
#      src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5_llama3.3.yaml
from transformers import AutoTokenizer

template_model_id = "meta-llama/Llama-3.3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(template_model_id)
# Bare last expression so the notebook displays the template.
tokenizer.chat_template

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Now hit mistral endpoint

In [16]:
# The vLLM server hosting the Mistral-family model is running on 10.7.0.15.
# NOTE(review): "EMPTY" is a placeholder key — presumably the server does not
# validate it; confirm before pointing this at a secured endpoint.
mistral_endpoint = "http://10.7.0.15:8000/v1"
mistral_client = OpenAI(
    api_key="EMPTY",
    base_url=mistral_endpoint,
)

# The first model the server advertises is used as the teacher. Fail with an
# explicit message instead of a bare IndexError when nothing is being served.
mistral_served_models = mistral_client.models.list().data
if not mistral_served_models:
    raise RuntimeError(f"No models are being served at {mistral_endpoint}")
mistral_client_teacher_model = mistral_served_models[0].id
print(mistral_client_teacher_model)

mistralai/Mixtral-8x7B-Instruct-v0.1


In [21]:
# Build the SDG runner for the Mistral endpoint: same settings as the Llama
# runner, but using the base synth_knowledge1.5 flow file and the Mistral client.
mistral_flow_path = "/home/lab/abhi/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"
flow_cfg_mistral = Flow(mistral_client).get_flow_from_file(mistral_flow_path)

mistral_pipeline = Pipeline(flow_cfg_mistral)
sdg_mistral = SDG([mistral_pipeline], num_workers=1, batch_size=1, save_freq=1000)

In [29]:
# Run the Mistral pipeline on the same dataset `ds` that was used for the Llama run.
# NOTE(review): this reuses checkpoint_dir="Tmp" from the Llama run — confirm that
# intermediate files written by one model's run cannot be picked up or
# overwritten by the other.
generated_data_mistral = sdg_mistral.generate(ds, checkpoint_dir="Tmp")

100%|██████████| 1/1 [00:00<00:00, 20360.70it/s]


  0%|          | 0/1 [00:00<?, ?it/s]

Filter: 100%|██████████| 41/41 [00:00<00:00, 13716.72 examples/s]
Filter: 100%|██████████| 41/41 [00:00<00:00, 11847.50 examples/s]


Map: 100%|██████████| 37/37 [00:00<00:00, 5085.00 examples/s]
Filter: 100%|██████████| 37/37 [00:00<00:00, 14035.38 examples/s]
Filter: 100%|██████████| 37/37 [00:00<00:00, 11514.26 examples/s]


Map: 100%|██████████| 36/36 [00:00<00:00, 5031.82 examples/s]
Filter: 100%|██████████| 36/36 [00:00<00:00, 12447.03 examples/s]
Filter: 100%|██████████| 36/36 [00:00<00:00, 10765.36 examples/s]


100%|██████████| 1/1 [00:32<00:00, 32.44s/it]


### Compare the results

In [31]:
# Dump up to `k` question/response pairs from the Llama 3.3 and Mixtral runs,
# side by side, into a Markdown file for manual comparison.
k = 5  # Maximum number of examples to dump
output_file = "model_comparison.md"

# Rows are paired by index, so at most as many examples as the shorter result
# set contains (capped at k) can be written.
num_examples = min(k, len(generated_data), len(generated_data_mistral))

# Explicit UTF-8: generated text may contain non-ASCII characters, and the
# platform default encoding is not guaranteed to handle them.
with open(output_file, "w", encoding="utf-8") as f:
    # The source document is taken from the first Llama row; skip the header
    # when that result set is empty instead of failing with an IndexError.
    if len(generated_data) > 0:
        # The trailing blank line keeps the first example from being glued onto
        # the last line of the document text.
        f.write(f"### Document \n{generated_data[0]['document']}\n\n")
    for i in range(num_examples):
        f.write("Example #{}\n".format(i+1))
        f.write("### Result from llama3.3\n")
        f.write(generated_data[i]['question'] + "\n")
        f.write("*******************************\n")
        f.write(generated_data[i]['response'] + "\n")
        f.write("=================================\n")
        f.write("### Result from mistral\n")
        f.write(generated_data_mistral[i]['question'] + "\n")
        f.write("*******************************\n")
        f.write(generated_data_mistral[i]['response'] + "\n")
        f.write("\n\n")

# Report the number of examples actually written, which may be fewer than k.
print(f"Wrote {num_examples} examples to {output_file}")

Wrote 5 examples to model_comparison.md


### For reference

### How to start the vLLM server
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m vllm.entrypoints.openai.api_server \
    --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
    --dtype float16 \
    --tensor-parallel-size 8 


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-3.3-70B-Instruct \
    --dtype float16 \
    --tensor-parallel-size 8 
```

### How to run the final generation
```bash
python scripts/generate.py --ds_path /new_data/knowledge/BMO/documents/seed_data.jsonl \
    --bs 2 --num_workers 10 \
    --save_path <your_save_path> \
    --flow /home/lab/abhi/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml \
    --checkpoint_dir <your_checkpoint_dir> \
    --endpoint <your_endpoint>
```

* For Llama 3.3, change the `--flow` argument to `/home/lab/abhi/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5_llama3.3.yaml`