In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Data Generation with Microsoft Phi-4

In [None]:
# Import required libraries
# datasets: For handling our data
# OpenAI: For interfacing with the LLM servers
# SDG components: For building our data generation pipeline
from datasets import load_dataset, Dataset
from openai import OpenAI

from sdg_hub.flow import Flow
from sdg_hub.pipeline import Pipeline
from sdg_hub.sdg import SDG
from sdg_hub.registry import PromptRegistry

## Specify model endpoint, API key

In [None]:
# Configure the OpenAI client to talk to our local vLLM server, which exposes an
# OpenAI-compatible API.
# Set `endpoint` to the server's base URL, for example "http://localhost:8000/v1".
endpoint = ""
# vLLM doesn't require a real API key, but the client still needs a non-empty
# placeholder value.
# NOTE(review): an empty string appears to produce an invalid "Bearer " header — confirm.
openai_api_key = "EMPTY"
openai_api_base = endpoint

# Fail early with an actionable message instead of an opaque connection error.
if not openai_api_base:
    raise ValueError(
        "`endpoint` is empty; set it to your vLLM server URL "
        '(for example "http://localhost:8000/v1").'
    )

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Verify we can see the model: the first model the server lists becomes the teacher.
served_models = client.models.list().data
if not served_models:
    raise RuntimeError(f"No models are being served at {openai_api_base}")
teacher_model = served_models[0].id
print(f"Connected to model: {teacher_model}")

## Provide chunked input JSON for the LLM

In [5]:
# Read the seed data from a JSON file and, for testing, keep only the first
# example so the run stays small.
seed_data_path = ""  # Replace with your data path
ds = load_dataset("json", data_files=seed_data_path, split="train").select(range(1))

## Check that the Phi-4 prompt exists in the PromptRegistry

In [8]:
# This import rebinds PromptRegistry in the notebook namespace.
# NOTE(review): the imports cell pulls PromptRegistry from sdg_hub.registry instead —
# confirm which module path is intended and keep a single import.
from sdg_hub.prompts import PromptRegistry
# Last expression of the cell: displays the registered templates so we can check
# that 'microsoft/phi-4' is among them.
PromptRegistry.get_registry()

{'blank': <Template memory:116aba040>,
 'instructlab': <Template memory:116ad4880>,
 'mistralai': <Template memory:116a8ef40>,
 'meta-llama/Llama-3.3': <Template memory:116b0c040>,
 'microsoft/phi-4': <Template memory:116ade2e0>,
 'nvidia/Llama-3_3-Nemotron-Super-49B-v1': <Template memory:116b0c190>,
 'Qwen/Qwen2.5': <Template memory:116afcc10>,
 'Qwen/Qwen3': <Template memory:116ac3c10>}

## Enable logging and generate data with Phi-4

In [9]:
# Build the flow from its YAML configuration file.
# The logging level is "debug" so the flow can be followed step by step.
flow = Flow(
    client,
    log_level="debug",
).get_flow_from_file("synth_knowledge1.5_phi4.yaml")

# Set up the SDG pipeline with its processing parameters:
#   num_workers - number of parallel workers
#   batch_size  - batch size for processing
#   save_freq   - how often to save checkpoints
sdg = SDG([flow], num_workers=1, batch_size=1, save_freq=1000)

In [10]:

# Run the SDG pipeline over the seed dataset and keep the generated dataset in `data`.
# The recorded run took about two minutes for a single seed example.
data = sdg.generate(ds)

100%|██████████| 1/1 [00:00<00:00, 19239.93it/s]


  0%|          | 0/1 [00:00<?, ?it/s]

Filter: 100%|██████████| 25/25 [00:00<00:00, 7118.64 examples/s]
Filter: 100%|██████████| 25/25 [00:00<00:00, 5678.11 examples/s]


Map: 100%|██████████| 23/23 [00:00<00:00, 3666.49 examples/s]
Filter: 100%|██████████| 23/23 [00:00<00:00, 6976.35 examples/s]
Filter: 100%|██████████| 23/23 [00:00<00:00, 5862.95 examples/s]


Map: 100%|██████████| 23/23 [00:00<00:00, 3213.17 examples/s]
Filter: 100%|██████████| 23/23 [00:00<00:00, 9263.39 examples/s]
Filter: 100%|██████████| 23/23 [00:00<00:00, 8270.66 examples/s]


100%|██████████| 1/1 [02:03<00:00, 123.95s/it]


## Analyze Contents of Generated Data

In [11]:
# Show the dataset summary (column names and number of rows).
data

Dataset({
    features: ['document_outline', 'document_title', 'domain', 'icl_document', 'icl_query_1', 'icl_response_1', 'icl_query_2', 'icl_response_2', 'icl_query_3', 'icl_response_3', 'raw_document', 'dataset_type', 'document', 'question', 'response'],
    num_rows: 3
})

In [24]:
# Print selected fields of the first generated row, one labelled line per field.
first_row = data[0]
labelled_columns = (
    ("***question***: ", "question"),
    ("***response***: ", "response"),
    ("***domain***: ", "domain"),
    ("***raw_document/Chunked Document***: ", "raw_document"),
    ("***Extractive Summary of the Document***: ", "document"),
)
for label, column in labelled_columns:
    print(label, first_row[column])



***question***:  What was the impact of IBM's focus on high-value offerings and productivity initiatives on its operating gross profit margin in 2024?
***response***:  IBM's focus on high-value offerings and productivity initiatives led to an expansion of its operating gross profit margin by 130 basis points in 2024. This strategic emphasis contributed to improved profitability and financial performance.  

***domain***:  Standard Procedure Mannual
***raw_document/Chunked Document***:  ## Let' s Create


<!-- image -->

2024 A n nual Report


<!-- image -->

Arvind Krishna Chairman, President and Chief Executive Officer
<!-- image -->

## Dear IBM Investor:

In 2024, IBM made significant progress in becoming a higher growth, higher margin business. We made this progress by combining technology innovation and consulting expertise to drive growth, improve productivity, and enhance operational efficiency -for our clients, and our own company.

Our strategy continues to build upon the two 