In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install Dependencies
From the terminal
 - cd /opt/app-root/src
 - git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research
 - git checkout sdg_demo

In [None]:
!pip install -r ../SDG-Research/requirements.txt
!pip install ../SDG-Research/.
!pip install -r requirements.txt


Load environment variables from the `.env` file and set up the input and output directory paths. You can add or change any environment variables in that file.


In [20]:
from dotenv import load_dotenv
from pathlib import Path
import os

# Read settings from the local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Working directories, falling back to the demo defaults when unset
input_dir = Path(os.environ.get("INPUT_DIR", "document_collection"))
output_dir = Path(os.environ.get("OUTPUT_DIR", "sdg_demo_output"))

### Convert PDFs to Markdown

It is possible to create seed data directly from PDFs, but in order to inspect the Docling results, it is often helpful to use Markdown as an intermediate format. Here we convert all PDFs in the input directory (`document_collection` by default) to Markdown files in the same location. Afterwards, correct these Markdown files if Docling has made any conversion errors.

In [None]:
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter



def convert_to_markdown(
        conversion_results: "Iterable[ConversionResult]",
        overwrite: bool = False
):
    """
    Write a Markdown export next to the source file of each converted document.

    For every successful conversion result, ``<source stem>.md`` is written in
    the same directory as the source file. Results whose status is not
    ``ConversionStatus.SUCCESS`` are reported and counted as failures.

    Args:
        conversion_results: Iterable of Docling conversion results to export.
        overwrite (bool, optional): Replace an existing Markdown file when True;
            when False an existing file is left untouched. Defaults to False.

    Returns:
        tuple[int, int]: ``(success_count, failure_count)``. A successfully
        converted document counts as a success even when its Markdown file
        was skipped because it already existed.
    """
    # The annotation above is a string so that defining this function does not
    # require the annotated names to be resolvable at definition time.
    success_count = 0
    failure_count = 0

    for conv_res in conversion_results:
        source_file = conv_res.input.file

        if conv_res.status != ConversionStatus.SUCCESS:
            print(f"Document {source_file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        markdown_file = source_file.parent / f"{source_file.stem}.md"

        if not overwrite and markdown_file.exists():
            print(f"Skipping {markdown_file} because it already exists.")
            continue

        print(f"Exporting {markdown_file}...")
        # Render the Markdown BEFORE touching the file: if the export raises,
        # no empty .md is left behind to be skipped as "already exists" on
        # the next run.
        # NOTE(review): `legacy_document` looks like Docling's older document
        # representation; newer releases may prefer `conv_res.document` — confirm.
        markdown_text = conv_res.legacy_document.export_to_markdown()
        # Explicit UTF-8 so the output does not depend on the platform locale
        markdown_file.write_text(markdown_text, encoding="utf-8")

    return success_count, failure_count


# Collect every PDF under the input directory (recursively), in a stable order
pdf_files = sorted(input_dir.rglob("*.pdf"))
doc_converter = DocumentConverter()
# raises_on_error=False makes a failing PDF come back as a non-success result,
# so convert_to_markdown can report and count it instead of the whole batch
# aborting with an exception on the first bad document.
conversion_results = doc_converter.convert_all(pdf_files, raises_on_error=False)
convert_to_markdown(conversion_results, overwrite=False)


#### Manually correct the Markdown

Make sure to inspect and correct the newly created markdown files in the input directory.  They will be used (instead of the PDFs) to create the seed data for synthetic data generation.

### Convert Markdown to JSON

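Now that the Markdown files have been reviewed and corrected, they can be turned into the JSONL seed data used for synthetic data generation, as described in the next section.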

### Create Seed Data 

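Now we need to create `seed_data.jsonl` for synthetic data generation. A seed data file is created next to each `qna.yaml` found in the input directory, using the Markdown files in that same folder.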


In [None]:
# Show the Markdown files currently present under the configured input
# directory (recursively), rather than under a hard-coded path.
sorted(str(path) for path in input_dir.rglob("*.md"))

In [None]:
from pathlib import Path
from instructlab.sdg.utils.docprocessor import DocProcessor
 

def create_seed_data(
    qna_yaml_path: Path,
    overwrite: bool = True
):
    """
    Create seed data from a YAML file and save it as a JSONL file.

    Every ``*.md`` file located in the same directory as ``qna_yaml_path`` is
    processed with ``DocProcessor`` and the resulting dataset is written to
    ``seed_data.jsonl`` in that directory.

    Args:
        qna_yaml_path (Path): Path to the YAML file containing question-answer pairs.
            It is passed to ``DocProcessor`` as ``user_config_path``.
        overwrite (bool, optional): When False and ``seed_data.jsonl`` already
            exists, the existing file is kept and its path returned without
            reprocessing. Defaults to True.

    Returns:
        str | None: Path of the seed data file, or None when the directory
        contains no Markdown files.
    """
    # Get the directory of the YAML file
    yaml_dir = qna_yaml_path.parent
    # Sorted so the record order does not depend on filesystem listing order
    md_files = sorted(yaml_dir.glob("*.md"))

    # Check if we have any markdown files
    if not md_files:
        print(f"No markdown files found in {yaml_dir}")
        return None
    print(f"Found {len(md_files)} markdown files in {yaml_dir}")

    seed_data_path = f"{yaml_dir}/seed_data.jsonl"
    # Honor `overwrite`: keep an existing seed file instead of regenerating it
    if not overwrite and Path(seed_data_path).exists():
        print(f"Skipping {seed_data_path} because it already exists.")
        return seed_data_path

    dp = DocProcessor(yaml_dir, user_config_path=qna_yaml_path)
    seed_data = dp.get_processed_markdown_dataset([str(path) for path in md_files])
    seed_data.to_json(seed_data_path, orient="records", lines=True)
    return seed_data_path
    

def create_seed_data_for_all_qna_files(input_dir: Path, overwrite: bool = True):
    """
    Find all qna.yaml files in the input directory and create seed data for each one.

    Processing is best-effort: an error for one qna.yaml file is printed and the
    remaining files are still processed.

    Args:
        input_dir (Path): Directory to search (recursively) for qna.yaml files.
        overwrite (bool, optional): Forwarded to ``create_seed_data``. Defaults to True.

    Returns:
        None
    """
    # Find all qna.yaml files in the input directory and its subdirectories
    qna_files = sorted(input_dir.rglob("qna.yaml"))

    if not qna_files:
        print(f"No qna.yaml files found in {input_dir}")
        return

    print(f"Found {len(qna_files)} qna.yaml files")

    # Process each qna.yaml file
    for qna_file in qna_files:
        print(f"Processing {qna_file}")
        try:
            seed_data_path = create_seed_data(qna_file, overwrite=overwrite)
        except Exception as e:
            # Deliberately broad: one bad folder must not stop the whole batch
            print(f"Error processing {qna_file}: {str(e)}")
            continue

        # create_seed_data returns None when it produced nothing, so only
        # report success when a seed data path actually came back.
        if seed_data_path is None:
            print(f"No seed data created for {qna_file}")
        else:
            print(f"Successfully created seed data for {qna_file}")


# Create seed data for every qna.yaml found under the configured input directory
create_seed_data_for_all_qna_files(input_dir, overwrite=True)


In [None]:
# %% Concatenate all seed_data.jsonl files

import json
import glob
from pathlib import Path

def concatenate_seed_data_files(input_dir: Path, output_path: str = "./seed_data.jsonl"):
    """
    Find all seed_data.jsonl files in the input directory and concatenate them into a single file.

    Files are read in sorted path order. Blank lines are ignored. A file that
    cannot be read or parsed is reported and skipped as a whole, so the
    combined output never contains a partially read file. If ``output_path``
    itself lies under ``input_dir`` it is excluded from the inputs, so
    re-running does not feed the previous combined output back in.

    Args:
        input_dir (Path): Directory to search (recursively) for seed_data.jsonl files.
        output_path (str or Path, optional): Path of the output file.
            Defaults to "./seed_data.jsonl".

    Returns:
        The ``output_path`` value as given, or None when no input files were found.
    """
    output_file = Path(output_path)
    resolved_output = output_file.resolve()

    # Find all seed_data.jsonl files in the input directory and its
    # subdirectories, leaving out the output file itself.
    seed_data_files = sorted(
        candidate
        for candidate in input_dir.rglob("seed_data.jsonl")
        if candidate.resolve() != resolved_output
    )

    if not seed_data_files:
        print(f"No seed_data.jsonl files found in {input_dir}")
        return None

    print(f"Found {len(seed_data_files)} seed_data.jsonl files")

    # Concatenate all seed data files
    combined_data = []
    for seed_file in seed_data_files:
        print(f"Processing {seed_file}")
        try:
            with open(seed_file, 'r', encoding="utf-8") as f:
                # Parse into a per-file list first; only a fully parsed file
                # is added to the combined data.
                file_records = [json.loads(line) for line in f if line.strip()]
        except Exception as e:
            print(f"Error processing {seed_file}: {str(e)}")
            continue
        combined_data.extend(file_records)

    # Write the combined data to the output file
    with open(output_file, 'w', encoding="utf-8") as f:
        for item in combined_data:
            f.write(json.dumps(item) + '\n')

    print(f"Successfully created combined seed data file at {output_path}")
    print(f"Total records: {len(combined_data)}")

    return output_path


# Make sure the output directory exists, then merge every per-folder seed file
# into a single seed_data.jsonl inside it.
combined_seed_file = Path(output_dir) / "seed_data.jsonl"
combined_seed_file.parent.mkdir(parents=True, exist_ok=True)
seed_data_path = concatenate_seed_data_files(input_dir, combined_seed_file)


### Setup OpenAI Client for interacting with the model

In [None]:
# Endpoint URL and API token for the model are read from the environment;
# indexing os.environ raises KeyError when either variable is missing.
endpoint = os.environ['MIXTRAL_URL']
openai_api_key = os.environ['MIXTRAL_TOKEN']
openai_api_base = endpoint

# NOTE(review): `OpenAI` is not imported in any cell visible in this notebook —
# presumably `from openai import OpenAI`; confirm and add it to an imports cell.
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
# Use the first model listed by the endpoint as the teacher model
teacher_model = client.models.list().data[0].id
print(teacher_model)

### Run SDG
- This will create knowledge flow from provided yaml file
- We will run this on small dataset for demo purposes
- For large scale generation, please use the python command provided in the next cell
- You can analyze the generated data to ensure its quality is similar to the provided QnA pairs

In [None]:
# NOTE(review): `Flow`, `Pipeline` and `SDG` are not imported in any cell visible
# in this notebook — presumably they come from the instructlab.sdg package;
# confirm and add the imports.
# Load the flow configuration from the YAML file, using the client created earlier
knowledge_agentic_pipeline = "scripts/synth_knowledge1.5.yaml"
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)
# Wrap the flow in a single pipeline; one worker and a batch size of 1 are used here
sdg = SDG(
    [Pipeline(flow_cfg)],
    num_workers=1,
    batch_size=1,
    save_freq=1000,
)

In [9]:
# NOTE(review): `load_dataset` is not imported in any cell visible in this
# notebook — presumably `from datasets import load_dataset`; confirm.
number_of_samples = 5
# Pass the path as a string so this works whether seed_data_path is a str or a pathlib.Path
seed_ds = load_dataset('json', data_files=str(seed_data_path), split='train')
# Never request more rows than exist, so a seed set smaller than
# number_of_samples does not make select() fail with an out-of-range index.
sample_size = min(number_of_samples, len(seed_ds))
# Fixed shuffle seed keeps the sampled subset reproducible between runs
ds = seed_ds.shuffle(seed=42).select(range(sample_size))

In [None]:
# Run generation on the sampled seed dataset `ds`.
# Checkpoint directory is used to save the intermediate datasets
generated_data = sdg.generate(ds, checkpoint_dir="Tmp")

### Save the generated data into training format

In [None]:
# Write the generated records as JSON Lines (one record per line) to gen.jsonl in the output directory
generated_data.to_json(f"{output_dir}/gen.jsonl", orient='records', lines=True)

In [None]:
# Download the RHELAI 1.4.1 data here: https://drive.google.com/file/d/1q8Rxcat5dZxXP-LqgPSCUsyttyAn6aLJ/view?usp=sharing
# Unzip the folder and put the path to skills.jsonl in precomputed_skills_path
precomputed_skills_path = "1.4.1/skills.jsonl"

# System prompt for RHELAI 1.4.1, passed to the post-processing step as sys_prompt
system_prompt_rhelai_1_4_1 = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.1-8b-base model. My primary role is to serve as a chat assistant."

# NOTE(review): `postprocess_and_save` is not imported in any cell visible in
# this notebook — confirm where it is defined.
# Post-process the generated file and save the result into the output directory
postprocess_and_save(
    f"{output_dir}/gen.jsonl",
    dataset_save_path=str(output_dir),
    precomputed_skills_path=precomputed_skills_path,
    sys_prompt=system_prompt_rhelai_1_4_1,
)

Exercise complete, training data should be located in `sdg_demo_output/phase10_train.jsonl`