In [None]:
# Install the pinned dependencies this notebook relies on.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH; `%pip` would
# target the kernel's own environment — confirm which one is intended here.
!pip install instructlab==0.19.0
# Install the local ./sdg package over whatever is already installed.
# SETUPTOOLS_SCM_PRETEND_VERSION applies to this one command only (presumably so the
# build does not have to derive a version from SCM metadata — TODO confirm).
!SETUPTOOLS_SCM_PRETEND_VERSION=0.1 pip install --ignore-installed --upgrade ./sdg
# Docling packages, pinned; the notebook text names Docling as the PDF text-extraction tool.
!pip install docling-parse==1.3.0
!pip install docling==1.16.1


In [None]:
import os
import random
from datasets import load_dataset
from utils.data import postprocess_and_save, pretty_print_dict
from instructlab.sdg.utils.docprocessor import DocProcessor

### Setup Instructions

This notebook shows how to convert raw PDF files into InstructLab Synthetic Knowledge Infusion Data, using the RBC POC as an example. Follow the steps below to get started with your own data.

#### Steps to Get Started:

1. **Organize Your Documents:**
   - Create a new directory under the `document_collection` directory for your specific project. For example, if your project is named "my_org," your directory structure should look like this:
     ```
     |-- document_collection
     |   `-- my_org
     |       |-- my_org_data.pdf
     |       `-- qna.yaml
     ```
   - Place all your PDF files and ICL files (like `qna.yaml`) into this directory.

2. **Format Your ICLs:**
   - Ensure your ICL files contain sufficient context and question-answer pairs. We recommend including at least 5 distinct contexts, each with a minimum of 3 sets of questions and answers. More entries will improve the robustness of your data.
    - The ICL file should be in the following format (refer to the `document_collection/my_org/qna.yaml` file for an example):

    ```yaml
    domain: 
    document_outline: A one to two line description of the document
    seed_examples:
      - context: <context 1 goes here>
        question_and_answers:
          - question: <question 1 goes here>
            answer: <answer 1 goes here>
          - question: <question 2 goes here>
            answer: <answer 2 goes here>
          - question: <question 3 goes here>
            answer: <answer 3 goes here>
    ...
    ```


   - **Note:** Replace placeholders with actual content relevant to your documents. Ensure the contexts are clear and questions are well-formulated to extract meaningful answers.

3. **Update the Data Directory Path:**
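   - In the script or code where the data directory is specified, update the `data_dir` variable to reflect the path to your new directory. In this notebook, `data_dir` is read from the `DATA_DIR` environment variable (loaded by `load_dotenv`), so set `DATA_DIR` in your `.env` file. For example: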
     ```python
     data_dir = "document_collection/my_org"
     ```
4. **Update the Output Directory Path:**
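   - In the script or code where the output directory is specified, update the `output_dir` variable to reflect the path to your output directory. In this notebook, `output_dir` is read from the `OUTPUT_DIR` environment variable (loaded by `load_dotenv`), so set `OUTPUT_DIR` in your `.env` file. For example:
     ```python
     output_dir = "output/my_org"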
     ```
---

In [None]:
from dotenv import load_dotenv
import os

# Load settings from the dotenv file. override=True lets values from that file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Access the variables
data_dir = os.getenv('DATA_DIR')      # input directory: source PDFs and qna.yaml
output_dir = os.getenv('OUTPUT_DIR')  # output directory for everything this notebook generates

# os.getenv returns None for unset variables. Without this check the failure would
# surface as an opaque TypeError from os.makedirs, or later as a bogus 'None/qna.yaml' path.
missing_vars = [
    name
    for name, value in (('DATA_DIR', data_dir), ('OUTPUT_DIR', output_dir))
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Set them in your .env file or in the environment before running this cell."
    )

os.makedirs(output_dir, exist_ok=True)

### PDF Documents to Seed Dataset

To convert PDF documents into a usable seed dataset, we employ [Docling](https://github.com/DS4SD/docling), a tool designed for extracting and processing text from PDF files. The text extraction process involves parsing the PDF documents and saving the extracted text into a structured JSON file. The extracted text in JSON format can be used to generate InstructLab Synthetic Knowledge Infusion Data.


#### Step 1: 

Run the following command to extract text from the PDF documents and save it in JSON format:

⚠️ **Note:** This process takes about 5 minutes to run for this example


In [None]:
!echo $data_dir
!python ./sdg/scripts/docparser.py --input-dir {data_dir} --output-dir {output_dir}

#### Step 2: 

Now that we have extracted the text from the PDF documents, we can process the extracted data in two steps:

- Split the extracted text into chunks
- Populate the user-provided ICLs with the chunks

In [None]:
# Point the processor at the parsed output directory and at the user's ICL file
# (qna.yaml inside the data directory).
qna_config_path = f'{data_dir}/qna.yaml'
dp = DocProcessor(output_dir, user_config_path=qna_config_path)

In [None]:
# Build the seed dataset from the processor configured above.
# NOTE(review): the return type of get_processed_dataset() is not visible here; the
# later cell calls .to_json(..., orient='records', lines=True) on it — confirm the type.
seed_data = dp.get_processed_dataset()
# Bare expression: lets the notebook display the resulting object.
seed_data

In [None]:
# Persist the seed dataset as JSON Lines (one record per line) in the output directory.
seed_data_path = f'{output_dir}/seed_data.jsonl'
seed_data.to_json(seed_data_path, orient='records', lines=True)

In [None]:
# Hand the path of the seed_data.jsonl file to the pretty_print_dict helper from utils.data
# (presumably prints a readable preview of the records — TODO confirm against utils/data.py).
pretty_print_dict(f'{output_dir}/seed_data.jsonl')

### Convert JSONL to markdown files

In [None]:
import pandas as pd
import os
import json

# Markdown files are written to an "md" subdirectory of the notebook's output directory.
# Create it if it doesn't exist; exist_ok=True makes re-running this cell harmless.
md_output_dir = f"{output_dir}/md"
os.makedirs(md_output_dir, exist_ok=True)


In [None]:
def save_document(index, document_text, output_dir=None):
    """Write one document to ``document_<index + 1>.md`` and print the saved path.

    Args:
        index: Zero-based position of the document; the file name uses ``index + 1``.
        document_text: Text written as the entire content of the file.
        output_dir: Directory to write into. When omitted, the notebook-level
            ``md_output_dir`` is used, which matches the previous behaviour.

    Returns:
        None. The file is (over)written on disk and its path is printed.
    """
    target_dir = md_output_dir if output_dir is None else output_dir
    file_name = f"document_{index+1}.md"
    file_path = os.path.join(target_dir, file_name)

    # Explicit UTF-8: text extracted from PDFs routinely contains non-ASCII characters,
    # and the platform default encoding is not UTF-8 everywhere.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(document_text)

    print(f"Saved {file_path}")


In [None]:
# Path of the seed dataset JSONL file inside the output directory; read by the conversion loop.
jsonl_file_path = f"{output_dir}/seed_data.jsonl"

In [None]:
import hashlib

# Export every distinct 'document' value found in the seed JSONL as its own markdown file.
# Files are numbered in order of first appearance.
with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    saved_hashes = set()  # SHA-256 digests of the documents already written
    i = 0                 # zero-based index of the next file to write
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        entry = json.loads(line)
        document_text = entry.get('document', '')
        # A content digest is used as the identity key. The builtin hash() is only a
        # hash-table hint, and a collision would silently drop a distinct document.
        h = hashlib.sha256(document_text.encode('utf-8')).hexdigest()
        if h not in saved_hashes:
            saved_hashes.add(h)
            save_document(i, document_text)
            i += 1