In [7]:
import re
import os
from pathlib import Path
from typing import List, Tuple

def clean_text(text: str) -> str:
    """Remove excessive newlines and trim"""
    text = re.sub(r'\n{3,}', '\n\n', text.strip())
    return text

def extract_qa_and_followups(content: str) -> Tuple[List[Tuple[str, str, List[str]]], List[str]]:
    """
    Returns:
        - List of (question, answer, [followup1, followup2, followup3])
        - List of all main questions (for the flat questions file)
    """
    # Normalize line endings
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    lines = content.split('\n')

    qa_pairs = []
    all_main_questions = []
    current_question = None
    current_answer_lines = []
    in_answer = False

    # Patterns for detecting start of user question and start of ChatGPT response
    user_markers = [
        r'^(?:U|User|You):?\s*(.+)$',
        r'^Q:\s*(.+)$',
        r'^\s*You\s+asked:?\s*(.+)$',
    ]
    assistant_markers = [
        r'^(?:ChatGPT|Chat CPT|Assistant):?\s*(.*)$',
        r'^Answer:?\s*(.*)$',
        r'^\[ChatGPT\]\s*(.*)$',
    ]

    # Follow-up patterns (very flexible)
    followup_indicators = [
        r'(?:here are|these are|some|following|recommended|next|related|follow.?up|you might also|want to ask|try asking|more questions).*?:\s*$',
        r'^(?:\d[\.\)]\s*|[\-\*]\s*)(.+?)(?:\?|\.{3})?$',
    ]

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Detect start of new question
        question_matched = False
        for pattern in user_markers:
            m = re.match(pattern, line, re.IGNORECASE)
            if m:
                if current_question and current_answer_lines:
                    # Save previous Q&A
                    answer = clean_text('\n'.join(current_answer_lines))
                    qa_pairs.append((current_question, answer, []))
                    all_main_questions.append(current_question)

                current_question = m.group(1).strip()
                current_answer_lines = []
                in_answer = False
                question_matched = True
                break

        if question_matched:
            i += 1
            continue

        # Detect start of assistant response
        for pattern in assistant_markers:
            m = re.match(pattern, line, re.IGNORECASE)
            if m and current_question:
                in_answer = True
                if m.group(1).strip():
                    current_answer_lines.append(m.group(1).strip())
                i += 1
                break
        else:
            # Continuation line
            if in_answer and line:
                current_answer_lines.append(line)
            i += 1

    # Don't forget the last pair
    if current_question and current_answer_lines:
        answer = clean_text('\n'.join(current_answer_lines))
        qa_pairs.append((current_question, answer, []))

    # Now extract follow-ups from each answer
    for idx, (q, a, _) in enumerate(qa_pairs):
        followups = []
        lines_a = a.split('\n')

        collecting = False
        for line in lines_a:
            line_s = line.strip()

            # Start collecting follow-ups?
            if not collecting:
                for pat in followup_indicators:
                    if re.search(pat, line_s, re.IGNORECASE):
                        collecting = True
                        break
                if collecting:
                    continue

            if collecting:
                # Look for numbered or bulleted questions
                m = re.match(r'^(?:\d[\.\)]\s*|[\-\*]\s*)(.+?\??)$', line_s)
                if m:
                    fu = m.group(1).strip()
                    if fu and fu.endswith('?') or '...' in fu or len(fu) > 15:
                        followups.append(fu)
                elif line_s and not line_s.startswith('---') and not line_s.startswith('==='):
                    # sometimes just plain lines after the header
                    if '?' in line_s:
                        followups.append(line_s)

                # Stop collecting after ~4 lines or empty line + non-question
                if len(followups) >= 3 or (not line_s and len(followups) > 0):
                    break

        # Keep only up to 3 good ones
        followups = [f.strip() for f in followups if f.strip() and '?' in f][:3]
        qa_pairs[idx] = (q, a, followups)

    return qa_pairs, all_main_questions


def process_file(input_path: str, output_dir: str):
    """Process one file and create all output markdown files"""
    with open(input_path, encoding='utf-8') as f:
        content = f.read()

    qa_pairs, main_questions = extract_qa_and_followups(content)

    base_name = Path(input_path).stem
    os.makedirs(output_dir, exist_ok=True)

    # 1. Individual Q&A files
    for i, (q, a, _) in enumerate(qa_pairs, 1):
        filename = f"{base_name}_Q{i}.md"
        path = os.path.join(output_dir, filename)

        with open(path, 'w', encoding='utf-8') as f:
            f.write(f"# Question {i}\n\n")
            f.write(f"**Q:** {q}\n\n")
            f.write(f"**A:**\n\n{a}\n")

        print(f"Created: {filename}")

    # 2. Hierarchy file
    hierarchy_path = os.path.join(output_dir, f"{base_name}_questions_hierarchy.md")
    with open(hierarchy_path, 'w', encoding='utf-8') as f:
        f.write("# Conversation Question Hierarchy\n\n")

        for i, (q, _, followups) in enumerate(qa_pairs, 1):
            f.write(f"## Q{i}\n")
            f.write(f"{q}\n\n")

            if followups:
                f.write("**Follow-ups / Recommended:**\n\n")
                for j, fu in enumerate(followups, 1):
                    f.write(f"- **Q{i}.{j}**  {fu}\n")
            f.write("\n---\n\n")

        # Also plain list of all main questions
        f.write("\n## All Main Questions (flat list)\n\n")
        for i, q in enumerate(main_questions, 1):
            f.write(f"- Q{i}: {q}\n")

    print(f"Created hierarchy: {Path(hierarchy_path).name}")


# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Usage examples
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

if __name__ == "__main__":
    # Change these paths
    INPUT_FILE = "C:\\Users\\GRL\\Downloads\\Synthetic Data in USB-PD.md"
#               ‚Üë just add r before the quote        # ‚Üê your file
    OUTPUT_FOLDER = "extracted_questions"          # ‚Üê where to save

    # Process one file
    process_file(INPUT_FILE, OUTPUT_FOLDER)

    # Or process many files in a folder
    # input_folder = "all_exports"
    # for file in Path(input_folder).glob("*.md"):
    #     process_file(str(file), OUTPUT_FOLDER)

Created hierarchy: Synthetic Data in USB-PD_questions_hierarchy.md


In [17]:
import re
import os
from pathlib import Path
from typing import List, Tuple


def clean_text(text: str) -> str:
    """Remove excessive newlines, junk separators, and trim"""
    # Remove common junk separator lines
    text = re.sub(r'^\s*\* \* \*\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n{3,}', '\n\n', text.strip())
    return text


def extract_followups(answer: str) -> List[str]:
    """Extract follow-up questions from typical ChatGPT endings"""
    lines = answer.split('\n')
    followups = []
    collecting = False

    for line in lines:
        s = line.strip()
        lower = s.lower()

        # Start collecting mode
        if not collecting:
            if any(p in lower for p in [
                'if you want next', 'next we can', 'if you want, next i can',
                'want next', 'next, i can', 'next steps', 'next question',
                'next i can', 'shall we', 'my opinion', 'if you want'
            ]):
                collecting = True
            continue

        # Likely follow-up items
        if s and (
            s.startswith(('- ', '* ', '1. ', '2. ', '3. ', '‚Ä¢ ', '- **')) or
            (s.endswith('?') and len(s) > 15) or
            any(w in lower for w in ['design ', 'show ', 'map ', 'draft ', 'help '])
        ):
            cleaned = re.sub(r'^[-*‚Ä¢\d.\s*]+|\s*\**', '', s).strip()
            if cleaned and cleaned not in followups:
                followups.append(cleaned)

        if len(followups) >= 3:
            break

    return followups[:3]


def extract_qa_and_followups(content: str) -> Tuple[List[Tuple[str, str, List[str]]], List[str]]:
    """
    Robust parser for this specific ChatGPT export style with junk lines
    """
    content = clean_text(content.replace('\r\n', '\n'))

    # Split BEFORE each **You:** or **ChatGPT:** marker
    parts = re.split(r'(?=\n\*\*(?:You|ChatGPT)\*\*:)', '\n' + content + '\n')

    qa_pairs = []
    all_main_questions = []
    current_question = None
    current_answer_parts = []

    for part in parts:
        part = part.strip()
        if not part:
            continue

        if part.startswith("**You**:"):
            # Save previous pair if exists
            if current_question and current_answer_parts:
                answer = clean_text('\n'.join(current_answer_parts))
                followups = extract_followups(answer)
                qa_pairs.append((current_question, answer, followups))
                all_main_questions.append(current_question)

            # New question
            question_text = part[8:].strip()
            question_text = clean_text(question_text)

            # Skip exact duplicate of previous question
            if qa_pairs and question_text == qa_pairs[-1][0]:
                current_question = None
                current_answer_parts = []
                continue

            current_question = question_text
            current_answer_parts = []

        elif part.startswith("**ChatGPT**:") and current_question:
            # This is the answer (may contain multiple lines)
            answer_content = part[12:].strip()
            current_answer_parts.append(answer_content)

    # Save last pair
    if current_question and current_answer_parts:
        answer = clean_text('\n'.join(current_answer_parts))
        followups = extract_followups(answer)
        qa_pairs.append((current_question, answer, followups))
        all_main_questions.append(current_question)

    return qa_pairs, all_main_questions


def process_file(input_path: str, output_dir: str):
    """Main processing function"""
    try:
        with open(input_path, encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    print(f"File loaded: {len(content):,} characters")

    qa_pairs, main_questions = extract_qa_and_followups(content)

    if not qa_pairs:
        print("WARNING: No question-answer pairs were detected.")
        print("First 800 characters of cleaned content:")
        cleaned = clean_text(content)
        print(cleaned[:800])
        print("-" * 80)
        return

    print(f"Found {len(qa_pairs)} question-answer pairs")

    base_name = Path(input_path).stem
    os.makedirs(output_dir, exist_ok=True)

    # Individual Q&A files
    for i, (q, a, followups) in enumerate(qa_pairs, 1):
        filename = f"{base_name}_Q{i}.md"
        path = os.path.join(output_dir, filename)

        with open(path, 'w', encoding='utf-8') as f:
            f.write(f"# Question {i}\n\n")
            f.write(f"**Question:**\n{q}\n\n")
            f.write(f"**Answer:**\n\n{a}\n\n")

            if followups:
                f.write("**Extracted follow-ups:**\n")
                for j, fu in enumerate(followups, 1):
                    f.write(f"- {fu}\n")

        print(f"Created: {filename}")

    # Hierarchy file
    hierarchy_path = os.path.join(output_dir, f"{base_name}_hierarchy.md")
    with open(hierarchy_path, 'w', encoding='utf-8') as f:
        f.write("# Conversation Question Hierarchy\n\n")

        for i, (q, _, followups) in enumerate(qa_pairs, 1):
            f.write(f"## Q{i}\n{q}\n\n")
            if followups:
                f.write("**Follow-ups / Recommended:**\n")
                for j, fu in enumerate(followups, 1):
                    f.write(f"- **Q{i}.{j}**  {fu}\n")
            f.write("\n---\n\n")

        f.write("## All Main Questions (flat)\n\n")
        for i, q in enumerate(main_questions, 1):
            f.write(f"- Q{i}: {q}\n")

    print(f"\nHierarchy file: {Path(hierarchy_path).name}")
    print("Processing complete.")


if __name__ == "__main__":
    INPUT_FILE = r"C:\Users\GRL\Downloads\Synthetic Data in USB-PD.md"
    OUTPUT_FOLDER = "extracted_usb_pd"

    print("Starting processing...")
    process_file(INPUT_FILE, OUTPUT_FOLDER)
    print("Done.")

Starting processing...
File loaded: 55,704 characters
First 800 characters of cleaned content:
title: "#Synthetic Data in USB-PD"
source: "https://chatgpt.com/c/6971e805-20ec-8323-854a-caecf89489d2"

**You:**

Wat is Synthetic data in model training,   
Explain with USB pd domain agent model training example

**You:**

Wat is Synthetic data in model training,
Explain with USB pd domain agent model training example

**ChatGPT:**

Synthetic data, in model training, is **artificially generated data that mimics real-world data**‚Äîits structure, patterns, edge cases, and constraints‚Äîwithout being directly collected from real systems or users.

It‚Äôs not random data. Good synthetic data is **domain-faithful**, **statistically realistic**, and **behaviorally meaningful** for the task the model is learning.

## Why synthetic data is used (quick context)

In many engineering domains (USB-PD in
--------------------------------------------------------------------------------
Done.


In [18]:
import re
import os
from pathlib import Path

def clean_content(text: str) -> str:
    """Remove frontmatter, junk lines, normalize spacing"""
    # Remove YAML frontmatter
    text = re.sub(r'^---\s*$.*?^---\s*$', '', text, flags=re.MULTILINE | re.DOTALL)
    
    # Remove * * * and --- separator lines
    text = re.sub(r'^\s*\* \* \*\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    
    # Normalize multiple newlines
    text = re.sub(r'\n\s*\n+', '\n\n', text.strip())
    return text


def extract_followups(answer_text: str) -> list:
    """Try to find follow-up / next-step questions in ChatGPT response"""
    lines = answer_text.split('\n')
    followups = []
    in_followup_section = False

    for line in lines:
        s = line.strip()
        lower = s.lower()

        if not in_followup_section:
            if any(p in lower for p in [
                'if you want next', 'next we can', 'if you want, next i can',
                'want next', 'next, i can', 'next steps', 'next question',
                'next i can', 'shall we', 'my opinion', 'if you want'
            ]):
                in_followup_section = True
            continue

        # Look for list items or question-like lines
        if s and (
            s.startswith(('- ', '* ', '1. ', '2. ', '3. ', '‚Ä¢ ')) or
            (s.endswith('?') and len(s) > 15) or
            any(kw in lower for kw in ['design', 'show', 'draft', 'map', 'help', 'convert'])
        ):
            cleaned = re.sub(r'^[-*‚Ä¢\d.\s]+|\*\*', '', s).strip()
            if cleaned and cleaned not in followups:
                followups.append(cleaned)

        if len(followups) >= 3:
            break

    return followups[:3]


def extract_conversation(filepath: str):
    """Extract questions and answers from the specific markdown format"""
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return []

    print("Reading file...")
    with open(filepath, encoding='utf-8') as f:
        raw = f.read()

    print(f"File size: {len(raw):,} characters")

    content = clean_content(raw)

    # Split on **You:** lines (start of each question)
    # We use positive lookbehind to keep the marker
    parts = re.split(r'(?=\*\*You\*\*:)', content)

    questions = []
    current_answer = []

    for part in parts:
        part = part.strip()
        if not part:
            continue

        if part.startswith("**You**:"):
            # If we have previous question + answer ‚Üí save it
            if questions and current_answer:
                prev_q, prev_a = questions[-1]
                followups = extract_followups(prev_a)
                questions[-1] = (prev_q, prev_a, followups)

            # Start new question
            q_text = part[8:].strip()  # remove **You**:
            q_text = re.sub(r'\s+', ' ', q_text).strip()
            questions.append([q_text, ""])   # [question, answer]
            current_answer = []

        else:
            # Everything else belongs to the previous answer
            if questions:
                current_answer.append(part)

    # Save last answer
    if questions and current_answer:
        last_q, _ = questions[-1]
        full_answer = '\n\n'.join(current_answer).strip()
        followups = extract_followups(full_answer)
        questions[-1] = (last_q, full_answer, followups)

    # Remove duplicate questions (very common in your file)
    seen = set()
    unique = []
    for q, a, f in questions:
        if q not in seen:
            seen.add(q)
            unique.append((q, a, f))

    print(f"Found {len(unique)} unique questions")
    return unique


def save_results(qa_list, input_path, output_dir):
    base = Path(input_path).stem
    os.makedirs(output_dir, exist_ok=True)

    # Individual files
    for i, (q, a, followups) in enumerate(qa_list, 1):
        fname = f"{base}_Q{i}.md"
        path = os.path.join(output_dir, fname)

        with open(path, 'w', encoding='utf-8') as f:
            f.write(f"# Q{i}\n\n")
            f.write(f"**Question**\n\n{q}\n\n")
            f.write(f"**Answer**\n\n{a}\n\n")
            if followups:
                f.write("**Follow-ups / Next steps:**\n")
                for j, fu in enumerate(followups, 1):
                    f.write(f"- {fu}\n")

        print(f"Created: {fname}")

    # Hierarchy overview
    hierarchy_path = os.path.join(output_dir, f"{base}_hierarchy.md")
    with open(hierarchy_path, 'w', encoding='utf-8') as f:
        f.write("# Conversation Hierarchy\n\n")

        for i, (q, _, followups) in enumerate(qa_list, 1):
            f.write(f"## Q{i}\n")
            f.write(f"{q}\n\n")
            if followups:
                f.write("**Follow-ups / Recommended:**\n")
                for j, fu in enumerate(followups, 1):
                    f.write(f"- **Q{i}.{j}**  {fu}\n")
            f.write("\n---\n\n")

        f.write("## All questions (flat list)\n\n")
        for i, (q, _, _) in enumerate(qa_list, 1):
            f.write(f"- Q{i}: {q}\n")

    print(f"\nHierarchy file: {Path(hierarchy_path).name}")
    print("All files saved.")


# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
#  MAIN
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

if __name__ == "__main__":
    INPUT_FILE = r"C:\Users\GRL\Downloads\Synthetic Data in USB-PD.md"
    OUTPUT_DIR = "extracted_questions"

    print("Starting extraction...\n")
    qa_pairs = extract_conversation(INPUT_FILE)

    if qa_pairs:
        print("\nSaving results...")
        save_results(qa_pairs, INPUT_FILE, OUTPUT_DIR)
        print("\nDone.")
    else:
        print("\nNo questions found. Please check the file content.")

Starting extraction...

Reading file...
File size: 55,704 characters
Found 0 unique questions

No questions found. Please check the file content.


In [22]:
from pathlib import Path
import re
from typing import List, Tuple

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# More robust patterns ‚Äì matches your exact file format
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

USER_PATTERNS = [
    r"^\*\*You:\*\*",           # **You:**
    r"^\*\*You:\s",             # **You:** (with space after :)
    r"^\*\*User:\*\*",
    r"^\*\*Human:\*\*",
]

ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*ChatGPT:\s",         # **ChatGPT:** (with space)
    r"^\*\*Assistant:\*\*",
    r"^\*\*Grok:\*\*",
]

USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

SEPARATOR_RE = re.compile(r"^(\* \* \*|---)$", re.IGNORECASE)

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Split conversation into (user, assistant) pairs
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    lines = md_text.splitlines()

    pairs = []
    current_role = None
    current_user = []
    current_assistant = []

    def flush():
        if current_user or current_assistant:
            user_text = "\n".join(current_user).strip()
            assistant_text = "\n".join(current_assistant).strip()
            # Only add if there's meaningful content
            if user_text and (assistant_text or len(current_assistant) > 0):
                pairs.append((user_text, assistant_text))

    i = 0
    # Skip YAML frontmatter
    if lines and lines[0].strip() == '---':
        i = 1
        while i < len(lines) and not lines[i].strip() == '---':
            i += 1
        i += 1  # skip closing ---

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        if USER_RE.match(line):
            flush()
            current_role = "user"
            current_user = []
            current_assistant = []
            i += 1
            continue

        if ASSISTANT_RE.match(line):
            flush()
            current_role = "assistant"
            current_user = []
            current_assistant = []
            i += 1
            continue

        if SEPARATOR_RE.match(stripped):
            flush()
            current_role = None
            i += 1
            continue

        # Add line to current block (but skip the role marker lines themselves)
        if current_role == "user":
            current_user.append(line)
        elif current_role == "assistant":
            current_assistant.append(line)

        i += 1

    flush()
    return pairs

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Extract suggested / follow-up questions
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

def extract_suggested_questions(text: str) -> List[str]:
    if not text.strip():
        return []

    suggestions = set()

    lines = [l.strip() for l in text.splitlines() if l.strip()]

    # Bullet points and numbered lists
    for line in lines:
        if re.match(r"^[-*‚Ä¢]|\d+[.)]\s+", line):
            clean = re.sub(r"^[-*‚Ä¢]|\d+[.)]\s*", "", line).strip()
            if len(clean) > 6:
                suggestions.add(clean)

    # Sentences ending with ?
    for line in lines:
        if line.endswith("?") and len(line) > 15:
            suggestions.add(line)

    # Lines starting with common suggestion verbs / phrases
    suggestion_starts = re.compile(
        r"^(next|try|you can|consider|also|would you like|want to|how about|what if|can you|should|let\'?s|another|more|extend|improve|optimize|debug|test|analyze|design|show|explain|add)",
        re.IGNORECASE
    )
    for line in lines:
        if suggestion_starts.match(line) and len(line) > 20:
            suggestions.add(line)

    # After headers like "If you want, next we can:"
    capture = False
    for line in lines:
        lower = line.lower()
        if "if you want" in lower or "next we can" in lower or "next:" in lower:
            capture = True
            continue
        if capture and (line.startswith("- ") or line.startswith("* ") or re.match(r"^\d+[.)] ", line)):
            clean = re.sub(r"^[-*‚Ä¢]|\d+[.)]\s*", "", line).strip()
            if clean:
                suggestions.add(clean)
        if capture and len(line.strip()) > 0 and not line.strip().startswith(('-','*','1.','2.','3.')):
            capture = False

    # Sort by length (prefer longer/more specific ones first)
    return sorted(list(suggestions), key=len, reverse=True)[:15]


# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Write outputs
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    output_dir.mkdir(exist_ok=True)

    # questions_only.md ‚îÄ‚îÄ only questions + AI suggestions
    q_only = output_dir / "questions_only.md"
    with q_only.open("w", encoding="utf-8") as f:
        f.write("# Questions + AI Suggested Follow-ups\n\n")

        for i, (question, answer) in enumerate(pairs, 1):
            if not question.strip():
                continue

            f.write(f"## Q{i}\n\n")
            f.write(question.strip() + "\n\n")

            suggestions = extract_suggested_questions(answer)
            if suggestions:
                f.write("**AI suggested follow-ups:**\n\n")
                for j, sug in enumerate(suggestions, 1):
                    f.write(f"- Q{i}.{j}  {sug}\n")
                f.write("\n")
            else:
                f.write("_No clear follow-up suggestions detected_\n\n")

    # Individual QA files
    for i, (q, a) in enumerate(pairs, 1):
        if not q.strip():
            continue

        filepath = output_dir / f"Q{i:03d}.md"
        with filepath.open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User\n\n" + q.strip() + "\n\n")
            f.write("## ChatGPT\n\n")
            if a.strip():
                f.write(a.strip() + "\n\n")
            else:
                f.write("_No answer captured_\n\n")

            suggestions = extract_suggested_questions(a)
            if suggestions:
                f.write("## AI Suggested Follow-ups\n\n")
                for j, sug in enumerate(suggestions, 1):
                    f.write(f"- Q{i}.{j}  {sug}\n")


# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Main
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

def main():
    path_str = input("Enter path to the .md conversation file: ").strip().strip('"')
    file_path = Path(path_str)

    if not file_path.is_file():
        print("‚ùå File not found.")
        return

    print(f"Reading: {file_path}")
    text = file_path.read_text(encoding="utf-8", errors="replace")

    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è  No Q&A pairs detected.")
        print("First 600 characters of file:")
        print("-" * 70)
        print(text[:600])
        print("-" * 70)
        print("Check if the role markers are different than expected.")
        return

    output_dir = file_path.parent / "extracted_questions"
    write_outputs(pairs, output_dir)

    print("\n" + "‚ïê" * 70)
    print(f"Success! Found {len(pairs)} question-answer pairs")
    print(f"Output folder: {output_dir.resolve()}")
    print("‚Üí questions_only.md contains questions + follow-up suggestions only")
    print("‚ïê" * 70)


if __name__ == "__main__":
    main()

Reading: C:\Users\GRL\Downloads\Synthetic Data in USB-PD.md
‚ö†Ô∏è  No Q&A pairs detected.
First 600 characters of file:
----------------------------------------------------------------------
---
title: "#Synthetic Data in USB-PD"
source: "https://chatgpt.com/c/6971e805-20ec-8323-854a-caecf89489d2"
---

**You:**

Wat is Synthetic data in model training,   
Explain with USB pd domain agent model training example

* * *

**You:**

Wat is Synthetic data in model training,
Explain with USB pd domain agent model training example

* * *

**ChatGPT:**

Synthetic data, in model training, is **artificially generated data that mimics real-world data**‚Äîits structure, patterns, edge cases, and constraints‚Äîwithout being directly collected from real systems or users.

It‚Äôs not random data. Good 
----------------------------------------------------------------------
Check if the role markers are different than expected.


In [None]:
from pathlib import Path
import re
import json
import requests
from typing import List, Tuple, Dict
from datetime import datetime

# ---------- Ollama Config ----------
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "qwen2.5:3b"  # fast & light
MAX_CHARS = 2000  # limit text sent to model

# ---------- Role markers ----------
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User",
    r"^##\s*You",
]

ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant",
    r"^##\s*ChatGPT",
]

USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Split conversation ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    lines = md_text.splitlines()

    blocks = []
    current_role = None
    current_user = []
    current_assistant = []

    def flush():
        if current_user or current_assistant:
            blocks.append((
                "\n".join(current_user).strip(),
                "\n".join(current_assistant).strip()
            ))

    for line in lines:
        s = line.strip()

        if USER_RE.match(s):
            if current_role == "assistant":
                flush()
                current_user = []
                current_assistant = []
            current_role = "user"
            continue

        if ASSISTANT_RE.match(s):
            current_role = "assistant"
            continue

        if current_role == "user":
            current_user.append(line)
        elif current_role == "assistant":
            current_assistant.append(line)

    flush()
    return [(u, a) for u, a in blocks if u.strip() or a.strip()]

# ---------- AI Extraction (with caching) ----------

def extract_suggestions_with_ai(answer_text: str, cache: Dict[str, List[str]]) -> List[str]:
    key = answer_text.strip()
    if not key:
        return []

    if key in cache:
        return cache[key]

    text = key
    if len(text) > MAX_CHARS:
        text = text[:MAX_CHARS]

    prompt = (
        "Extract all follow-up or suggested questions from the text below.\n"
        "Return ONLY a JSON array of strings.\n"
        "If none, return [].\n\n"
        "Text:\n"
        '"""\n'
        f"{text}\n"
        '"""\n'
    )

    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "temperature": 0
    }

    try:
        resp = requests.post(OLLAMA_URL, json=payload, timeout=300)
        resp.raise_for_status()
        data = resp.json()
        out = data.get("response", "").strip()
    except Exception as e:
        print("‚ö†Ô∏è Ollama request failed:", e)
        cache[key] = []
        return []

    # Try to extract JSON array safely
    suggestions: List[str] = []
    try:
        start = out.find("[")
        end = out.rfind("]") + 1
        if start != -1 and end != -1:
            arr = json.loads(out[start:end])
            if isinstance(arr, list):
                suggestions = [s.strip() for s in arr if isinstance(s, str) and s.strip()]
    except Exception:
        suggestions = []

    cache[key] = suggestions
    return suggestions

# ---------- Write outputs ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    output_dir.mkdir(parents=True, exist_ok=True)

    cache: Dict[str, List[str]] = {}

    # Precompute suggestions
    all_suggestions = []
    for q, a in pairs:
        sugg = extract_suggestions_with_ai(a, cache)
        all_suggestions.append(sugg)

    # questions_only.md
    questions_only = output_dir / "questions_only.md"
    with questions_only.open("w", encoding="utf-8") as f:
        f.write("# Suggested / Follow-up Questions (AI Extracted)\n\n")

        for i, ((q, a), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
            if not suggestions:
                continue

            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")

            for j, s in enumerate(suggestions, 1):
                f.write(f"- Q{i}.{j} {s}\n")

            f.write("\n")

    # Individual Q&A files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
        out = output_dir / f"Q{i:03d}.md"
        with out.open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                for j, s in enumerate(suggestions, 1):
                    f.write(f"- Q{i}.{j} {s}\n")

# ---------- Main ----------

def main():
    input_path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(input_path)

    if not file_path.exists():
        print("‚ùå File not found!")
        return

    text = file_path.read_text(encoding="utf-8", errors="ignore")
    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected.")
        return

    base_dir = Path(r"D:\Balaji-workbench\synthetic data")
    run_name = datetime.now().strftime("output_%Y%m%d_%H%M%S")
    output_dir = base_dir / run_name

    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Processed {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

if __name__ == "__main__":
    main()
