In [9]:
from pathlib import Path
import re
from typing import List, Tuple
from markdown_it import MarkdownIt

# ---------- Patterns for common conversation formats ----------

# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# The heading-style patterns end in \b so that headings such as "## Your options"
# or "## Users" inside an answer are not mistaken for the start of a user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Same idea for the assistant side ("## Assistants compared" must not match).
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Core Parsing ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. A pair is
    closed when a user marker follows an assistant turn and once more at end of
    input; pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        stripped = line.strip()

        user_marker = USER_RE.match(stripped)
        if user_marker:
            # Only a user marker that follows an assistant turn closes the pair;
            # consecutive user markers keep accumulating into the same question.
            if role == "assistant":
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(stripped, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(stripped)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(stripped, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- Suggested Question Extraction ----------

def extract_suggested_questions(text: str) -> List[str]:
    """
    Return the bullet items of ``text`` that look like follow-up suggestions.

    A line counts as a bullet only when a marker is followed by whitespace, so
    bold lines (``**Is this bold?**``) and horizontal rules (``---``) are no longer
    picked up. A bullet is kept when it contains a "?" or starts with one of the
    action verbs below. Items are returned without their marker, in document order.
    """
    bullet = re.compile(r"^(?:-|\*|‚Ä¢)\s+")
    action_verbs = ("add", "design", "show", "map", "convert", "draw", "build", "create", "explain")

    suggestions = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        marker = bullet.match(line)
        if not marker:
            continue

        # Bullet points or suggestion-like lines
        clean = line[marker.end():]
        if "?" in clean or clean.lower().startswith(action_verbs):
            suggestions.append(clean)

    return suggestions

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write the extracted conversation as Markdown files inside ``output_dir``.

    Produces ``questions_only.md`` (each user question followed by the suggestions
    found in its answer) and one ``Q001.md``, ``Q002.md``, ... file per pair holding
    the question and the full answer. Files with those names are overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples in conversation order.
        output_dir: Target directory; created together with any missing parents.
    """
    # parents=True: a nested or not-yet-existing path must not raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # questions_only.md
    with (output_dir / "questions_only.md").open("w", encoding="utf-8") as f:
        f.write("# Questions + Suggested Questions\n\n")

        for i, (q, a) in enumerate(pairs, 1):
            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")

            suggestions = extract_suggested_questions(a)
            if suggestions:
                f.write("### Suggested follow-up questions\n\n")
                f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))
                f.write("\n")

    # Individual QA files
    for i, (q, a) in enumerate(pairs, 1):
        with (output_dir / f"Q{i:03d}.md").open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n")

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them out.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected. The format may be unknown.")
        return

    # NOTE(review): machine-specific absolute path; consider deriving it from file_path.
    output_dir = Path(r"D:\Balaji-workbench\synthetic data")

    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Extracted {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()


‚úÖ Done!
üìÑ Extracted 9 Q&A pairs
üìÅ Output folder: D:\Balaji-workbench\synthetic data


In [1]:
from pathlib import Path
import re
from typing import List, Tuple

# ---------- Patterns for common conversation formats ----------

# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# Heading-style patterns end in \b so headings like "## Your options" or "## Users"
# inside an answer are not mistaken for a new user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Assistant-side markers; \b keeps "## Assistants compared" from matching.
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Core Parsing ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. In this
    variant every user marker after the first speaker marker closes the current
    pair, so two user markers in a row yield two pairs (the first with an empty
    answer). Pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        line_stripped = line.strip()

        user_marker = USER_RE.match(line_stripped)
        if user_marker:
            # Any user marker once a speaker is active starts a fresh pair.
            if role is not None:
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(line_stripped, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(line_stripped)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(line_stripped, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- Accurate Suggested Block Extraction ----------

def extract_suggested_block(text: str) -> List[str]:
    """
    Return the bullet lines of the first "suggestions" block in ``text``.

    A block is a line containing one of the trigger phrases ("if you want",
    "next i can", "suggest", "follow-up"/"follow up"/"followup") followed by a run of
    bullet lines, e.g.

        If you want, next I can:

        * ...
        * ...

    Blank lines between the trigger line and the first bullet are skipped, which is
    how Markdown normally separates an intro sentence from its list. A bullet needs
    a marker followed by whitespace, so bold lines (``**text**``) do not count.
    Collection stops at the first line that is not a bullet; if a trigger line has
    no bullets, the search continues with the next trigger line.

    Returns:
        The stripped bullet lines, markers included; empty list when none is found.
    """
    trigger = re.compile(r"(if you want|next i can|suggest|follow[- ]?up)", re.IGNORECASE)
    bullet = re.compile(r"^(?:\*|-|‚Ä¢)\s+")
    lines = text.splitlines()

    for index, line in enumerate(lines):
        if not trigger.search(line):
            continue

        collected = []
        for candidate in lines[index + 1:]:
            stripped = candidate.strip()
            if not stripped and not collected:
                # Blank separator between the intro sentence and its list.
                continue
            if bullet.match(stripped):
                collected.append(stripped)
            else:
                break

        if collected:
            return collected

    return []

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write ``questions_only.md`` plus one ``Q###.md`` file per pair into ``output_dir``.

    ``questions_only.md`` lists each user question followed by the bullets returned
    by extract_suggested_block for its answer (bullet marker removed, numbered
    ``Q<i>.<j>``). Each ``Q###.md`` holds the question and the full answer.
    The directory is created, including parents, when missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # -------- questions_only.md --------
    parts = ["# Questions\n\n"]
    for number, (question, answer) in enumerate(pairs, 1):
        parts.append(f"## Q{number}\n")
        parts.append(question.strip() + "\n\n")

        for sub_number, bullet_line in enumerate(extract_suggested_block(answer), 1):
            clean = re.sub(r"^[\-\*‚Ä¢]\s*", "", bullet_line)
            parts.append(f"Q{number}.{sub_number} {clean}\n")

        parts.append("\n")
    (output_dir / "questions_only.md").write_text("".join(parts), encoding="utf-8")

    # -------- Individual QA files --------
    for number, (question, answer) in enumerate(pairs, 1):
        document = (
            f"# Question {number}\n\n"
            "## User Question\n\n"
            + question.strip() + "\n\n"
            + "## Assistant Answer\n\n"
            + answer.strip() + "\n"
        )
        (output_dir / f"Q{number:03d}.md").write_text(document, encoding="utf-8")

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them out.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected. The format may be unknown.")
        return

    # NOTE(review): machine-specific absolute path; consider deriving it from file_path.
    base_dir = Path(r"D:\Balaji-workbench\synthetic data")
    output_dir = base_dir / "output"

    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Extracted {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()


‚úÖ Done!
üìÑ Extracted 10 Q&A pairs
üìÅ Output folder: D:\Balaji-workbench\synthetic data\output


In [4]:
from pathlib import Path
import re
from typing import List, Tuple

# ---------- Patterns for common conversation formats ----------

# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# Heading-style patterns end in \b so headings like "## Your options" or "## Users"
# inside an answer are not mistaken for a new user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Assistant-side markers; \b keeps "## Assistants compared" from matching.
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Conversation Splitter ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. A pair is
    closed when a user marker follows an assistant turn and once more at end of
    input; pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        stripped = line.strip()

        user_marker = USER_RE.match(stripped)
        if user_marker:
            # Only a user marker that follows an assistant turn closes the pair;
            # consecutive user markers keep accumulating into the same question.
            if role == "assistant":
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(stripped, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(stripped)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(stripped, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- Strong Suggested Question Extraction ----------

def extract_suggested_questions(answer_text: str) -> List[str]:
    """
    Collect the questions found in an assistant answer, sorted alphabetically.

    Two sources are combined:
      1. bullet / numbered list items that contain a "?" (marker removed);
      2. sentences of the whole text that end in "?" and are longer than 5 characters.

    A leading list marker is stripped from sentences too, so a list question is not
    reported twice ("- What?" and "What?"). Whitespace is collapsed before
    de-duplication, so variants differing only in spacing merge into one entry.

    Returns:
        Sorted list of unique question strings.
    """
    list_marker = re.compile(r"^(?:[-*‚Ä¢]|\d+[\.\)])\s+")

    def normalize(fragment: str) -> str:
        return re.sub(r"\s+", " ", fragment).strip()

    found = set()

    # 1) Bullet points and numbered lists
    for raw_line in answer_text.splitlines():
        line = raw_line.strip()
        marker = list_marker.match(line)
        if marker and "?" in line[marker.end():]:
            found.add(normalize(line[marker.end():]))

    # 2) Any sentence that ends with ?
    for sentence in re.split(r"(?<=[.!?])\s+", answer_text):
        sentence = sentence.strip()
        if sentence.endswith("?") and len(sentence) > 5:
            found.add(normalize(list_marker.sub("", sentence)))

    return sorted(found)

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write ``questions_only.md`` plus one ``Q###.md`` file per pair into ``output_dir``.

    ``questions_only.md`` lists every user question followed by the follow-up
    questions extracted from its answer. Each ``Q###.md`` holds the question, the
    full answer and, when any were found, the same follow-up list. Files with those
    names are overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples in conversation order.
        output_dir: Target directory; created together with any missing parents.
    """
    # parents=True: a nested or not-yet-existing path must not raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract once per answer; both outputs below reuse the same lists.
    all_suggestions = [extract_suggested_questions(a) for _, a in pairs]

    # questions_only.md
    with (output_dir / "questions_only.md").open("w", encoding="utf-8") as f:
        f.write("# Questions and Follow-up Questions\n\n")

        for i, ((q, _), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")

            f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))
            if suggestions:
                f.write("\n")

    # Individual QA files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
        with (output_dir / f"Q{i:03d}.md").open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them
    to an ``output_md`` folder next to the input file.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected. The format may be unknown.")
        return

    output_dir = file_path.parent / "output_md"
    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Extracted {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()



‚úÖ Done!
üìÑ Extracted 9 Q&A pairs
üìÅ Output folder: C:\Users\GRL\Downloads\output_md


In [1]:
from pathlib import Path
import re
from typing import List, Tuple

# ---------- Patterns for common conversation formats ----------

# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# Heading-style patterns end in \b so headings like "## Your options" or "## Users"
# inside an answer are not mistaken for a new user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Assistant-side markers; \b keeps "## Assistants compared" from matching.
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Conversation Splitter ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. A pair is
    closed when a user marker follows an assistant turn and once more at end of
    input; pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        line_stripped = line.strip()

        user_marker = USER_RE.match(line_stripped)
        if user_marker:
            # Only a user marker that follows an assistant turn closes the pair;
            # consecutive user markers keep accumulating into the same question.
            if role == "assistant":
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(line_stripped, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(line_stripped)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(line_stripped, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- Aggressive Suggested Question Extraction ----------

# Lines that begin with one of these verbs (as a whole word, any case) are
# treated as task-like follow-up suggestions.
VERB_START_RE = re.compile(
    r"^(add|design|show|explain|compare|build|create|implement|analyze|generate|map|convert|optimize|test|debug|evaluate|extend|improve|refactor|integrate|deploy|document|benchmark|profile|secure|validate)\b",
    re.IGNORECASE
)

def extract_suggested_questions(answer_text: str) -> List[str]:
    """
    Collect candidate follow-up prompts from an assistant answer, in first-seen order.

    Candidates are gathered in this order:
      1. every bullet / numbered list item (marker removed) longer than 3 characters;
      2. every sentence ending in "?" that is longer than 5 characters, with a
         leading list marker removed so a list question is not reported twice;
      3. every non-empty line that starts with a verb from VERB_START_RE.

    Whitespace is then collapsed, entries shorter than 5 characters are dropped and
    duplicates are removed case-insensitively.
    """
    list_marker = re.compile(r"^([-*‚Ä¢]|\d+[\.\)])\s+")
    lines = [l.strip() for l in answer_text.splitlines() if l.strip()]
    candidates = []

    # 1) Bullet points and numbered lists (ALWAYS include)
    for line in lines:
        marker = list_marker.match(line)
        if marker:
            item = line[marker.end():].strip()
            if len(item) > 3:
                candidates.append(item)

    # 2) Any sentence that ends with ?
    for sentence in re.split(r"(?<=[.!?])\s+", answer_text):
        sentence = sentence.strip()
        if sentence.endswith("?") and len(sentence) > 5:
            candidates.append(list_marker.sub("", sentence))

    # 3) Lines that look like tasks / follow-ups (verb at start)
    candidates.extend(line for line in lines if VERB_START_RE.match(line))

    # No separate "list items after a trigger header" pass: every list item is
    # already collected by pass 1, so such a pass could only add duplicates.

    # Clean, deduplicate, normalize
    cleaned = []
    seen = set()
    for candidate in candidates:
        candidate = re.sub(r"\s+", " ", candidate).strip()
        if len(candidate) < 5:
            continue
        key = candidate.lower()
        if key not in seen:
            seen.add(key)
            cleaned.append(candidate)

    return cleaned

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write ``questions_only.md`` plus one ``Q###.md`` file per pair into ``output_dir``.

    ``questions_only.md`` lists every user question followed by the suggestions
    extracted from its answer. Each ``Q###.md`` holds the question, the full answer
    and, when any were found, the same suggestion list. Files with those names are
    overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples in conversation order.
        output_dir: Target directory; created together with any missing parents.
    """
    # parents=True: a nested or not-yet-existing path must not raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract once per answer; both outputs below reuse the same lists.
    all_suggestions = [extract_suggested_questions(a) for _, a in pairs]

    # questions_only.md
    with (output_dir / "questions_only.md").open("w", encoding="utf-8") as f:
        f.write("# Questions and Follow-up Questions\n\n")

        for i, ((q, _), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")

            f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))
            if suggestions:
                f.write("\n")

    # Individual QA files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
        with (output_dir / f"Q{i:03d}.md").open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them
    to an ``output_md`` folder next to the input file.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected. The format may be unknown.")
        return

    output_dir = file_path.parent / "output_md"
    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Extracted {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()


‚úÖ Done!
üìÑ Extracted 9 Q&A pairs
üìÅ Output folder: C:\Users\GRL\Downloads\output_md


In [2]:
from pathlib import Path
import re
from typing import List, Tuple

# ---------- Patterns for common conversation formats ----------

# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# Heading-style patterns end in \b so headings like "## Your options" or "## Users"
# inside an answer are not mistaken for a new user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Assistant-side markers; \b keeps "## Assistants compared" from matching.
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Conversation Splitter ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. A pair is
    closed when a user marker follows an assistant turn and once more at end of
    input; pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        line_stripped = line.strip()

        user_marker = USER_RE.match(line_stripped)
        if user_marker:
            # Only a user marker that follows an assistant turn closes the pair;
            # consecutive user markers keep accumulating into the same question.
            if role == "assistant":
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(line_stripped, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(line_stripped)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(line_stripped, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- Precise Suggested Question Extraction ----------

# Phrases (lower-case) that mark the start of a "suggested questions" section.
SECTION_TRIGGERS = [
    "suggested questions",
    "follow-up questions",
    "follow up questions",
    "next questions",
    "you can ask",
    "you might ask",
    "consider asking",
]

def extract_suggested_questions(answer_text: str) -> List[str]:
    """
    Pull question-like suggestions out of an assistant answer, in first-seen order.

    Blank lines are skipped. For every remaining (stripped) line:
      * a line containing a SECTION_TRIGGERS phrase opens a suggestion section and
        is itself not captured;
      * a bullet / numbered item is captured (marker removed) when it is longer than
        5 characters and contains a "?", whether or not a section is open;
      * inside an open section, any other line ending in "?" is captured; the first
        line that does not closes the section.

    Duplicates are suppressed case-insensitively.
    """
    found: List[str] = []
    seen_keys = set()

    def remember(candidate: str) -> None:
        lowered = candidate.lower()
        if lowered in seen_keys:
            return
        seen_keys.add(lowered)
        found.append(candidate)

    in_section = False

    for raw in answer_text.splitlines():
        entry = raw.strip()
        if not entry:
            continue

        lowered_entry = entry.lower()

        # Section header: switch capture on and move to the next line
        if any(trigger in lowered_entry for trigger in SECTION_TRIGGERS):
            in_section = True
            continue

        # List items are judged on their own and never change the section state
        if re.match(r"^[-*‚Ä¢]\s+", entry) or re.match(r"^\d+[\.\)]\s+", entry):
            body = re.sub(r"^([-*‚Ä¢]|\d+[\.\)])\s*", "", entry).strip()
            if len(body) > 5 and "?" in body:
                remember(body)
            continue

        if not in_section:
            continue

        if entry.endswith("?"):
            remember(entry)
        else:
            # an ordinary paragraph ends the section
            in_section = False

    return found

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write ``questions_only.md`` plus one ``Q###.md`` file per pair into ``output_dir``.

    ``questions_only.md`` contains only the pairs for which follow-up questions were
    found: the user question followed by those follow-ups. Each ``Q###.md`` holds
    the question, the full answer and, when any were found, the same follow-up
    list. Files with those names are overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples in conversation order.
        output_dir: Target directory; created together with any missing parents.
    """
    # parents=True: a nested or not-yet-existing path must not raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract once per answer; both outputs below reuse the same lists.
    all_suggestions = [extract_suggested_questions(a) for _, a in pairs]

    # questions_only.md
    with (output_dir / "questions_only.md").open("w", encoding="utf-8") as f:
        f.write("# Suggested / Follow-up Questions Only\n\n")

        for i, ((q, _), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
            if not suggestions:
                continue

            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")
            f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))
            f.write("\n")

    # Individual QA files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
        with (output_dir / f"Q{i:03d}.md").open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them out.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected. The format may be unknown.")
        return

    # NOTE(review): machine-specific absolute path; consider deriving it from file_path.
    output_dir = Path(r"D:\Balaji-workbench\synthetic data")

    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Processed {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()


‚úÖ Done!
üìÑ Processed 9 Q&A pairs
üìÅ Output folder: D:\Balaji-workbench\synthetic data


In [4]:
from pathlib import Path
import re
from typing import List, Tuple

# ---------- Role markers (supports many formats) ----------

# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# Heading-style patterns end in \b so headings like "## Your options" or "## Users"
# inside an answer are not mistaken for a new user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Assistant-side markers; \b keeps "## Assistants compared" from matching.
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Split conversation into (user, assistant) pairs ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. A pair is
    closed when a user marker follows an assistant turn and once more at end of
    input; pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        s = line.strip()

        user_marker = USER_RE.match(s)
        if user_marker:
            # Only a user marker that follows an assistant turn closes the pair;
            # consecutive user markers keep accumulating into the same question.
            if role == "assistant":
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(s, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(s)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(s, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- Extract ONLY suggested / follow-up questions ----------

# Phrases (lower-case) that mark the start of a "suggested questions" section.
SECTION_TRIGGERS = [
    "suggested questions",
    "follow-up questions",
    "follow up questions",
    "next questions",
    "you can ask",
    "you might ask",
    "consider asking",
]

def extract_suggested_questions(answer_text: str) -> List[str]:
    """
    Pull genuine questions out of an assistant answer, in first-seen order.

    Blank lines are skipped. For every remaining (stripped) line:
      * a line containing a SECTION_TRIGGERS phrase opens a suggestion section and
        is itself not captured;
      * a bullet / numbered item is captured (marker removed) only when it ends in
        "?", whether or not a section is open;
      * inside an open section, any other line ending in "?" is captured; the first
        line that does not closes the section.

    Duplicates are suppressed case-insensitively.
    """
    questions: List[str] = []
    known = set()
    collecting = False

    for text_line in (piece.strip() for piece in answer_text.splitlines()):
        if not text_line:
            continue

        candidate = None
        lowered = text_line.lower()

        if any(phrase in lowered for phrase in SECTION_TRIGGERS):
            # Section header: capture mode on, header itself is not a question
            collecting = True
        elif re.match(r"^[-*‚Ä¢]\s+", text_line) or re.match(r"^\d+[\.\)]\s+", text_line):
            # List item: keep ONLY real questions; section state is left untouched
            item = re.sub(r"^([-*‚Ä¢]|\d+[\.\)])\s*", "", text_line).strip()
            if item.endswith("?"):
                candidate = item
        elif collecting:
            if text_line.endswith("?"):
                candidate = text_line
            else:
                # normal text resumes, so the section is over
                collecting = False

        if candidate is not None and candidate.lower() not in known:
            known.add(candidate.lower())
            questions.append(candidate)

    return questions

# ---------- Write outputs ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write ``questions_only.md`` plus one ``Q###.md`` file per pair into ``output_dir``.

    ``questions_only.md`` contains only the pairs for which follow-up questions were
    found: the user question followed by those follow-ups. Each ``Q###.md`` holds
    the question, the full answer and, when any were found, the same follow-up
    list. Files with those names are overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples in conversation order.
        output_dir: Target directory; created together with any missing parents.
    """
    # parents=True: a nested or not-yet-existing path must not raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract once per answer; both outputs below reuse the same lists.
    all_suggestions = [extract_suggested_questions(a) for _, a in pairs]

    # questions_only.md
    with (output_dir / "questions_only.md").open("w", encoding="utf-8") as f:
        f.write("# Suggested / Follow-up Questions Only\n\n")

        for i, ((q, _), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
            if not suggestions:
                continue

            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")
            f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))
            f.write("\n")

    # Individual Q&A files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, all_suggestions), 1):
        with (output_dir / f"Q{i:03d}.md").open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                f.writelines(f"- Q{i}.{j} {s}\n" for j, s in enumerate(suggestions, 1))

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them out.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)
    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected.")
        return

    # NOTE(review): machine-specific absolute path; consider deriving it from file_path.
    output_dir = Path(r"D:\Balaji-workbench\synthetic data")
    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Processed {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()


‚úÖ Done!
üìÑ Processed 9 Q&A pairs
üìÅ Output folder: D:\Balaji-workbench\synthetic data


In [None]:
from pathlib import Path
import re
import json
import requests
from typing import List, Tuple, Dict

# ---------- Ollama Config ----------
# Endpoint on localhost that extract_suggestions_with_ai POSTs its prompt to,
# and the model name sent in that request.
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "qwen2.5:3b"  # faster model

# Answers longer than this many characters are truncated before being put in the prompt.
MAX_CHARS = 2000  # limit text sent to model for speed

# ---------- Role markers ----------
# Speaker markers, matched (case-insensitively) against the start of a stripped line.
# Heading-style patterns end in \b so headings like "## Your options" or "## Users"
# inside an answer are not mistaken for a new user turn.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

# Assistant-side markers; \b keeps "## Assistants compared" from matching.
ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

# One alternation per role so a single .match() call tests every format.
USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Split conversation ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """
    Split a Markdown chat transcript into (user_text, assistant_text) pairs.

    Each line is stripped and tested against USER_RE / ASSISTANT_RE; a match switches
    the current speaker. Lines seen before the first marker are ignored. A pair is
    closed when a user marker follows an assistant turn and once more at end of
    input; pairs whose both sides are empty are dropped.

    Text after a colon-style marker on the same line (``User: hello``) is kept as the
    first line of that turn instead of being discarded together with the marker.
    Text trailing a ``#`` heading marker is treated as part of the heading and ignored.

    Args:
        md_text: Full transcript text.

    Returns:
        (user_text, assistant_text) tuples in document order, each side stripped
        of surrounding whitespace.
    """
    pairs: List[Tuple[str, str]] = []
    user_lines: List[str] = []
    assistant_lines: List[str] = []
    role = None

    def joined(chunk: List[str]) -> str:
        return "\n".join(chunk).strip()

    def inline_text(stripped: str, marker_match) -> str:
        # Heading markers ("## User ...") carry no message text of their own.
        if stripped.startswith("#"):
            return ""
        return stripped[marker_match.end():].strip()

    for line in md_text.splitlines():
        s = line.strip()

        user_marker = USER_RE.match(s)
        if user_marker:
            # Only a user marker that follows an assistant turn closes the pair;
            # consecutive user markers keep accumulating into the same question.
            if role == "assistant":
                pairs.append((joined(user_lines), joined(assistant_lines)))
                user_lines, assistant_lines = [], []
            role = "user"
            rest = inline_text(s, user_marker)
            if rest:
                user_lines.append(rest)
            continue

        assistant_marker = ASSISTANT_RE.match(s)
        if assistant_marker:
            role = "assistant"
            rest = inline_text(s, assistant_marker)
            if rest:
                assistant_lines.append(rest)
            continue

        if role == "user":
            user_lines.append(line)
        elif role == "assistant":
            assistant_lines.append(line)

    # Close whatever turn was still open at end of input.
    pairs.append((joined(user_lines), joined(assistant_lines)))
    return [(u, a) for u, a in pairs if u or a]

# ---------- AI Extraction (with caching) ----------

def extract_suggestions_with_ai(answer_text: str, cache: Dict[str, List[str]]) -> List[str]:
    """
    Ask the model at OLLAMA_URL for the follow-up questions contained in an answer.

    The stripped answer is the cache key; a cached result is returned without a
    request. Otherwise at most MAX_CHARS characters of the answer are embedded in
    the prompt, the request is POSTed, and the first "[" ... last "]" span of the
    model's reply is parsed as a JSON array. Non-string and blank entries are
    dropped. A reply without a parsable array yields [] (which is cached as well).

    Args:
        answer_text: Assistant answer to analyse.
        cache: Mutable mapping of stripped answer text to extracted questions;
            updated in place.

    Returns:
        List of stripped question strings (possibly empty).

    Raises:
        Whatever ``requests.post`` / ``raise_for_status`` raise on connection or
        HTTP errors; these are not caught here.
    """
    key = answer_text.strip()
    if key in cache:
        return cache[key]

    text = key[:MAX_CHARS]

    prompt = f"""
Extract all follow-up or suggested questions from the text below.
Return ONLY a JSON array of strings.
If none, return [].

Text:
\"\"\"
{text}
\"\"\"
"""

    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        # Sampling parameters belong under "options"; a top-level "temperature"
        # is not a field of the generate request.
        "options": {"temperature": 0},
    }

    resp = requests.post(OLLAMA_URL, json=payload, timeout=300)
    resp.raise_for_status()

    data = resp.json()
    out = data.get("response", "").strip()

    suggestions = []
    start = out.find("[")
    end = out.rfind("]")
    if 0 <= start < end:
        try:
            arr = json.loads(out[start:end + 1])
        except ValueError:
            # Model wrapped or truncated the array; treat as "nothing found".
            arr = None
        if isinstance(arr, list):
            suggestions = [s.strip() for s in arr if isinstance(s, str) and s.strip()]

    cache[key] = suggestions
    return suggestions

# ---------- Write outputs ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """
    Write ``questions_only.md`` plus one ``Q###.md`` file per pair into ``output_dir``.

    Suggestions are obtained once per answer through extract_suggestions_with_ai
    (sharing a single cache dict) before any file is written.
    ``questions_only.md`` contains only the pairs that produced suggestions; each
    ``Q###.md`` holds the question, the full answer and, when present, the
    suggestion list. The directory is created, including parents, when missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    cache: Dict[str, List[str]] = {}

    # One extraction per answer, in order, reused by both outputs below
    all_suggestions = [extract_suggestions_with_ai(answer, cache) for _question, answer in pairs]

    def numbered(prefix: int, items: List[str]) -> str:
        return "".join(f"- Q{prefix}.{k} {item}\n" for k, item in enumerate(items, 1))

    # questions_only.md
    overview = ["# Suggested / Follow-up Questions (AI Extracted)\n\n"]
    for number, ((question, _answer), found) in enumerate(zip(pairs, all_suggestions), 1):
        if found:
            overview.append(f"## Q{number}\n\n")
            overview.append(question.strip() + "\n\n")
            overview.append(numbered(number, found))
            overview.append("\n")
    (output_dir / "questions_only.md").write_text("".join(overview), encoding="utf-8")

    # Individual Q&A files
    for number, ((question, answer), found) in enumerate(zip(pairs, all_suggestions), 1):
        document = (
            f"# Question {number}\n\n"
            "## User Question\n\n"
            + question.strip() + "\n\n"
            + "## Assistant Answer\n\n"
            + answer.strip() + "\n\n"
        )
        if found:
            document += "## Suggested / Follow-up Questions\n\n" + numbered(number, found)
        (output_dir / f"Q{number:03d}.md").write_text(document, encoding="utf-8")

# ---------- Main ----------

def main():
    """
    Prompt for a conversation Markdown file, split it into Q&A pairs and write them out.

    Prints a message and returns early when the entered path is not an existing
    file or when no conversation blocks are detected.
    """
    input_path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(input_path)

    # is_file() rather than exists(): an empty answer becomes Path(".") and a
    # directory path also "exists" -- both would crash in read_text() below.
    if not file_path.is_file():
        print("‚ùå File not found!")
        return

    # Undecodable bytes are dropped instead of aborting the run.
    text = file_path.read_text(encoding="utf-8", errors="ignore")
    pairs = split_conversation(text)

    if not pairs:
        print("‚ö†Ô∏è No conversation blocks detected.")
        return

    # NOTE(review): machine-specific absolute path; consider deriving it from file_path.
    output_dir = Path(r"D:\Balaji-workbench\synthetic data")
    write_outputs(pairs, output_dir)

    print("‚úÖ Done!")
    print(f"üìÑ Processed {len(pairs)} Q&A pairs")
    print(f"üìÅ Output folder: {output_dir}")

# Entry point: run main() when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    main()
