In [1]:
from pathlib import Path
import re
from typing import List, Tuple

# ---------- Patterns for common conversation formats ----------

# Role-marker patterns.  Each alternative is anchored at the start of the
# line and the compiled expressions are case-insensitive.
#
# The "## <role>" heading forms end in \b so that an ordinary heading that
# merely starts with a role word ("## Your options", "## Users") is not
# mistaken for a change of speaker.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*User:\*\*",
    r"^User:",
    r"^You:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*Assistant:\*\*",
    r"^Assistant:",
    r"^ChatGPT:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Conversation Splitter ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """Break a markdown chat transcript into (user, assistant) text pairs.

    Each line is stripped and tested against USER_RE first, then
    ASSISTANT_RE.  A marker line only switches the current speaker; the
    marker line itself is never stored.  Every other line is appended
    (unstripped) to the buffer of the current speaker, and lines seen before
    any marker are ignored.

    A pair is closed when a user marker follows assistant text, and once
    more at the end of the input.  Pairs whose two sides are both empty
    after trimming are not returned.

    Args:
        md_text: Full transcript text.

    Returns:
        A list of (user_text, assistant_text) tuples, each side trimmed.
    """
    turns: List[Tuple[str, str]] = []
    buffers = {"user": [], "assistant": []}
    speaker = None

    def emit() -> None:
        # Close the pair collected so far and start with fresh buffers.
        question = "\n".join(buffers["user"]).strip()
        answer = "\n".join(buffers["assistant"]).strip()
        if question or answer:
            turns.append((question, answer))
        buffers["user"] = []
        buffers["assistant"] = []

    for raw in md_text.splitlines():
        probe = raw.strip()

        if USER_RE.match(probe):
            # A new user turn after an answer completes the previous pair.
            if speaker == "assistant":
                emit()
            speaker = "user"
        elif ASSISTANT_RE.match(probe):
            speaker = "assistant"
        elif speaker is not None:
            buffers[speaker].append(raw)

    emit()
    return turns

# ---------- Suggested Question Extraction ----------

# Phrases that introduce a list of follow-up suggestions in an assistant
# answer.  extract_suggested_questions looks for them as substrings of each
# answer line.
#
# The typographic apostrophe is written as the escape \u2019 so the phrases
# stay intact even if this file is re-saved or read with the wrong encoding.
SECTION_TRIGGERS = [
    "suggested questions",
    "follow-up questions",
    "follow up questions",
    "next questions",
    "you can ask",
    "you might ask",
    "consider asking",
    "if you want next",
    "i can:",
    "If you want, we can",
    "If you\u2019d like, we can",
    "If it helps, we can",
    "Whenever you want, we can",
    "When you\u2019re ready, we can",
    "Next, we can",
    "Next steps",
    "From here, we can",
    "Moving forward, we can",
    "Going forward, we can",
    "Here\u2019s what we can do next",
    "Let\u2019s continue with",
    "Let\u2019s move on to",
    "Let\u2019s start with",
    "Let\u2019s begin with",
    "Let\u2019s go with",
    "We could also",
    "You can also",
    "Another option is",
    "Alternatively",
    "Or we can",
    "You might want to",
    "Feel free to",
    "I suggest we",
    "I recommend we",
    "A good next step is",
    "The best next step is",
    "We should",
    "Let\u2019s try",
    "Let\u2019s explore",
    "Let\u2019s build",
    "Let\u2019s design",
    "Let\u2019s define",
    "Let\u2019s draft",

    # Slightly more formal / structured style
    "Subsequent steps include",
    "The next phase involves",
    "The following steps are proposed",
    "Proposed next steps",
    "Continuation options include",
    "The process can proceed by",
    "The workflow continues with",
    "Next actions",
    "Action items",
    "Roadmap items",
    "Planned next steps",
    "Implementation steps",

    # Very short / casual triggers (use with caution)
    "Next:",
    "Continue:",
    "Proceed:",
    "Start:",
    "Choose:",
    "Pick one:",
    "Select an option:",
    "Go ahead:",
    "Try this:",
    "Explore:",
    "More:",
    "See options:",
    "Want to:",
    "How about",
    "What do you want to do next?",
    "Where should we go next?",
    "Your move:",
    "Ready to:",
    "Shall we:",
]

def strip_markdown(text: str) -> str:
    """Return *text* without emphasis markers and without outer whitespace.

    Every "*" and "_" character is deleted wherever it occurs (so
    ``**bold**`` becomes ``bold`` and ``snake_case`` becomes ``snakecase``),
    then leading and trailing whitespace is trimmed.
    """
    without_markers = text.translate(str.maketrans("", "", "*_"))
    return without_markers.strip()

def extract_suggested_questions(answer_text: str) -> List[str]:
    """Collect the follow-up suggestions offered in an assistant answer.

    The answer is scanned line by line; blank lines are ignored.  A line
    containing any SECTION_TRIGGERS phrase turns capture on and is itself
    skipped.  While capture is on, every bullet ("-", "*", "\u2022") or
    numbered ("1." / "1)") list item is cleaned with strip_markdown, loses a
    leading "or ", and is rewritten as "Can you <item>?".  The first
    non-list line turns capture off again.

    Args:
        answer_text: The assistant's reply as markdown text.

    Returns:
        The generated questions in order of appearance, de-duplicated
        case-insensitively.  Items of five characters or fewer are dropped.
    """
    def _norm(s: str) -> str:
        # Compare case-insensitively and treat the typographic apostrophe
        # like the ASCII one, so "Let's try" matches in either spelling.
        return s.lower().replace("\u2019", "'")

    # Normalize the triggers once.  Without this, any trigger containing an
    # upper-case letter could never be found in the lower-cased line.
    triggers = [_norm(t) for t in SECTION_TRIGGERS]

    # List marker: "-", "*", "\u2022" or "<digits>." / "<digits>)" followed
    # by whitespace.  The bullet is spelled as an escape so the pattern does
    # not depend on the encoding this file is saved in.
    marker_re = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+")

    suggestions: List[str] = []
    seen = set()
    capturing = False

    for line in answer_text.splitlines():
        if not line.strip():
            continue
        line = line.rstrip()

        # Section headers such as "If you want next, I can:".
        probe = _norm(line.strip())
        if any(t in probe for t in triggers):
            capturing = True
            continue

        if not capturing:
            continue

        marker = marker_re.match(line)
        if marker is None:
            # Leaving the list ends the suggestion section.
            capturing = False
            continue

        item = strip_markdown(line[marker.end():].strip())

        # Remove leading "or ".
        if item.lower().startswith("or "):
            item = item[3:].strip()

        if len(item) <= 5:
            continue

        # Phrase the item as a question, lower-casing its first letter so it
        # reads naturally after "Can you".
        question = "Can you " + item[0].lower() + item[1:]
        if not question.endswith("?"):
            question += "?"

        key = question.lower()
        if key not in seen:
            seen.add(key)
            suggestions.append(question)

    return suggestions

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """Write the suggestion index and one markdown file per Q&A pair.

    Creates *output_dir* (including missing parent directories) and writes,
    all as UTF-8:

    * ``questions_only.md`` - for each pair that yields suggestions, the
      user question followed by its suggestions.
    * ``Q001.md``, ``Q002.md``, ... - the question, the answer and, when
      present, the suggestions for each pair.

    Pairs are numbered from 1 in list order; a pair keeps its number in the
    index even when earlier pairs are omitted there.  Existing files with
    the same names are overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples.
        output_dir: Directory to write into.
    """
    # parents=True: the target may sit below a folder that does not exist
    # yet; without it mkdir raises FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract once per pair; both outputs below reuse the same result.
    extracted = [extract_suggested_questions(a) for _, a in pairs]

    # questions_only.md
    q_only = output_dir / "questions_only.md"
    with q_only.open("w", encoding="utf-8") as f:
        f.write("# Suggested / Follow-up Questions Only\n\n")

        for i, ((q, _), suggestions) in enumerate(zip(pairs, extracted), 1):
            if not suggestions:
                continue

            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")

            for j, s in enumerate(suggestions, 1):
                f.write(f"- Q{i}.{j} {s}\n")

            f.write("\n")

    # Individual QA files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, extracted), 1):
        out = output_dir / f"Q{i:03d}.md"
        with out.open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                for j, s in enumerate(suggestions, 1):
                    f.write(f"- Q{i}.{j} {s}\n")

# ---------- Main ----------

def main():
    """Prompt for a transcript file and export its Q&A pairs.

    Reads a path from standard input (surrounding whitespace and double
    quotes are removed), decodes the file as UTF-8 while silently dropping
    undecodable bytes, splits it with split_conversation and hands the
    pairs to write_outputs.  The output folder is fixed in the code below.
    Prints a message and returns early when the path is not a file or when
    no conversation blocks are found.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer resolves to the
    # current directory, which exists but cannot be read as text.
    if not file_path.is_file():
        print("❌ File not found!")
        return

    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("⚠️ No conversation blocks detected. The format may be unknown.")
        return

    output_dir = Path(r"D:\Balaji-workbench\synthetic data")

    write_outputs(pairs, output_dir)

    print("✅ Done!")
    print(f"📄 Processed {len(pairs)} Q&A pairs")
    print(f"📁 Output folder: {output_dir}")

# Start the interactive export only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()


✅ Done!
📄 Processed 9 Q&A pairs
📁 Output folder: D:\Balaji-workbench\synthetic data


In [5]:
from pathlib import Path
import re
from typing import List, Tuple
from datetime import datetime          # ← Added this line

# ---------- Patterns for common conversation formats ----------

# Role-marker patterns.  Each alternative is anchored at the start of the
# line and the compiled expressions are case-insensitive.
#
# The "## <role>" heading forms end in \b so that an ordinary heading that
# merely starts with a role word ("## Your options", "## Users") is not
# mistaken for a change of speaker.
USER_PATTERNS = [
    r"^\*\*You:\*\*",
    r"^\*\*You:\s",                    # "**You:" followed by whitespace (no closing "**" right after the colon)
    r"^\*\*User:\*\*",
    r"^\*\*Human:\*\*",
    r"^You:",
    r"^User:",
    r"^##\s*User\b",
    r"^##\s*You\b",
]

ASSISTANT_PATTERNS = [
    r"^\*\*ChatGPT:\*\*",
    r"^\*\*ChatGPT:\s",                # "**ChatGPT:" followed by whitespace (no closing "**" right after the colon)
    r"^\*\*Assistant:\*\*",
    r"^\*\*Grok:\*\*",
    r"^ChatGPT:",
    r"^Assistant:",
    r"^##\s*Assistant\b",
    r"^##\s*ChatGPT\b",
]

USER_RE = re.compile("|".join(USER_PATTERNS), re.IGNORECASE)
ASSISTANT_RE = re.compile("|".join(ASSISTANT_PATTERNS), re.IGNORECASE)

# ---------- Conversation Splitter ----------

def split_conversation(md_text: str) -> List[Tuple[str, str]]:
    """Split a markdown chat transcript into (user, assistant) text pairs.

    A leading YAML front-matter block (first line "---" up to and including
    the next "---" line) is skipped.  After that, each line is stripped and
    tested against USER_RE, then ASSISTANT_RE.  A marker line only switches
    the current speaker and is never stored; every other line is appended
    (unstripped) to the current speaker's text.  Lines seen before the first
    marker are ignored.

    A pair is closed when a user marker follows assistant text, and once
    more at the end of the input.

    Args:
        md_text: Full transcript text.

    Returns:
        A list of (user_text, assistant_text) tuples, each side trimmed;
        pairs with both sides empty are omitted.
    """
    lines = md_text.splitlines()

    blocks = []
    current_role = None
    current_user = []
    current_assistant = []

    def flush():
        # Store the pair collected so far, if it has any lines at all.
        if current_user or current_assistant:
            blocks.append((
                "\n".join(current_user).strip(),
                "\n".join(current_assistant).strip()
            ))

    # Skip YAML front matter only when it is properly terminated.  If the
    # text merely opens with a "---" line and no closing "---" follows, keep
    # everything rather than discarding the whole transcript.
    start = 0
    if lines and lines[0].strip() == "---":
        for idx in range(1, len(lines)):
            if lines[idx].strip() == "---":
                start = idx + 1
                break

    for line in lines[start:]:
        line_stripped = line.strip()

        if USER_RE.match(line_stripped):
            # A new user turn after an answer completes the previous pair.
            if current_role == "assistant":
                flush()
                current_user = []
                current_assistant = []
            current_role = "user"
            continue

        if ASSISTANT_RE.match(line_stripped):
            current_role = "assistant"
            continue

        if current_role == "user":
            current_user.append(line)
        elif current_role == "assistant":
            current_assistant.append(line)

    flush()
    return [(u, a) for u, a in blocks if u.strip() or a.strip()]

# ---------- Suggested Question Extraction ----------

# Phrases that introduce a list of follow-up suggestions in an assistant
# answer.  extract_suggested_questions looks for them, case-insensitively,
# as substrings of each answer line.
#
# The typographic apostrophe is written as the escape \u2019 so the phrases
# stay intact even if this file is re-saved or read with the wrong encoding.
SECTION_TRIGGERS = [
    "suggested questions",
    "follow-up questions",
    "follow up questions",
    "next questions",
    "you can ask",
    "you might ask",
    "consider asking",
    "if you want next",
    "i can:",
    "If you want, we can",
    "If you\u2019d like, we can",
    "If it helps, we can",
    "Whenever you want, we can",
    "When you\u2019re ready, we can",
    "Next, we can",
    "Next steps",
    "From here, we can",
    "Moving forward, we can",
    "Going forward, we can",
    "Here\u2019s what we can do next",
    "Let\u2019s continue with",
    "Let\u2019s move on to",
    "Let\u2019s start with",
    "Let\u2019s begin with",
    "Let\u2019s go with",
    "We could also",
    "You can also",
    "Another option is",
    "Alternatively",
    "Or we can",
    "You might want to",
    "Feel free to",
    "I suggest we",
    "I recommend we",
    "A good next step is",
    "The best next step is",
    "We should",
    "Let\u2019s try",
    "Let\u2019s explore",
    "Let\u2019s build",
    "Let\u2019s design",
    "Let\u2019s define",
    "Let\u2019s draft",
    "Subsequent steps include",
    "The next phase involves",
    "The following steps are proposed",
    "Proposed next steps",
    "Continuation options include",
    "The process can proceed by",
    "The workflow continues with",
    "Next actions",
    "Action items",
    "Roadmap items",
    "Planned next steps",
    "Implementation steps",
    "Next:",
    "Continue:",
    "Proceed:",
    "Start:",
    "Choose:",
    "Pick one:",
    "Select an option:",
    "Go ahead:",
    "Try this:",
    "Explore:",
    "More:",
    "See options:",
    "Want to:",
    "How about",
    "What do you want to do next?",
    "Where should we go next?",
    "Your move:",
    "Ready to:",
    "Shall we:",
]

def strip_markdown(text: str) -> str:
    """Delete all "*" and "_" characters from *text* and trim its ends.

    This removes bold/italic/underline markers; the characters are dropped
    wherever they appear, not only where they wrap a word.
    """
    plain = re.sub(r"[*_]+", "", text)
    return plain.strip()

def extract_suggested_questions(answer_text: str) -> List[str]:
    """Collect the follow-up suggestions offered in an assistant answer.

    The answer is scanned line by line; blank lines are ignored.  A line
    containing any SECTION_TRIGGERS phrase (case-insensitive) turns capture
    on and is itself skipped.  While capture is on, every bullet ("-", "*",
    "\u2022") or numbered ("1." / "1)") list item is cleaned with
    strip_markdown, loses a leading "or ", and is rewritten as
    "Can you <item>?".  The first non-list line turns capture off again.

    Args:
        answer_text: The assistant's reply as markdown text.

    Returns:
        The generated questions in order of appearance, de-duplicated
        case-insensitively.  Items of five characters or fewer are dropped.
    """
    def _norm(s: str) -> str:
        # Compare case-insensitively and treat the typographic apostrophe
        # like the ASCII one, so "Let's try" matches in either spelling.
        return s.lower().replace("\u2019", "'")

    # Normalize the triggers once instead of re-lowercasing them per line.
    triggers = [_norm(t) for t in SECTION_TRIGGERS]

    # List marker: "-", "*", "\u2022" or "<digits>." / "<digits>)" followed
    # by whitespace.  The bullet is spelled as an escape so the pattern does
    # not depend on the encoding this file is saved in.
    marker_re = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+")

    suggestions: List[str] = []
    seen = set()
    capture_mode = False

    for line in answer_text.splitlines():
        if not line.strip():
            continue
        line = line.rstrip()

        # Detect section headers / suggestion triggers.
        low = _norm(line.strip())
        if any(t in low for t in triggers):
            capture_mode = True
            continue

        if not capture_mode:
            continue

        marker = marker_re.match(line)
        if marker is None:
            # Leaving the bullet list ends capture mode.
            capture_mode = False
            continue

        clean = strip_markdown(line[marker.end():].strip())

        # Remove leading "or ".
        if clean.lower().startswith("or "):
            clean = clean[3:].strip()

        if len(clean) <= 5:
            continue

        # Turn the item into a question, lower-casing its first letter so it
        # reads naturally after "Can you".
        q = "Can you " + clean[0].lower() + clean[1:]
        if not q.endswith("?"):
            q += "?"

        key = q.lower()
        if key not in seen:
            seen.add(key)
            suggestions.append(q)

    return suggestions

# ---------- Writers ----------

def write_outputs(pairs: List[Tuple[str, str]], output_dir: Path):
    """Write the suggestion index and one markdown file per Q&A pair.

    Creates *output_dir* (including missing parent directories) and writes,
    all as UTF-8:

    * ``questions_only.md`` - for each pair that yields suggestions, the
      user question followed by its suggestions.
    * ``Q001.md``, ``Q002.md``, ... - the question, the answer and, when
      present, the suggestions for each pair.

    Pairs are numbered from 1 in list order; a pair keeps its number in the
    index even when earlier pairs are omitted there.  Existing files with
    the same names are overwritten.

    Args:
        pairs: (user_text, assistant_text) tuples.
        output_dir: Directory to write into.
    """
    # parents=True: the run folder is nested below a base directory that may
    # not exist yet; without it mkdir raises FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract once per pair; both outputs below reuse the same result.
    per_pair = [extract_suggested_questions(answer) for _, answer in pairs]

    def suggestion_lines(number: int, suggestions: List[str]) -> List[str]:
        # Format suggestions as "- Q<pair>.<k> <text>" bullet lines.
        return [
            f"- Q{number}.{k} {text}\n"
            for k, text in enumerate(suggestions, 1)
        ]

    # questions_only.md
    q_only = output_dir / "questions_only.md"
    with q_only.open("w", encoding="utf-8") as f:
        f.write("# Suggested / Follow-up Questions Only\n\n")

        for i, ((q, _), suggestions) in enumerate(zip(pairs, per_pair), 1):
            if not suggestions:
                continue

            f.write(f"## Q{i}\n\n")
            f.write(q.strip() + "\n\n")
            f.writelines(suggestion_lines(i, suggestions))
            f.write("\n")

    # Individual QA files
    for i, ((q, a), suggestions) in enumerate(zip(pairs, per_pair), 1):
        out = output_dir / f"Q{i:03d}.md"
        with out.open("w", encoding="utf-8") as f:
            f.write(f"# Question {i}\n\n")
            f.write("## User Question\n\n")
            f.write(q.strip() + "\n\n")
            f.write("## Assistant Answer\n\n")
            f.write(a.strip() + "\n\n")

            if suggestions:
                f.write("## Suggested / Follow-up Questions\n\n")
                f.writelines(suggestion_lines(i, suggestions))


# ---------- Main ----------

def main():
    """Prompt for a transcript file and export its Q&A pairs.

    Reads a path from standard input (surrounding whitespace and double
    quotes are removed), decodes the file as UTF-8 while silently dropping
    undecodable bytes, splits it with split_conversation and hands the
    pairs to write_outputs.  Each run writes into its own
    ``output_<YYYYmmdd>_<HHMMSS>`` folder (local time) below the base
    directory fixed in the code.  Prints a message and returns early when
    the path is not a file or when no conversation blocks are found.
    """
    path = input("Enter path to conversation .md file: ").strip().strip('"')
    file_path = Path(path)

    # is_file() rather than exists(): an empty answer resolves to the
    # current directory, which exists but cannot be read as text.
    if not file_path.is_file():
        print("❌ File not found!")
        return

    text = file_path.read_text(encoding="utf-8", errors="ignore")

    pairs = split_conversation(text)

    if not pairs:
        print("⚠️ No conversation blocks detected. The format may be unknown.")
        return

    base_dir = Path(r"D:\Balaji-workbench\synthetic data")
    run_name = datetime.now().strftime("output_%Y%m%d_%H%M%S")
    output_dir = base_dir / run_name

    write_outputs(pairs, output_dir)

    print("✅ Done!")
    print(f"📄 Processed {len(pairs)} Q&A pairs")
    print(f"📁 Output folder: {output_dir}")


# Start the interactive export only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

✅ Done!
📄 Processed 9 Q&A pairs
📁 Output folder: D:\Balaji-workbench\synthetic data\output_20260209_005018
