In [1]:
# ==========================================================
# PRD to BDD JSON Converter (using OpenAI GPT)
# Author: Arjun M S
# Purpose: Automatically extract BDD scenarios (Given/When/Then)
#          from Product Requirements Documents using LLMs
# ==========================================================

In [2]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m119.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml, python-docx
Successfully installed lxml-6.0.2 python-docx-1.2.0


In [4]:
# !pip

In [5]:
import docx
import json
import re
# import openai
from pathlib import Path
from textwrap import shorten


In [6]:
!pip install google-genai



In [7]:
# Import Gemini / GenAI SDK
from google import genai
from google.genai import types

In [8]:

# Configure Your API Key
# openai.api_key = ""

client = genai.Client(api_key="AIzaSyCCpmOd-IzdQUXZdPrAdj0c943egv4ayrg")


# Read PRD File
def read_docx(file_path):
    """
    Extracts text from a .docx PRD file.

    Loops through every paragraph in the Word file (for p in doc.paragraphs)
    Takes the text of that paragraph (p.text)
    Removes any extra spaces at the start or end (.strip())
    Keeps only non-empty paragraphs (if p.text.strip())
    """
    doc = docx.Document(file_path)
    text = "\n".join([p.text.strip() for p in doc.paragraphs if p.text.strip()])
    return text


In [9]:
# Split Large Documents into Manageable Chunks
def chunk_text(text, max_length=4000):
    """Split long text into smaller chunks for API processing."""
    sentences = re.split(r'(?<=[.!?])\s+', text) # look behind for a punctuation mark like ., !, or ?
    chunks, chunk = [], ""
    for s in sentences:
        if len(chunk) + len(s) < max_length: # Add sentence to current chunk (if not too long)
            chunk += " " + s
        else: # If it would exceed the limit, save the chunk
            chunks.append(chunk.strip())
            chunk = s
    if chunk: # Add the last leftover chunk (After the loop, if there’s any text left unsaved, add it to the list.)
        chunks.append(chunk.strip())
    return chunks



In [29]:
# Uses Gemini to extract Given/When/Then scenarios from text chunk.
def extract_bdd_from_chunk(chunk):

    prompt = f"""
You are a software analyst. Convert the following PRD section into a structured JSON of BDD (Behavior Driven Development) scenarios.

Each scenario should be in the format:
{{
  "given": "...",
  "when": "...",
  "then": "..."
}}

If multiple features or behaviors exist, create multiple scenarios.
Keep the output strictly valid JSON (no commentary, no markdown).

Text:
{chunk}
    """

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=types.GenerateContentConfig(response_mime_type="application/json")
    )

    print("=== TEXT OUT RESPONSE ===")
    try:
        text_out = response.candidates[0].content.parts[0].text.strip()
    except Exception:
        text_out = response.text or ""
    print(text_out)
    print("====================")

    # Try to parse structured output
    parsed = getattr(response, "parsed", None)
    if parsed:
        return parsed

    # If parsed is empty, use text_out fallback
    if text_out:
        cleaned = text_out.strip().strip("```json").strip("```")
        print("=== CLEANED RESPONSE ===")
        print(cleaned)
        print("====================")
        print
        try:
            return json.loads(cleaned)
        except Exception as e:
            print("⚠️ JSON parse failed:", e)
            return {"error": "Invalid JSON", "raw_output": cleaned[:300]}
    else:
        return {"error": "Empty response"}



In [30]:
# Combine All Scenarios
def prd_to_bdd_json(file_path):
    text = read_docx(file_path)
    chunks = chunk_text(text)

    print(f"Processing {len(chunks)} chunks...")

    all_features = []
    for i, chunk in enumerate(chunks, start=1):
        print(f"🔹 Analyzing chunk {i}/{len(chunks)}...")
        result = extract_bdd_from_chunk(chunk)

        if result is None:
            print(f"⚠️ Chunk {i} returned None — skipping")
            continue

        # to handle or normalize different possible output formats from the LLM
        # LLMs (like Gemini or GPT) don’t always respond exactly the same way
        if isinstance(result, dict) and "features" in result: # if the result is a dictionary and features is a key in that dictionary
            all_features.extend(result["features"])
        elif isinstance(result, list): # if the result is a List
            all_features.extend(result)
        else:
            all_features.append(result)

    bdd_data = {"features": all_features}
    return bdd_data




In [31]:
# Run Conversion
file_path = "Scribl — Product Requirements Document (PRD).docx"
bdd_json = prd_to_bdd_json(file_path)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  {
    "given": "A team has a maximum 'seatCount' according to its plan",
    "when": "An invited user accepts the invitation to join the team, and accepting would exceed the 'seatCount'",
    "then": "The system should prevent the user from joining and inform them that the team has reached its seat limit"
  },
  {
    "given": "A team has a maximum 'seatCount' according to its plan",
    "when": "An inactive user within the team is activated, and activating would exceed the 'seatCount'",
    "then": "The system should prevent the user from being activated and inform the team that the seat limit has been reached"
  },
  {
    "given": "A user performs a key business action (e.g., creating content, inviting a member, changing settings)",
    "when": "The action is successfully completed",
    "then": "An 'AuditLog' entry should be created, containing the actor, timestamp, action, and relevant metadata (including 'teamId' 

In [32]:
# Save Final JSON
output_path = Path("bdd_output_gemini.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(bdd_json, f, indent=2, ensure_ascii=False) # ensure_asci=False ; don’t convert non-English characters into escape codes. Keep them readable as they are.

print(f"\n✅ BDD JSON created: {output_path.resolve()}")


✅ BDD JSON created: /content/bdd_output_gemini.json


# Outputting the Results

In [33]:
import json
from google.colab import files

# Suppose your file name is "bdd_output_gemini.json"
file_name = "bdd_output_gemini.json"

# Read & parse JSON
with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pretty print the JSON (indentation)
print(json.dumps(data, indent=2, ensure_ascii=False))

{
  "features": [
    {
      "given": "A new user wants to sign up for Scribl",
      "when": "The user attempts to authenticate their account",
      "then": "The user is redirected to LinkedIn OAuth for authentication",
      "and": "A Scribl user record is created or updated based on the LinkedIn profile"
    },
    {
      "given": "A newly authenticated user is starting their first Scribl session",
      "when": "The user proceeds through the onboarding flow",
      "then": "The user is presented with a guided, multi-step process",
      "and": "The system automatically saves progress throughout the onboarding flow to prevent drop-offs"
    },
    {
      "given": "A 'Marketing Manager (Team Admin)' persona is managing team subscriptions",
      "when": "The admin selects the number of seats and a subscription plan for their team",
      "then": "A real-time calculator displays the pricing and associated features for the selected plan",
      "and": "Access to Scribl is provision

# SPLIT into different Categories


In [3]:
import os
import json
from pathlib import Path

# Load the generated master JSON
with open("bdd_output_gemini.json", "r", encoding="utf-8") as f:
    bdd_json = json.load(f)

# Create an output folder for the split files
output_dir = Path("bdd_output_split")
output_dir.mkdir(exist_ok=True)

# Define keywords to detect each domain
domain_map = {
    "auth": ["login", "signup", "password", "mfa", "authenticate", "logout"],
    "onboarding": ["onboard", "setup", "profile", "introduction"],
    "billing": ["payment", "invoice", "subscription", "refund", "billing", "checkout"],
    "content": ["post", "article", "draft", "generate", "ai", "editor", "caption"],
    "analytics": ["dashboard", "metrics", "insights", "report", "tracking"],
    "team": ["team", "workspace", "member", "invite"],
    "admin": ["admin", "role", "permission", "configuration", "superadmin"],
    "notifications": ["notification", "email", "message", "alert"],
    "workflow": ["workflow", "approval", "task", "automation"],
    "integration": ["api", "webhook", "integration", "connector"],
    "security": ["compliance", "encryption", "access", "tls", "policy"]
}

def detect_domain(scenario):
    """
    Detect which domain a scenario belongs to based on keyword matching.
    """
    text = (
        scenario.get("given", "") + " " +
        scenario.get("when", "") + " " +
        scenario.get("then", "")
    ).lower()

    # Less pythonic, more explicit
    for domain, keywords in domain_map.items():
        for keyword in keywords:
            if keyword in text:
                return domain

    return "misc"  # fallback if no match

# Keep a counter of how many scenarios per domain
domain_counts = {}

# Split and save scenarios by domain
for scenario in bdd_json.get("features", []):
    domain = detect_domain(scenario)
    file_path = output_dir / f"{domain}.json"

    # Load existing data if file already exists
    if file_path.exists():
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {"features": []}

    # Add scenario and save back
    data["features"].append(scenario)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    # Count how many scenarios per domain
    if domain in domain_counts:
        domain_counts[domain] += 1
    else:
        domain_counts[domain] = 1

# === Summary Printout ===
print("\n✅ BDD scenarios split into domain-based JSON files inside /bdd_output_split/\n")
print("📊 Summary by category:\n")
for domain, count in sorted(domain_counts.items()):
    print(f"  • {domain:<15} → {count} scenarios")
print(f"\n📁 Total categories: {len(domain_counts)}")
print(f"🧩 Total scenarios:  {sum(domain_counts.values())}")



✅ BDD scenarios split into domain-based JSON files inside /bdd_output_split/

📊 Summary by category:

  • admin           → 50 scenarios
  • analytics       → 31 scenarios
  • auth            → 130 scenarios
  • billing         → 163 scenarios
  • content         → 362 scenarios
  • integration     → 20 scenarios
  • misc            → 158 scenarios
  • notifications   → 19 scenarios
  • onboarding      → 52 scenarios
  • security        → 21 scenarios
  • team            → 165 scenarios
  • workflow        → 20 scenarios

📁 Total categories: 12
🧩 Total scenarios:  1191




# Inferences

## Option 1: Split After Generation (Keyword-Based)

**How it works:**

* Generate all BDDs into one JSON file.
* Use keyword matching (e.g., `"login" → auth`, `"payment" → billing`) to split into logical JSON files.

**Pros:**

* Fast and simple implementation
* No extra API calls (cost-efficient)
* Deterministic output (no randomness)
* Easy to debug and adjust
* Low latency, good for bulk PRD processing

**Cons:**

* Requires manual keyword maintenance
* May misclassify complex sentences
* No real semantic understanding

<br>

---

<br>

## Option 2: Modify JSON and Ask LLM to Classify (LLM-Assisted)

**How it works:**

* Ask the LLM to include a `"domain"` field along with each scenario (`Given`, `When`, `Then`).

**Example output:**

```json
{
  "given": "A user has valid credentials",
  "when": "They attempt to log in",
  "then": "The system authenticates them",
  "domain": "auth"
}
```

**Pros:**

* Understands semantic meaning beyond keywords
* Adapts as domains evolve
* Simplifies post-processing (group by domain directly)

**Cons:**

* Higher API token cost
* Slightly slower generation
* May produce inconsistent domain labels
* May fill the context window faster when we explicitly mention the domains in the prompt

<br>

---

<br>

## Which Approach Is Better?

**If prototyping or building early pipeline:**

* Keyword-based splitting is better
* Easier to tune and debug
* Fast, predictable, and cheap
* “Domain” field can be added later

**If optimizing for production automation:**

* LLM-based tagging is better
* More flexible and semantically accurate
* Scales across complex or ambiguous PRDs

<br>

---

<br>

## Hybrid Approach

Use both approaches together:

```python
if "domain" in scenario and scenario["domain"]:
    domain = normalize_domain(scenario["domain"])
else:
    domain = detect_domain_using_keywords(scenario)
```

* Combines LLM’s semantic power with keyword fallback
* Balances accuracy, cost, and stability

<br>

---

<br>


## Summary

* **For rapid prototyping:** use keyword-based post-split
* **For production-level accuracy:** use LLM-generated `domain`
* **Best overall:** hybrid approach (LLM + keyword fallback)

