In [1]:
# ==========================================================
# PRD to BDD JSON Converter (using OpenAI GPT)
# Author: Arjun M S
# Purpose: Automatically extract BDD scenarios (Given/When/Then)
#          from Product Requirements Documents using LLMs
# ==========================================================

In [1]:

!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m121.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml, python-docx
Successfully installed lxml-6.0.2 python-docx-1.2.0


In [2]:
import docx
import json
import re
# import openai
from pathlib import Path
from textwrap import shorten


In [3]:
!pip install sentence-transformers scikit-learn

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.6/486.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.1


In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

  * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or


In [5]:
!pip install google-genai



In [6]:
# Import Gemini / GenAI SDK
from google import genai
from google.genai import types

In [20]:
from google.colab import userdata
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY_2')


client = genai.Client(api_key=GEMINI_API_KEY)


In [21]:
# Read PRD File
def read_docx(file_path):
    """
    Extracts text from a .docx PRD file.

    Loops through every paragraph in the Word file (for p in doc.paragraphs)
    Takes the text of that paragraph (p.text)
    Removes any extra spaces at the start or end (.strip())
    Keeps only non-empty paragraphs (if p.text.strip())
    """
    doc = docx.Document(file_path)
    text = "\n".join([p.text.strip() for p in doc.paragraphs if p.text.strip()]) # join all kept paragraph texts with newline separators into one large string.
    return text

In [22]:
# Split Large Documents into Manageable Chunks
def chunk_text(text, max_length=4000):
    """Split long text into smaller chunks for API processing."""
    sentences = re.split(r'(?<=[.!?])\s+', text) # look behind for a punctuation mark like ., !, or ?
    chunks, chunk = [], ""
    for s in sentences:
        if len(chunk) + len(s) < max_length: # Add sentence to current chunk (if not too long)
            chunk += " " + s
        else: # If it would exceed the limit, save the chunk
            chunks.append(chunk.strip())
            chunk = s
    if chunk: # Add the last leftover chunk (After the loop, if there’s any text left unsaved, add it to the list.)
        chunks.append(chunk.strip())
    return chunks



In [23]:
# Uses Gemini to extract Given/When/Then scenarios from text chunk.
def extract_bdd_from_chunk(chunk):

    prompt = f"""
You are a software analyst. Convert the following PRD section into a structured JSON of BDD (Behavior Driven Development) scenarios.

Each scenario should be in the format:
{{
  "given": "...",
  "when": "...",
  "then": "..."
}}

If multiple features or behaviors exist, create multiple scenarios.
Keep the output strictly valid JSON (no commentary, no markdown).

Text:
{chunk}
    """

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=types.GenerateContentConfig(response_mime_type="application/json")
    )

    print("=== TEXT OUT RESPONSE ===")
    try:
        text_out = response.candidates[0].content.parts[0].text.strip()
    except Exception:
        text_out = response.text or ""
    print(text_out)
    print("====================")

    # Try to parse structured output
    parsed = getattr(response, "parsed", None)
    if parsed:
        return parsed

    # If parsed is empty, use text_out fallback
    if text_out:
        cleaned = text_out.strip().strip("```json").strip("```")
        print("=== CLEANED RESPONSE ===")
        print(cleaned)
        print("====================")
        print
        try:
            return json.loads(cleaned)
        except Exception as e:
            print("⚠️ JSON parse failed:", e)
            return {"error": "Invalid JSON", "raw_output": cleaned[:300]}
    else:
        return {"error": "Empty response"}



In [24]:
# Combine All Scenarios
def prd_to_bdd_json(file_path):
    text = read_docx(file_path)
    chunks = chunk_text(text)

    print(f"Processing {len(chunks)} chunks...")

    all_features = []
    for i, chunk in enumerate(chunks, start=1):
        print(f"🔹 Analyzing chunk {i}/{len(chunks)}...")
        result = extract_bdd_from_chunk(chunk)

        if result is None:
            print(f"⚠️ Chunk {i} returned None — skipping")
            continue

        # to handle or normalize different possible output formats from the LLM
        # LLMs (like Gemini or GPT) don’t always respond exactly the same way
        if isinstance(result, dict) and "features" in result: # if the result is a dictionary and features is a key in that dictionary
            all_features.extend(result["features"])
        elif isinstance(result, list): # if the result is a List
            all_features.extend(result)
        else:
            all_features.append(result)

    bdd_data = {"features": all_features}
    return bdd_data




In [25]:
# Run Conversion
file_path = "Scribl — Product Requirements Document (PRD).docx"
bdd_json = prd_to_bdd_json(file_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    "then": "A CSV file containing the displayed audit entries is downloaded"
  },
  {
    "given": "The Audit Viewer UI is displayed with a list of audit entries",
    "when": "The administrator selects an individual audit entry",
    "then": "A detail panel appears, showing metadata for the entry and a link to the associated entity (if applicable)"
  },
  {
    "given": "The system is configured with seat-counting policies",
    "when": "Team members are assigned roles",
    "then": "Roles such as OWNER, ADMIN, and EDITOR are counted towards seat usage"
  },
  {
    "given": "The system is configured with seat-counting policies",
    "when": "A team member is assigned the 'Viewer' role",
    "then": "The system allows flexibility to configure whether 'Viewer' roles consume seats or are free"
  },
  {
    "given": "A team has N available seats, and the current 'seatsUsed + pendingInvites + newInvites' is less than or equ

In [26]:
# def remove_duplicates(features, threshold=0.9):
#     """Remove semantically similar BDD scenarios using cosine similarity."""
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     texts = [
#         f"Given {f.get('given', '')} When {f.get('when', '')} Then {f.get('then', '')}"
#         for f in features
#     ]

#     embeddings = model.encode(texts)
#     sim_matrix = cosine_similarity(embeddings)

#     seen = set()
#     unique_indices = []

#     for i in range(len(features)):
#         if i in seen:
#             continue
#         for j in range(i + 1, len(features)):
#             if sim_matrix[i, j] > threshold:
#                 seen.add(j)
#         unique_indices.append(i)
#     print(f"\n\n🧹 Removed {len(features) - len(unique_indices)} duplicate scenarios.")
#     return [features[i] for i in unique_indices]


#=========== SAVE DUPLICATES TO A CSV for later inspection ============

def remove_duplicates(features, threshold=0.9, show_duplicates=True):
    """Remove semantically similar BDD scenarios using cosine similarity and show duplicates."""

    model = SentenceTransformer('all-MiniLM-L6-v2')

    texts = [
        f"Given {f.get('given', '')} When {f.get('when', '')} Then {f.get('then', '')}"
        for f in features
    ]

    embeddings = model.encode(texts)
    sim_matrix = cosine_similarity(embeddings)

    seen = set()
    unique_indices = []
    duplicates = []  # store tuples of (original_idx, duplicate_idx, similarity)

    for i in range(len(features)):
        if i in seen:
            continue
        for j in range(i + 1, len(features)):
            if sim_matrix[i, j] > threshold:
                seen.add(j)
                duplicates.append((i, j, sim_matrix[i, j]))
        unique_indices.append(i)

    removed_count = len(features) - len(unique_indices)
    print(f"\n🧹 Before cleanup: {len(features)} scenarios")
    print(f"🧹 Removed {removed_count} duplicate scenarios.")
    print(f"✅ After cleanup: {len(unique_indices)} scenarios\n")

    if show_duplicates and duplicates:
        print("🔍 Duplicate scenario pairs (showing top 10 by similarity):\n")
        # Sort duplicates by similarity descending
        duplicates = sorted(duplicates, key=lambda x: x[2], reverse=True)

        for i, (a, b, score) in enumerate(duplicates[:10]):
            print(f"\n🧩 Similarity: {score:.3f}")
            print(f"🅰️ Scenario A: {texts[a][:300]}")
            print(f"🅱️ Scenario B: {texts[b][:300]}")
            print("-" * 80)

    # Optional: save to CSV for later inspection
    if show_duplicates and duplicates:
        dup_data = [
            {"original_index": a, "duplicate_index": b, "similarity": score,
             "scenario_A": texts[a], "scenario_B": texts[b]}
            for a, b, score in duplicates
        ]
        pd.DataFrame(dup_data).to_csv("duplicates_report.csv", index=False, encoding="utf-8")
        print("\n📁 Detailed duplicate report saved → duplicates_report.csv")

    return [features[i] for i in unique_indices]


In [27]:
# 🔍 Remove near-duplicate scenarios

print(f"🧹 Before cleanup: {len(bdd_json['features'])} scenarios")

# --- Save before deduplication ---
raw_output_path = Path("bdd_output_gemini_raw.json")
with open(raw_output_path, "w", encoding="utf-8") as f:
    json.dump(bdd_json, f, indent=2, ensure_ascii=False)
print(f"\n📁 Saved original (before deduplication): {raw_output_path.resolve()}")

bdd_json["features"] = remove_duplicates(bdd_json["features"], threshold=0.9)



print(f"\n\n✅ After cleanup: {len(bdd_json['features'])} scenarios")

deduped_output_path = Path("bdd_output_gemini_duplicates_removed.json")
with open(deduped_output_path, "w", encoding="utf-8") as f:
    json.dump(bdd_json, f, indent=2, ensure_ascii=False)
print(f"\n📁 Saved cleaned (after deduplication): {deduped_output_path.resolve()}")

print(f"\n✅ BDD JSON created: {deduped_output_path.resolve()}")


🧹 Before cleanup: 1074 scenarios

📁 Saved original (before deduplication): /content/bdd_output_gemini_raw.json


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


🧹 Before cleanup: 1074 scenarios
🧹 Removed 41 duplicate scenarios.
✅ After cleanup: 1033 scenarios

🔍 Duplicate scenario pairs (showing top 10 by similarity):


🧩 Similarity: 0.988
🅰️ Scenario A: Given A 'ContentItem' has been created and is marked as not requiring approval When The 'ContentItem' is scheduled for publication to LinkedIn, and the system processes the scheduled publication Then A 'PublishJob' should be enqueued, a background worker should process the job by calling the LinkedI
🅱️ Scenario B: Given A 'ContentItem' has been created, marked as requiring approval, and has been successfully approved by an authorized user When The approved 'ContentItem' is scheduled for publication to LinkedIn, and the system processes the scheduled publication Then A 'PublishJob' should be enqueued, a backgr
--------------------------------------------------------------------------------

🧩 Similarity: 0.981
🅰️ Scenario A: Given A user's current scope has storage usage below 80% of its quota

In [28]:
# # Save Final JSON
# output_path = Path("bdd_output_gemini_duplicates_removed.json")
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(bdd_json, f, indent=2, ensure_ascii=False) # ensure_asci=False ; don’t convert non-English characters into escape codes. Keep them readable as they are.

# print(f"\n✅ BDD JSON created: {output_path.resolve()}")

# Outputting the Results

In [30]:
import json
from google.colab import files


file_name = "bdd_output_gemini_duplicates_removed.json"

# Read & parse JSON
with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pretty print the JSON (indentation)
print(json.dumps(data, indent=2, ensure_ascii=False))

{
  "features": [
    {
      "given": "A new user accesses the Scribl platform for the first time.",
      "when": "The user attempts to sign up or log in.",
      "then": "The user should be prompted to authenticate via LinkedIn OAuth.",
      "and": "A Scribl user record should be created using the retrieved LinkedIn profile data."
    },
    {
      "given": "A new individual user has successfully authenticated via LinkedIn.",
      "when": "The user starts the onboarding flow.",
      "then": "The user should be guided through a multi-step process.",
      "and": "Progress through the onboarding steps should be auto-saved to prevent drop-offs."
    },
    {
      "given": "A team admin user is setting up their team's Scribl account.",
      "when": "The admin selects the number of seats and a subscription plan.",
      "then": "A real-time calculator should display the pricing and features associated with the selections.",
      "and": "Upon successful payment, access to the platf

# SPLIT into different Categories


In [31]:
import os
import json
from pathlib import Path

# Load the generated master JSON
with open("bdd_output_gemini_duplicates_removed.json", "r", encoding="utf-8") as f:
    bdd_json = json.load(f)

# Create an output folder for the split files
output_dir = Path("bdd_output_split")
output_dir.mkdir(exist_ok=True)

# Define keywords to detect each domain
domain_map = {
    "auth": ["login", "signup", "password", "mfa", "authenticate", "logout"],
    "onboarding": ["onboard", "setup", "profile", "introduction"],
    "billing": ["payment", "invoice", "subscription", "refund", "billing", "checkout"],
    "content": ["post", "article", "draft", "generate", "ai", "editor", "caption"],
    "analytics": ["dashboard", "metrics", "insights", "report", "tracking"],
    "team": ["team", "workspace", "member", "invite"],
    "admin": ["admin", "role", "permission", "configuration", "superadmin"],
    "notifications": ["notification", "email", "message", "alert"],
    "workflow": ["workflow", "approval", "task", "automation"],
    "integration": ["api", "webhook", "integration", "connector"],
    "security": ["compliance", "encryption", "access", "tls", "policy"]
}

def detect_domain(scenario):
    """
    Detect which domain a scenario belongs to based on keyword matching.
    """
    text = (
        scenario.get("given", "") + " " +
        scenario.get("when", "") + " " +
        scenario.get("then", "")
    ).lower()

    # Less pythonic, more explicit
    for domain, keywords in domain_map.items():
        for keyword in keywords:
            if keyword in text:
                return domain

    return "misc"  # fallback if no match

# Keep a counter of how many scenarios per domain
domain_counts = {}

# Split and save scenarios by domain
for scenario in bdd_json.get("features", []):
    domain = detect_domain(scenario)
    file_path = output_dir / f"{domain}.json"

    # Load existing data if file already exists
    if file_path.exists():
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {"features": []}

    # Add scenario and save back
    data["features"].append(scenario)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    # Count how many scenarios per domain
    if domain in domain_counts:
        domain_counts[domain] += 1
    else:
        domain_counts[domain] = 1

# === Summary Printout ===
print("\n✅ BDD scenarios split into domain-based JSON files inside /bdd_output_split/\n")
print("📊 Summary by category:\n")
for domain, count in sorted(domain_counts.items()):
    print(f"  • {domain:<15} → {count} scenarios")
print(f"\n📁 Total categories: {len(domain_counts)}")
print(f"🧩 Total scenarios:  {sum(domain_counts.values())}")



✅ BDD scenarios split into domain-based JSON files inside /bdd_output_split/

📊 Summary by category:

  • admin           → 37 scenarios
  • analytics       → 23 scenarios
  • auth            → 95 scenarios
  • billing         → 138 scenarios
  • content         → 342 scenarios
  • integration     → 17 scenarios
  • misc            → 127 scenarios
  • notifications   → 19 scenarios
  • onboarding      → 37 scenarios
  • security        → 31 scenarios
  • team            → 141 scenarios
  • workflow        → 26 scenarios

📁 Total categories: 12
🧩 Total scenarios:  1033




---

---



---
---





# Inferences

## Option 1: Split After Generation (Keyword-Based)

**How it works:**

* Generate all BDDs into one JSON file.
* Use keyword matching (e.g., `"login" → auth`, `"payment" → billing`) to split into logical JSON files.

**Pros:**

* Fast and simple implementation
* No extra API calls (cost-efficient)
* Deterministic output (no randomness)
* Easy to debug and adjust
* Low latency, good for bulk PRD processing

**Cons:**

* Requires manual keyword maintenance
* May misclassify complex sentences
* No real semantic understanding

<br>

---

<br>

## Option 2: Modify JSON and Ask LLM to Classify (LLM-Assisted)

**How it works:**

* Ask the LLM to include a `"domain"` field along with each scenario (`Given`, `When`, `Then`).

**Example output:**

```json
{
  "given": "A user has valid credentials",
  "when": "They attempt to log in",
  "then": "The system authenticates them",
  "domain": "auth"
}
```

**Pros:**

* Understands semantic meaning beyond keywords
* Adapts as domains evolve
* Simplifies post-processing (group by domain directly)

**Cons:**

* Higher API token cost
* Slightly slower generation
* May produce inconsistent domain labels
* May fill the context window faster when we explicitly mention the domains in the prompt

<br>

---

<br>

## Which Approach Is Better?

**If prototyping or building early pipeline:**

* Keyword-based splitting is better
* Easier to tune and debug
* Fast, predictable, and cheap
* “Domain” field can be added later

**If optimizing for production automation:**

* LLM-based tagging is better
* More flexible and semantically accurate
* Scales across complex or ambiguous PRDs

<br>

---

<br>

## Hybrid Approach

Use both approaches together:

```python
if "domain" in scenario and scenario["domain"]:
    domain = normalize_domain(scenario["domain"])
else:
    domain = detect_domain_using_keywords(scenario)
```

* Combines LLM’s semantic power with keyword fallback
* Balances accuracy, cost, and stability

<br>

---

<br>


## Summary

* **For rapid prototyping:** use keyword-based post-split
* **For production-level accuracy:** use LLM-generated `domain`
* **Best overall:** hybrid approach (LLM + keyword fallback)

