In [None]:
# ==========================================================
# PRD to BDD JSON Converter (using Google Gemini)
# Author: Arjun M S
# Purpose: Automatically extract BDD scenarios (Given/When/Then)
#          from Product Requirements Documents using LLMs
# ==========================================================

In [None]:
!pip install sentence-transformers scikit-learn
!pip install google-genai python-docx PyPDF2

In [None]:
import docx
import json
import re
# import openai
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Import Gemini / GenAI SDK
from google import genai
from google.genai import types


# Read the Gemini API key from Colab's secret store rather than hard-coding it.
from google.colab import userdata
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')


# Shared Gemini client; extract_bdd_from_chunk() below calls it for every chunk.
client = genai.Client(api_key=GEMINI_API_KEY)


import PyPDF2
from pathlib import Path
from textwrap import shorten

In [None]:
# Read PRD File
def read_document(file_path):
    """
    Return the plain text of a PRD stored as a .docx or .pdf file.

    For .docx input, the stripped text of every non-blank paragraph is joined
    with newlines. For .pdf input, the extracted text of each page is emitted
    followed by a newline. Any other extension raises ValueError.
    """
    suffix = Path(file_path).suffix.lower()

    # Reject unsupported inputs up front so the two readers below stay flat.
    if suffix not in (".docx", ".pdf"):
        raise ValueError(f"Unsupported file type: {suffix}. Please provide a .docx or .pdf file.")

    if suffix == ".docx":
        stripped = (para.text.strip() for para in docx.Document(file_path).paragraphs)
        return "\n".join(line for line in stripped if line)

    # PDF: the file must stay open while the pages are being read.
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() + "\n" for page in reader.pages)

In [None]:
# Split Large Documents into Manageable Chunks (using simple character count)
def chunk_text(text, max_length=4000):
    """
    Split long text into smaller chunks for API processing.

    The text is cut after sentence-ending punctuation (., ! or ?) and the
    sentences are packed greedily into chunks of roughly `max_length`
    characters. A single sentence longer than `max_length` is not split; it
    becomes a chunk of its own.

    Args:
        text: Full document text.
        max_length: Approximate character budget per chunk.

    Returns:
        A list of non-empty, stripped chunk strings ([] for blank input).
    """
    # Look behind for ., ! or ? so the punctuation stays with its sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) < max_length:
            # Still fits: append to the chunk being built.
            current += " " + sentence
        else:
            # Flush the chunk being built, but never emit an empty one (this
            # happens when the very first sentence is already over budget).
            if current.strip():
                chunks.append(current.strip())
            current = sentence
    # Keep the trailing chunk only if it carries real text.
    if current.strip():
        chunks.append(current.strip())
    return chunks



In [None]:
# Uses Gemini to extract Given/When/Then scenarios from text chunk.
def extract_bdd_from_chunk(chunk, verbose=False):
    """
    Ask Gemini to convert one PRD text chunk into BDD (Given/When/Then) scenarios.

    Args:
        chunk: PRD text to analyse.
        verbose: When True, print the cleaned raw model output before parsing.
            Off by default so a large PRD does not flood the notebook output.

    Returns:
        The parsed JSON value produced by the model (usually a list of
        scenario dicts, or a dict), an empty list when `chunk` is blank, or a
        dict with an "error" key when the response is empty or is not valid
        JSON.
    """
    # Nothing to analyse: do not spend a model call on an empty prompt.
    if not chunk or not chunk.strip():
        return []

    prompt = f"""
You are a software analyst. Convert the following PRD section into a structured JSON of BDD (Behavior Driven Development) scenarios.

Each scenario should be in the format:
{{
  "given": "...",
  "when": "...",
  "then": "..."
}}

If multiple features or behaviors exist, create multiple scenarios.
Keep the output strictly valid JSON (no commentary, no markdown).

Text:
{chunk}
    """

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=types.GenerateContentConfig(response_mime_type="application/json")
    )

    # Prefer the first candidate's first part; if that path is missing or
    # empty, fall back to the response's aggregate text.
    try:
        text_out = response.candidates[0].content.parts[0].text.strip()
    except (AttributeError, IndexError, TypeError):
        text_out = response.text or ""

    # Use the SDK's structured output when it is populated.
    parsed = getattr(response, "parsed", None)
    if parsed:
        return parsed

    if not text_out:
        return {"error": "Empty response"}

    # Remove a leading ```json / ``` fence and a trailing ``` fence.
    # str.strip("```json") would instead strip any of the characters
    # ` j s o n from both ends, which can eat real content.
    cleaned = re.sub(r"^\s*```(?:json)?\s*|\s*```\s*$", "", text_out)

    if verbose:
        print("=== CLEANED RESPONSE ===")
        print(cleaned)
        print("====================")
        print()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        print("⚠️ JSON parse failed:", e)
        return {"error": "Invalid JSON", "raw_output": cleaned[:300]}



In [None]:
# Combine All Scenarios
def prd_to_bdd_json(file_path):
    """
    Convert a PRD file into a single dict of BDD scenarios.

    The document is read, split into chunks, and each chunk is sent to
    extract_bdd_from_chunk(). The per-chunk results are normalised and merged.

    Args:
        file_path: Path to the PRD (.docx or .pdf).

    Returns:
        {"features": [scenario_dict, ...]} containing only dict entries.
        Chunks that returned None or an {"error": ...} payload are skipped
        with a warning instead of being stored as scenarios.
    """
    text = read_document(file_path)
    chunks = chunk_text(text)

    print(f"Processing {len(chunks)} chunks...")

    all_features = []
    for i, chunk in enumerate(chunks, start=1):
        print(f"🔹 Analyzing chunk {i}/{len(chunks)}...")
        result = extract_bdd_from_chunk(chunk)

        if result is None:
            print(f"⚠️ Chunk {i} returned None — skipping")
            continue

        # Error payloads describe a failed chunk; they are not scenarios.
        if isinstance(result, dict) and "error" in result:
            print(f"⚠️ Chunk {i} failed ({result['error']}) — skipping")
            continue

        # The model does not always answer in the same shape: accept a bare
        # list, a dict wrapping a list under "features"/"scenarios", or a
        # single scenario dict.
        if isinstance(result, list):
            candidates = result
        elif isinstance(result, dict):
            wrapped = next(
                (result[key] for key in ("features", "scenarios")
                 if isinstance(result.get(key), list)),
                None,
            )
            candidates = [result] if wrapped is None else wrapped
        else:
            candidates = [result]

        # Later cells call .get() on every feature, so keep dicts only.
        scenarios = [item for item in candidates if isinstance(item, dict)]
        if len(scenarios) != len(candidates):
            print(f"⚠️ Chunk {i}: ignored {len(candidates) - len(scenarios)} non-scenario item(s)")
        all_features.extend(scenarios)

    bdd_data = {"features": all_features}
    return bdd_data




In [None]:
# Run Conversion
# PRD to convert; read_document() accepts .docx and .pdf files only.
file_path = "Scribl — Product Requirements Document (PRD).docx"
# Alternative input, kept for quick switching:
# file_path = "prd2_macrohard.pdf"

# Runs the full read -> chunk -> extract pipeline; the result has the form {"features": [...]}.
bdd_json = prd_to_bdd_json(file_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  },
  {
    "given": "A clean or partially initialized PostgreSQL database.",
    "when": "The database schema initialization script is executed.",
    "then": "The 'plan_tier' ENUM type exists and contains the values 'NORMAL', 'PRO', 'ENTERPRISE', ensuring idempotency."
  },
  {
    "given": "A clean or partially initialized PostgreSQL database.",
    "when": "The database schema initialization script is executed.",
    "then": "The 'invoice_status' ENUM type exists and contains the values 'PENDING', 'PAID', 'FAILED', 'VOID', ensuring idempotency."
  },
  {
    "given": "A clean or partially initialized PostgreSQL database.",
    "when": "The database schema initialization script is executed.",
    "then": "The 'notification_channel' ENUM type exists and contains the values 'IN_APP', 'EMAIL', 'SLACK', 'TEAMS', ensuring idempotency."
  },
  {
    "given": "A clean or partially initialized PostgreSQL database.",
    "when

In [None]:
# Preview only the first few scenarios: printing the whole dict floods the
# notebook output once a PRD yields hundreds of scenarios.
print(json.dumps(bdd_json["features"][:3], indent=2, ensure_ascii=False))
# print()
# print(type(bdd_json))
# print()
print(f"before deduplication bdd_json:=  {len(bdd_json['features'])}")


<class 'dict'>

before deduplication bdd_json:=  1206


# Pass `bdd_json` to an LLM again, with a predefined prompt, to detect and remove duplicates (LLM approach) 👇

In [None]:
# ------------------------------------------------------
# Use LLM to check for duplicates and clean
# ------------------------------------------------------

# import json
# from google import genai
# from google.genai import types

# def clean_duplicates_with_llm(bdd_json_data, client):
#     """
#     Uses an LLM to identify and remove duplicate BDD scenarios.
#     """
#     prompt = f"""
# You are a software analyst tasked with reviewing BDD scenarios.
# The following JSON contains a list of BDD scenarios. Your goal is to identify and remove any scenarios that are semantically duplicate or very similar to others in the list.
# Consider scenarios duplicates if they describe the same behavior or requirement, even if worded slightly differently.
# Keep only one instance of each unique scenario.
# Maintain the original structure of the JSON, returning a list of unique scenarios under the "features" key.

# Input JSON:
# {json.dumps(bdd_json_data, indent=2, ensure_ascii=False)}

# Output JSON (cleaned, with duplicates removed):
# """

#     response = client.models.generate_content(
#         model="gemini-2.5-flash", # Or another suitable model
#         contents=prompt,
#         config=types.GenerateContentConfig(response_mime_type="application/json")
#     )

#     try:
#         cleaned_json_string = response.candidates[0].content.parts[0].text.strip()
#         # Clean up potential markdown formatting
#         cleaned_json_string = cleaned_json_string.strip("```json").strip("```")
#         cleaned_data = json.loads(cleaned_json_string)
#         return cleaned_data
#     except Exception as e:
#         print(f"⚠️ LLM cleanup failed: {e}")
#         # Fallback to returning original data or handle error appropriately
#         return bdd_json_data

# # Example usage (assuming 'client' is already defined from previous cells):
# cleaned_bdd_json = clean_duplicates_with_llm(bdd_json, client)
# print(f"\n✅ Scenarios after LLM cleanup: {len(cleaned_bdd_json['features'])}")

⚠️ LLM cleanup failed: Unterminated string starting at: line 224 column 16 (char 10881)

✅ Scenarios after LLM cleanup: 1206


# ------------- NEW CODE STARTS HERE ⬇️------------------------------

In [None]:
# ======================================================
# 🔍 BDD Step Semantic Similarity + NLI Relationship Check
# ======================================================

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import itertools
import torch



In [None]:
# ------------------------------------------------------
# 1. Extract BDD step text from your structure
# ------------------------------------------------------
bdd_features = bdd_json.get("features", [])

# Flatten all Given/When/Then steps into simple text lines so that individual
# clauses (rather than whole scenarios) can be compared with each other.
bdd_steps = []
for feature in bdd_features:
    # Entries that are not dicts have no Given/When/Then fields to read.
    if not isinstance(feature, dict):
        continue
    for key in ["given", "when", "then"]:
        text = feature.get(key)
        if text:  # skip missing or empty clauses
            bdd_steps.append(f"{key.capitalize()}: {text}")

# combine into one sentence (less accurate)  (maybe due to weights of given when then)
# bdd_steps = []
# for feature in bdd_features:
#     step_text = f"Given {feature.get('given', '')} When {feature.get('when', '')} Then {feature.get('then', '')}".strip()
#     bdd_steps.append(step_text)

print(f"✅ Extracted {len(bdd_steps)} total BDD steps for comparison.\n")



✅ Extracted 2217 total BDD steps for comparison.



In [None]:
# Preview a handful of scenarios instead of dumping the entire list.
print(bdd_features[:5])



In [None]:
# Preview the first few flattened steps instead of dumping the entire list.
print(bdd_steps[:10])



In [None]:
# ------------------------------------------------------
# 2. Load models (with GPU support if available)
# ------------------------------------------------------
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}\n")

# Sentence-embedding model used for the cosine-similarity comparison below.
embedder = SentenceTransformer('all-mpnet-base-v2', device=device) # For generating semantic embeddings
# The pipeline is given a device index: 0 when CUDA was selected above, -1 otherwise.
nli_model = pipeline("text-classification", model="roberta-large-mnli", device=0 if device=='cuda' else -1) # For Natural Language Inference  (classifies as entailment/contradiction/neutral)



Using device: cpu



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# ------------------------------------------------------
# 3. Generate all unique pairs of BDD steps
# This is O(n²) - could be very slow for large documents
# ------------------------------------------------------
# Materialises every unordered pair of step strings in memory as a list of
# tuples; for n steps that is n*(n-1)/2 entries.
pairs = list(itertools.combinations(bdd_steps, 2))
print(f"Total pairs to compare: {len(pairs)}")



Total pairs to compare: 2456436


In [None]:
from tqdm.notebook import tqdm

# ------------------------------------------------------
# 4. Optimized: Compute similarity and NLI relationship
#
# Compares every unordered pair of BDD steps by cosine similarity of their
# sentence embeddings. When the similarity is high enough, a Natural Language
# Inference (NLI) model classifies the relationship (entailment,
# contradiction, neutral). One record per pair is stored in `results`
# (similarity score, NLI label, confidence, decision).
# ------------------------------------------------------

NLI_SIM_THRESHOLD = 0.6        # run the expensive NLI model only above this similarity
DUPLICATE_SIM_THRESHOLD = 0.8  # above this, the NLI label decides the verdict

# Cache embeddings once
print("Encoding all steps once for efficiency...")
embeddings = embedder.encode(bdd_steps, convert_to_tensor=True)
print("✅ Embeddings ready.\n")

results = []

# Precompute all cosine similarities
cosine_matrix = util.pytorch_cos_sim(embeddings, embeddings)
# Copy the matrix to host memory once so the loop does plain array lookups
# instead of one tensor .item() call per pair.
sim_lookup = cosine_matrix.cpu().numpy()

# Iterate over index pairs directly. Looking each string up again with
# bdd_steps.index() costs a linear scan per lookup, which dominates the run
# time for millions of pairs. The pair order is the same as
# itertools.combinations(bdd_steps, 2).
n_steps = len(bdd_steps)
total_pairs = n_steps * (n_steps - 1) // 2
index_pairs = itertools.combinations(range(n_steps), 2)

for idx1, idx2 in tqdm(index_pairs, total=total_pairs, desc="Comparing pairs"):
    s1, s2 = bdd_steps[idx1], bdd_steps[idx2]
    sim_score = float(sim_lookup[idx1, idx2])

    # Only run NLI if sentences are somewhat similar (saves time)
    if sim_score > NLI_SIM_THRESHOLD:
        # NOTE(review): the pair is passed as one string joined by " </s> ".
        # The text-classification pipeline also accepts
        # {"text": ..., "text_pair": ...} — confirm which encoding is intended.
        nli_input = s1 + " </s> " + s2
        nli_result = nli_model(nli_input)[0]
        label, conf = nli_result['label'], nli_result['score']
    else:
        label, conf = "NEUTRAL", 1.0  # Skip unnecessary NLI calls

    # Decision logic
    if sim_score > DUPLICATE_SIM_THRESHOLD:
        if label == 'ENTAILMENT':
            decision = "✅ Likely Duplicate"  # high similarity + entailment = duplicate
        elif label == 'CONTRADICTION':
            decision = "❌ Contradictory"  # high similarity + contradiction = conflicting requirements
        else:
            decision = "⚠️ Similar but Unclear"  # high similarity + neutral = needs review
    else:
        decision = "❌ Not Similar"  # low similarity = different scenarios

    results.append({
        'Step 1': s1,
        'Step 2': s2,
        'Similarity': round(sim_score, 4),
        'NLI Label': label,
        'Confidence': round(conf * 100, 2),
        'Decision': decision
    })


Encoding all steps once for efficiency...
✅ Embeddings ready.



Comparing pairs:   0%|          | 0/2456436 [00:00<?, ?it/s]

In [None]:
# ------------------------------------------------------
# 5. Create and display results table
# ------------------------------------------------------
# One row per compared pair, built from the list of result dicts in one go.
df_results = pd.DataFrame(results)
# Changes the pandas display option for the rest of the session, not just this cell.
pd.set_option('display.max_colwidth', None)  # so full steps are visible
display(df_results)



Unnamed: 0,Step 1,Step 2,Similarity,NLI Label,Confidence,Decision
0,Given: A user is not authenticated with Scribl,When: The user attempts to sign in via LinkedIn OAuth,0.3181,NEUTRAL,100.00,❌ Not Similar
1,Given: A user is not authenticated with Scribl,"Then: The system retrieves the user's LinkedIn profile data (name, title, connections, profile picture), and the system creates a Scribl user record",0.5064,NEUTRAL,100.00,❌ Not Similar
2,Given: A user is not authenticated with Scribl,Given: A new user has successfully authenticated with Scribl,0.8241,CONTRADICTION,99.25,❌ Contradictory
3,Given: A user is not authenticated with Scribl,"When: The user starts the multi-step onboarding flow and provides goals, brand voice, posting frequency, and niche selections",0.0841,NEUTRAL,100.00,❌ Not Similar
4,Given: A user is not authenticated with Scribl,"Then: The system saves the user's progress automatically, and the user's preferences are configured",0.0481,NEUTRAL,100.00,❌ Not Similar
...,...,...,...,...,...,...
2456431,"Then: A complete audit trail of who approved/rejected, when, and any associated comments should be maintained",When: Conditional approvals are configured,0.3769,NEUTRAL,100.00,❌ Not Similar
2456432,"Then: A complete audit trail of who approved/rejected, when, and any associated comments should be maintained","Then: The workflow should adjust its path or require additional approvals based on specific criteria within the request (e.g., amount, department)",0.4269,NEUTRAL,100.00,❌ Not Similar
2456433,Given: An approval workflow involves dynamic decision-making,When: Conditional approvals are configured,0.4849,NEUTRAL,100.00,❌ Not Similar
2456434,Given: An approval workflow involves dynamic decision-making,"Then: The workflow should adjust its path or require additional approvals based on specific criteria within the request (e.g., amount, department)",0.5097,NEUTRAL,100.00,❌ Not Similar


In [None]:
# Calculate and print the requested metrics
total_count = len(bdd_steps)  # Total number of individual steps

# Count each decision once instead of filtering the frame per label.
decision_counts = df_results['Decision'].value_counts()
duplicate_count = int(decision_counts.get('✅ Likely Duplicate', 0))      # pairs marked as likely duplicates
contradicting_count = int(decision_counts.get('❌ Contradictory', 0))     # pairs marked as contradictory

# Number of scenarios currently held in bdd_json. remove_duplicates() is
# defined and applied in a later cell, so when the notebook is run top to
# bottom this is still the count BEFORE deduplication.
scenarios_after_cleanup = len(bdd_json['features'])


print("\n--- Summary of BDD Step Analysis ---")
print(f"Total individual steps analyzed: {total_count}")
print(f"Pairs identified as Likely Duplicates: {duplicate_count}")
print(f"Pairs identified as Contradictory: {contradicting_count}")
print(f"Scenarios currently in bdd_json (deduplicated only if the remove_duplicates cell has already been run): {scenarios_after_cleanup}")
print("------------------------------------")

Unnamed: 0,Step 1,Step 2,Similarity,NLI Label,Confidence,Decision
0,Given: A user is not authenticated with Scribl,When: The user attempts to sign in via LinkedIn OAuth,0.3181,NEUTRAL,100.00,❌ Not Similar
1,Given: A user is not authenticated with Scribl,"Then: The system retrieves the user's LinkedIn profile data (name, title, connections, profile picture), and the system creates a Scribl user record",0.5064,NEUTRAL,100.00,❌ Not Similar
2,Given: A user is not authenticated with Scribl,Given: A new user has successfully authenticated with Scribl,0.8241,CONTRADICTION,99.25,❌ Contradictory
3,Given: A user is not authenticated with Scribl,"When: The user starts the multi-step onboarding flow and provides goals, brand voice, posting frequency, and niche selections",0.0841,NEUTRAL,100.00,❌ Not Similar
4,Given: A user is not authenticated with Scribl,"Then: The system saves the user's progress automatically, and the user's preferences are configured",0.0481,NEUTRAL,100.00,❌ Not Similar
...,...,...,...,...,...,...
2456431,"Then: A complete audit trail of who approved/rejected, when, and any associated comments should be maintained",When: Conditional approvals are configured,0.3769,NEUTRAL,100.00,❌ Not Similar
2456432,"Then: A complete audit trail of who approved/rejected, when, and any associated comments should be maintained","Then: The workflow should adjust its path or require additional approvals based on specific criteria within the request (e.g., amount, department)",0.4269,NEUTRAL,100.00,❌ Not Similar
2456433,Given: An approval workflow involves dynamic decision-making,When: Conditional approvals are configured,0.4849,NEUTRAL,100.00,❌ Not Similar
2456434,Given: An approval workflow involves dynamic decision-making,"Then: The workflow should adjust its path or require additional approvals based on specific criteria within the request (e.g., amount, department)",0.5097,NEUTRAL,100.00,❌ Not Similar



--- Summary of BDD Step Analysis ---
Total individual steps analyzed: 2217
Pairs identified as Likely Duplicates: 174
Pairs identified as Contradictory: 46
Number of unique scenarios after deduplication: 740
------------------------------------


In [None]:
# ------------------------------------------------------
# 6. Save to CSV
# ------------------------------------------------------
# Relative path, so the file is written to the notebook's working directory.
output_path = "macro_hard_full_sentence_comparison_results.csv"
# Writes the whole comparison table (one row per pair) without the index column.
df_results.to_csv(output_path, index=False)
print(f"\n✅ Results saved to '{output_path}'")


✅ Results saved to 'macro_hard_full_sentence_comparison_results.csv'


# ------------- NEW CODE ENDS HERE ⬆️------------------------------

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

#=========== SAVE DUPLICATES TO A CSV for later inspection ============
# COSINE SIMILARITY
def remove_duplicates(features, threshold=0.9, show_duplicates=True,
                      model_name='intfloat/e5-large-v2',
                      report_path="duplicates_report.csv"):
    """
    Remove semantically similar BDD scenarios using cosine similarity.

    Each scenario is rendered as "Given ... When ... Then ..." and embedded
    with a sentence-transformer model. Scenarios are scanned in order: a later
    scenario is dropped when its similarity to an earlier kept scenario is
    above `threshold` and their When/Then texts do not contain one of the
    contradictory keyword pairs (e.g. "success" vs "error").

    Args:
        features: List of scenario dicts with "given"/"when"/"then" keys.
        threshold: Cosine similarity above which two scenarios are duplicates.
        show_duplicates: When True, print the ten most similar duplicate pairs
            and write all pairs to `report_path`.
        model_name: SentenceTransformer model used for the embeddings
            (e.g. 'all-MiniLM-L6-v2' for a smaller model).
        report_path: CSV file receiving the duplicate report.

    Returns:
        A new list with the kept scenarios, in their original order.
    """
    features = list(features)

    # With fewer than two scenarios there is nothing to compare; returning
    # early also avoids computing a similarity matrix on empty embeddings.
    if len(features) < 2:
        print(f"\n🧹 Before cleanup: {len(features)} scenarios\n")
        print("❌🗑️ Removed 0 DUPLICATE SCENARIOS.\n")
        print(f"✅ After cleanup: {len(features)} scenarios\n")
        return features

    # NOTE(review): the e5 model card describes "query: " / "passage: " input
    # prefixes — confirm whether they should be applied to these texts.
    model = SentenceTransformer(model_name)

    texts = [
        f"Given {f.get('given', '')} When {f.get('when', '')} Then {f.get('then', '')}"
        for f in features
    ]

    # When/Then texts are kept separately for the contrast check.
    whens = [f.get('when', '') for f in features]
    thens = [f.get('then', '') for f in features]

    embeddings = model.encode(texts)
    sim_matrix = cosine_similarity(embeddings)

    seen = set()          # indices already classified as duplicates
    unique_indices = []   # indices of scenarios that are kept
    duplicates = []       # tuples of (original_idx, duplicate_idx, similarity)

    # Keyword pairs that signal opposite outcomes; matched as substrings.
    contrast_pairs = [
        ("success", "error"), ("approve", "reject"),
        ("completed", "failed"), ("allow", "deny"),
        ("green", "red"), ("enabled", "disabled"),
        ("true", "false")
    ]

    def has_contrast(text1, text2):
        """Return True when one text has a keyword and the other its opposite."""
        t1, t2 = text1.lower(), text2.lower()
        for a, b in contrast_pairs:
            if (a in t1 and b in t2) or (b in t1 and a in t2):
                return True
        return False

    for i in range(len(features)):
        if i in seen:
            continue
        for j in range(i + 1, len(features)):
            # Duplicate = similar enough AND not describing opposite outcomes.
            if sim_matrix[i, j] > threshold and not has_contrast(
                f"{whens[i]} {thens[i]}", f"{whens[j]} {thens[j]}"
            ):
                seen.add(j)
                duplicates.append((i, j, sim_matrix[i, j]))
        unique_indices.append(i)

    removed_count = len(features) - len(unique_indices)
    print(f"\n🧹 Before cleanup: {len(features)} scenarios\n")
    print(f"❌🗑️ Removed {removed_count} DUPLICATE SCENARIOS.\n")
    print(f"✅ After cleanup: {len(unique_indices)} scenarios\n")

    if show_duplicates and duplicates:
        print("🔍 Duplicate scenario pairs (showing top 10 by similarity):\n")
        # Most similar pairs first.
        duplicates = sorted(duplicates, key=lambda x: x[2], reverse=True)

        for a, b, score in duplicates[:10]:
            print(f"\n🧩 Similarity: {score:.3f}")
            print(f"🅰️ Scenario A: {texts[a][:300]}")
            print(f"🅱️ Scenario B: {texts[b][:300]}")
            print("-" * 80)

        # Save every duplicate pair for later inspection.
        dup_data = [
            {"original_index": a, "duplicate_index": b, "similarity": score,
             "scenario_A": texts[a], "scenario_B": texts[b]}
            for a, b, score in duplicates
        ]
        pd.DataFrame(dup_data).to_csv(report_path, index=False, encoding="utf-8")
        print(f"\n📁 Detailed duplicate report saved → {report_path}")

    return [features[i] for i in unique_indices]

In [None]:
# 🔍 Remove near-duplicate scenarios

def _write_json(target, payload):
    """Serialise `payload` to `target` as indented UTF-8 JSON, keeping non-ASCII characters readable."""
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)


print(f"🧹 Before cleanup: {len(bdd_json['features'])} scenarios")

# Snapshot the scenarios exactly as they are before deduplication.
raw_output_path = Path("bdd_output_gemini_raw.json")
_write_json(raw_output_path, bdd_json)
print(f"\n📁 Saved original (before deduplication): {raw_output_path.resolve()}")

# The scenario list inside bdd_json is replaced by its deduplicated version.
bdd_json["features"] = remove_duplicates(bdd_json["features"], threshold=0.9)

print(f"\n\n✅ After cleanup: {len(bdd_json['features'])} scenarios")

# Persist the cleaned scenarios next to the raw snapshot.
deduped_output_path = Path("bdd_output_gemini_duplicates_removed.json")
_write_json(deduped_output_path, bdd_json)
print(f"\n📁 Saved cleaned (after deduplication): {deduped_output_path.resolve()}")

print(f"\n✅ BDD JSON created: {deduped_output_path.resolve()}")


🧹 Before cleanup: 1375 scenarios

📁 Saved original (before deduplication): /content/bdd_output_gemini_raw.json


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]


🧹 Before cleanup: 1375 scenarios

❌🗑️ Removed 635 DUPLICATE SCENARIOS.

✅ After cleanup: 740 scenarios

🔍 Duplicate scenario pairs (showing top 10 by similarity):


🧩 Similarity: 0.995
🅰️ Scenario A: Given An existing 'User' and a specified amount of 'creditsTotal'. When A request is made to allocate AI credits to the 'userId'. Then An 'AiCreditAllocation' record should be created for the user, setting 'creditsTotal' and 'creditsRemaining' to the allocated amount, with 'allocatedAt' timestamped.
🅱️ Scenario B: Given An existing 'Team' and a specified amount of 'creditsTotal'. When A request is made to allocate AI credits to the 'teamId'. Then An 'AiCreditAllocation' record should be created for the team, setting 'creditsTotal' and 'creditsRemaining' to the allocated amount, with 'allocatedAt' timestamped.
--------------------------------------------------------------------------------

🧩 Similarity: 0.984
🅰️ Scenario A: Given A user has authorized the application on LinkedIn and is ne

In [None]:
# # Save Final JSON
# output_path = Path("bdd_output_gemini_duplicates_removed.json")
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(bdd_json, f, indent=2, ensure_ascii=False) # ensure_asci=False ; don’t convert non-English characters into escape codes. Keep them readable as they are.

# print(f"\n✅ BDD JSON created: {output_path.resolve()}")

# Outputting the Results

In [None]:
import json
from google.colab import files

# Display the deduplicated output written by the cleanup cell above.
# ("bdd_output_gemini.json" is not produced by any cell in this notebook, so
# reading it fails on a fresh runtime or shows a stale file from another run.)
file_name = "bdd_output_gemini_duplicates_removed.json"

# Read & parse JSON
with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pretty print the JSON (indentation)
print(json.dumps(data, indent=2, ensure_ascii=False))

{
  "features": [
    {
      "given": "A user wants to access Scribl",
      "when": "The user authenticates through LinkedIn OAuth",
      "then": "The system retrieves their LinkedIn profile data (name, title, connections, profile picture) and creates a Scribl user record"
    },
    {
      "given": "A new user has successfully authenticated with Scribl",
      "when": "The user proceeds through the guided, multi-step onboarding process",
      "then": "The system collects user goals, brand voice preferences, posting frequency, and niche selections, and auto-saves their progress"
    },
    {
      "given": "A Team Admin is setting up Scribl for their team",
      "when": "The Admin selects the number of seats and a subscription plan",
      "then": "A real-time calculator displays pricing and features, and the system processes payment via Stripe before provisioning access"
    },
    {
      "given": "A user is in the content creation interface",
      "when": "The user inputs an 

# SPLIT into different Categories


In [None]:
import os
import json
from pathlib import Path

# Load the deduplicated master JSON written by the cleanup cell above.
# ("bdd_output_gemini.json" is not produced by any cell in this notebook.)
with open("bdd_output_gemini_duplicates_removed.json", "r", encoding="utf-8") as f:
    bdd_json = json.load(f)

# Create an output folder for the split files
output_dir = Path("bdd_output_split")
output_dir.mkdir(exist_ok=True)

import re

# Keywords used to detect each domain. Domains are checked in the order
# listed here and the first domain with a matching keyword wins.
domain_map = {
    "auth": ["login", "signup", "password", "mfa", "authenticate", "logout"],
    "onboarding": ["onboard", "setup", "profile", "introduction"],
    "billing": ["payment", "invoice", "subscription", "refund", "billing", "checkout"],
    "content": ["post", "article", "draft", "generate", "ai", "editor", "caption"],
    "analytics": ["dashboard", "metrics", "insights", "report", "tracking"],
    "team": ["team", "workspace", "member", "invite"],
    "admin": ["admin", "role", "permission", "configuration", "superadmin"],
    "notifications": ["notification", "email", "message", "alert"],
    "workflow": ["workflow", "approval", "task", "automation"],
    "integration": ["api", "webhook", "integration", "connector"],
    "security": ["compliance", "encryption", "access", "tls", "policy"]
}

def detect_domain(scenario):
    """
    Detect which domain a scenario belongs to based on keyword matching.

    The "given", "when" and "then" texts are joined and lower-cased, then
    compared against `domain_map` in its declared order.

    Keywords of up to three characters ("ai", "api", "mfa", "tls") must match
    as whole words (an optional plural "s" is allowed); as plain substrings
    they would also hit unrelated words such as "email" or "failed". Longer
    keywords keep substring matching so that inflected forms such as
    "onboarding" or "authenticated" are still recognised.

    Args:
        scenario: Dict with optional "given"/"when"/"then" entries. Missing
            or None entries are treated as empty text.

    Returns:
        The first matching domain name, or "misc" when nothing matches.
    """
    text = " ".join(
        str(scenario.get(part) or "") for part in ("given", "when", "then")
    ).lower()

    for domain, keywords in domain_map.items():
        for keyword in keywords:
            if len(keyword) <= 3:
                # Short keyword: require word boundaries on both sides.
                if re.search(rf"\b{re.escape(keyword)}s?\b", text):
                    return domain
            elif keyword in text:
                return domain

    return "misc"  # fallback if no match

# Group the scenarios by domain in memory first. Each domain file is then
# written exactly once and overwritten on re-run; appending to a file left by
# a previous run would duplicate every scenario.
scenarios_by_domain = {}
for scenario in bdd_json.get("features", []):
    scenarios_by_domain.setdefault(detect_domain(scenario), []).append(scenario)

# Write one JSON file per domain. A dedicated name is used for the path so
# the notebook-level `file_path` (the PRD being processed) is not overwritten.
for domain, domain_scenarios in scenarios_by_domain.items():
    domain_file = output_dir / f"{domain}.json"
    with open(domain_file, "w", encoding="utf-8") as f:
        json.dump({"features": domain_scenarios}, f, indent=2, ensure_ascii=False)

# How many scenarios ended up in each domain
domain_counts = {
    domain: len(domain_scenarios)
    for domain, domain_scenarios in scenarios_by_domain.items()
}

# === Summary Printout ===
print("\n✅ BDD scenarios split into domain-based JSON files inside /bdd_output_split/\n")
print("📊 Summary by category:\n")
for domain, count in sorted(domain_counts.items()):
    print(f"  • {domain:<15} → {count} scenarios")
print(f"\n📁 Total categories: {len(domain_counts)}")
print(f"🧩 Total scenarios:  {sum(domain_counts.values())}")



✅ BDD scenarios split into domain-based JSON files inside /bdd_output_split/

📊 Summary by category:

  • admin           → 61 scenarios
  • analytics       → 30 scenarios
  • auth            → 96 scenarios
  • billing         → 148 scenarios
  • content         → 372 scenarios
  • integration     → 25 scenarios
  • misc            → 115 scenarios
  • notifications   → 15 scenarios
  • onboarding      → 51 scenarios
  • security        → 27 scenarios
  • team            → 167 scenarios
  • workflow        → 21 scenarios

📁 Total categories: 12
🧩 Total scenarios:  1128




# Inferences

## Option 1: Split After Generation (Keyword-Based)

**How it works:**

* Generate all BDDs into one JSON file.
* Use keyword matching (e.g., `"login" → auth`, `"payment" → billing`) to split into logical JSON files.

**Pros:**

* Fast and simple implementation
* No extra API calls (cost-efficient)
* Deterministic output (no randomness)
* Easy to debug and adjust
* Low latency, good for bulk PRD processing

**Cons:**

* Requires manual keyword maintenance
* May misclassify complex sentences
* No real semantic understanding

<br>

---

<br>

## Option 2: Modify JSON and Ask LLM to Classify (LLM-Assisted)

**How it works:**

* Ask the LLM to include a `"domain"` field along with each scenario (`Given`, `When`, `Then`).

**Example output:**

```json
{
  "given": "A user has valid credentials",
  "when": "They attempt to log in",
  "then": "The system authenticates them",
  "domain": "auth"
}
```

**Pros:**

* Understands semantic meaning beyond keywords
* Adapts as domains evolve
* Simplifies post-processing (group by domain directly)

**Cons:**

* Higher API token cost
* Slightly slower generation
* May produce inconsistent domain labels
* May fill the context window faster when we explicitly mention the domains in the prompt

<br>

---

<br>

## Which Approach Is Better?

**If prototyping or building an early pipeline:**

* Keyword-based splitting is better
* Easier to tune and debug
* Fast, predictable, and cheap
* “Domain” field can be added later

**If optimizing for production automation:**

* LLM-based tagging is better
* More flexible and semantically accurate
* Scales across complex or ambiguous PRDs

<br>

---

<br>

## Hybrid Approach

Use both approaches together:

```python
if "domain" in scenario and scenario["domain"]:
    domain = normalize_domain(scenario["domain"])
else:
    domain = detect_domain_using_keywords(scenario)
```

* Combines LLM’s semantic power with keyword fallback
* Balances accuracy, cost, and stability

<br>

---

<br>


## Summary

* **For rapid prototyping:** use keyword-based post-split
* **For production-level accuracy:** use LLM-generated `domain`
* **Best overall:** hybrid approach (LLM + keyword fallback)

