<a href="https://colab.research.google.com/github/awarepenguin70/Gherkinizer/blob/master/gherkin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported by the cells below
# (transformers, torch, spacy, sentence-transformers).
!pip install transformers torch spacy sentence-transformers
# Download the small English spaCy model; the generator class loads it with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Import all necessary modules
import torch
import spacy
import re  # Import the missing module
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict

In [None]:
class GherkinGenerator:
    """Turn a free-text user story into a Gherkin scenario.

    Several candidates are sampled from a text-generation model, each one is
    scored (semantic similarity to the story, classifier score, structural
    completeness) and the best one is returned.  When no candidate reaches
    ``MIN_ACCEPTABLE_SCORE`` a rule-based scenario built with spaCy is
    returned instead.
    """

    # Regexes for the Gherkin keywords.  They are applied with ``re.match`` to
    # whitespace-stripped lines, so a keyword only counts at the start of a
    # line (e.g. "Scenario: When ..." is a scenario, not a "when" step).
    GHERKIN_PATTERNS = {
        'feature': r"Feature:\s*(.+)",
        'scenario': r"Scenario:\s*(.+)",
        'given': r"Given\s+(.+)",
        'when': r"When\s+(.+)",
        'then': r"Then\s+(.+)",
    }

    # Components that must all be present for a scenario to count as complete.
    REQUIRED_COMPONENTS = ('feature', 'scenario', 'given', 'when', 'then')

    # Weights of the three partial scores and the acceptance threshold.
    SEMANTIC_WEIGHT = 0.4
    QUALITY_WEIGHT = 0.4
    STRUCTURE_WEIGHT = 0.2
    MIN_ACCEPTABLE_SCORE = 0.6

    # Last line of the prompt template; used to cut the echoed prompt off a
    # generated text when the exact prompt prefix cannot be found.
    _PROMPT_TAIL = "Then [Expected outcome]"

    # Role clause of the usual "As a <role>, I want ..." story format.
    _ROLE_PATTERN = re.compile(
        r"\bas\s+an?\s+(.+?)\s*(?:,|\bI\s+want\b)", re.IGNORECASE
    )

    def __init__(self):
        """Select the device and load every model and NLP component."""
        self.device = 0 if torch.cuda.is_available() else -1
        self.base_model = "gpt2-medium"  # More powerful than base GPT-2
        self._init_models()
        self._init_nlp_pipelines()

    def _init_models(self):
        """Initialize all required ML models"""
        # Text generation model
        self.generator = pipeline(
            "text-generation",
            model=self.base_model,
            device=self.device,
            max_length=300,
            # temperature/top_p only apply when sampling, and several return
            # sequences need sampling (or beams); state it explicitly instead
            # of relying on the checkpoint's own generation defaults.
            do_sample=True,
            temperature=0.65,
            top_p=0.9,
            truncation=True,
            num_return_sequences=3  # Generate multiple candidates
        )

        # Semantic similarity model
        self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Quality classifier
        self.quality_tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-MNLI")
        self.quality_model = AutoModelForSequenceClassification.from_pretrained(
            "textattack/bert-base-uncased-MNLI"
        )

    def _init_nlp_pipelines(self):
        """Initialize rule-based NLP components"""
        self.nlp = spacy.load("en_core_web_sm")
        # Per-instance copy so callers may adjust patterns without touching
        # the class-level defaults.
        self.gherkin_patterns = dict(self.GHERKIN_PATTERNS)

    # ------------------
    # Algorithm 2: Semantic Validation Engine
    # ------------------
    def _validate_semantics(self, user_story: str, generated_text: str) -> float:
        """Return the cosine similarity between the story and the generated text.

        Both strings are embedded with ``self.similarity_model``.
        """
        embeddings = self.similarity_model.encode([user_story, generated_text])
        return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

    def _check_quality(self, generated_text: str) -> float:
        """Return the classifier's softmax probability for class index 0.

        The text is truncated to 512 tokens before classification.
        NOTE(review): which MNLI label index 0 denotes for this checkpoint is
        not visible in this file - confirm that it is the intended "quality"
        signal.
        """
        inputs = self.quality_tokenizer(
            generated_text,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            outputs = self.quality_model(**inputs)
        return torch.softmax(outputs.logits, dim=1)[0][0].item()

    # ------------------
    # Algorithm 3: Structural Analyzer
    # ------------------
    def _analyze_structure(self, text: str) -> dict:
        """Collect the Gherkin components found in ``text``.

        Returns a ``defaultdict(list)`` mapping each keyword name from
        ``self.gherkin_patterns`` to the captured remainder of every line that
        starts with that keyword.  Keywords that never occur get no entry.
        """
        components = defaultdict(list)
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            for key, pattern in self.gherkin_patterns.items():
                # re.match anchors at the start of the stripped line.
                match = re.match(pattern, line)
                if match:
                    components[key].append(match.group(1))
        return components

    def _validate_completeness(self, components: dict) -> bool:
        """Return True when every mandatory component has at least one entry."""
        return all(components.get(key) for key in self.REQUIRED_COMPONENTS)

    # ------------------
    # Algorithm 4: Complexity Analyzer
    # ------------------
    def _analyze_complexity(self, user_story: str) -> int:
        """Return a complexity heuristic for the story.

        The value is the number of PERSON entities plus the number of VERB
        tokens plus the number of sentences reported by spaCy.
        """
        doc = self.nlp(user_story)

        num_actors = sum(1 for ent in doc.ents if ent.label_ == 'PERSON')
        num_actions = sum(1 for token in doc if token.pos_ == 'VERB')
        num_sentences = sum(1 for _ in doc.sents)

        return num_actors + num_actions + num_sentences

    # ------------------
    # Algorithm 5: Generation Pipeline
    # ------------------
    @staticmethod
    def _strip_prompt(prompt: str, generated: str) -> str:
        """Return only the model's continuation of ``prompt``.

        The text-generation output echoes the prompt.  Without removing it,
        the template's own "Feature:/Scenario:/Given/When/Then" lines would
        satisfy the structure check and could be returned as the result.
        """
        if generated.startswith(prompt):
            return generated[len(prompt):].strip()
        # Prompt prefix not found verbatim: cut after the first occurrence of
        # the template's final line, or keep the text unchanged if absent.
        _, separator, continuation = generated.partition(
            GherkinGenerator._PROMPT_TAIL
        )
        return (continuation if separator else generated).strip()

    def generate_gherkin(self, user_story: str) -> str:
        """Generate a Gherkin scenario for ``user_story``.

        Returns the best-scoring generated candidate, or the rule-based
        fallback scenario when no candidate scores at least
        ``MIN_ACCEPTABLE_SCORE``.
        """
        # Step 1: Analyze input complexity
        complexity = self._analyze_complexity(user_story)

        # Step 2: Generate multiple candidates
        prompt = f"""Convert this user story to Gherkin format:

        User Story: {user_story}

        Format:
        Feature: [Clear feature name]
        Scenario: [Specific scenario]
        Given [Initial context]
        When [Triggering action]
        Then [Expected outcome]"""

        # Longer budget for more complex stories.
        candidates = self.generator(prompt, max_length=200 + complexity*10)

        # Step 3: Select best candidate
        best_score = -1
        best_gherkin = ""

        for candidate in candidates:
            text = self._strip_prompt(prompt, candidate['generated_text'])
            if not text:
                # The model produced nothing beyond the prompt.
                continue

            # Validation checks
            semantic_score = self._validate_semantics(user_story, text)
            quality_score = self._check_quality(text)
            components = self._analyze_structure(text)
            structure_valid = self._validate_completeness(components)

            total_score = (semantic_score * self.SEMANTIC_WEIGHT +
                           quality_score * self.QUALITY_WEIGHT +
                           float(structure_valid) * self.STRUCTURE_WEIGHT)

            if total_score > best_score:
                best_score = total_score
                best_gherkin = text

        # Fallback mechanism
        if best_score < self.MIN_ACCEPTABLE_SCORE:
            return self._generate_fallback_gherkin(user_story)

        return best_gherkin

    # ------------------
    # Algorithm 6: Fallback Generation
    # ------------------
    @classmethod
    def _extract_role(cls, user_story: str) -> str:
        """Return the role from an "As a <role>, ..." clause, or '' if absent."""
        match = cls._ROLE_PATTERN.search(user_story)
        return match.group(1).strip() if match else ''

    @staticmethod
    def _render_fallback(actor: str, verb: str, objective: str) -> str:
        """Format the fallback scenario; empty parts get neutral defaults."""
        feature = objective or 'System Feature'
        scenario = ' '.join(part for part in (verb, objective) if part) or 'Default scenario'
        when_step = f"{verb} action is performed" if verb else "the action is performed"
        then_step = (f"{objective} should be achieved" if objective
                     else "the expected outcome should be achieved")
        return (
            f"Feature: {feature}\n"
            f"Scenario: {scenario}\n"
            f"Given A {actor or 'user'} exists\n"
            f"When {when_step}\n"
            f"Then {then_step}"
        )

    def _generate_fallback_gherkin(self, user_story: str) -> str:
        """Rule-based fallback when ML generation fails.

        Uses the first PERSON entity (or, if there is none, the role of an
        "As a <role>" clause), the first verb lemma and the first
        direct-object noun chunk found by spaCy.
        """
        doc = self.nlp(user_story)

        # Extract key components
        actors = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
        verbs = [token.lemma_ for token in doc if token.pos_ == 'VERB']
        objectives = [chunk.text for chunk in doc.noun_chunks if chunk.root.dep_ == 'dobj']

        # Roles such as "registered user" are not PERSON entities, so fall
        # back to the story's "As a <role>" clause.
        actor = actors[0] if actors else self._extract_role(user_story)
        verb = verbs[0] if verbs else ''
        objective = objectives[0] if objectives else ''

        return self._render_fallback(actor, verb, objective)

In [None]:
if __name__ == "__main__":
    # Demo run: constructing the generator loads every model it needs.
    generator = GherkinGenerator()

    # Sample story in the usual "As a ..., I want ... so ..." form.
    user_story = (
        "As a registered user, I want to reset my password using email "
        "verification so I can regain access when locked out"
    )

    print("Generating Gherkin...")
    result = generator.generate_gherkin(user_story)

    # Blank line, heading, then the scenario on the following line.
    print("\nFinal Gherkin Scenario:", result, sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generating Gherkin...


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


Final Gherkin Scenario:
Feature: my password
Scenario: want my password
Given A user exists
When want action is performed
Then my password should be achieved
