<a href="https://colab.research.google.com/github/abdullah790/Hausa-NLP-Thesis-Framework/blob/main/NLP4RE_Framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount to Google Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


**Stage 1**: Corpus Builder & Intelligent Preprocessing
Description: This script serves as the Data Acquisition and ETL (Extract, Transform, Load) engine for your framework. It is responsible for constructing the Hausa-Software-Corpus by ingesting raw English software documentation, filtering for relevance, and generating high-quality parallel Hausa data.

Key Technical Functions:

Multi-Format Ingestion: Extracts raw text from PDF
and DOCX files using pdfplumber and python-docx.

Glossary Anchoring: Implements a Glossary Injection mechanism that "locks" specific technical terms (e.g., "API", "Database") before translation to prevent semantic loss.

Synthetic Augmentation: Generates artificial training data using rule-based templates to balance the dataset with Functional Requirements.

Silver-Standard Annotation: Automatically applies BIO Tags (e.g., B-ACTOR, I-SYS) to the generated Hausa text using heuristic pattern matching, preparing the data for Named Entity Recognition (NER) training.

Robust Translation: Orchestrates calls to the Google Translate API to produce the target Hausa text, followed by post-processing normalization (Unicode NFC).

In [None]:
# STAGE 1 & 2: CORPUS BUILDER + INTELLIGENT PREPROCESSING (UPDATED FOR NESTED GLOSSARY)

!pip install -q pdfplumber python-docx transformers sentencepiece sacremoses datasets nltk requests google-cloud-translate
!pip install -q deep-translator
import os
import re
import csv
import json
import random
import unicodedata
import pdfplumber
import docx
import nltk
import requests
import torch
import numpy as np
from difflib import SequenceMatcher
from transformers import pipeline
from google.colab import drive

# CONFIGURATION
drive.mount("/content/drive", force_remount=True)

BASE_INPUT_DIR = "/content/drive/MyDrive/Thesis/my_srs_docs"
BASE_OUTPUT_DIR = "/content/drive/MyDrive/Thesis/my_ha_srs_corpus"
GLOSSARY_PATH = "/content/drive/MyDrive/Thesis/srs_glossary.json"

os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
PARALLEL_CORPUS_CSV = os.path.join(BASE_OUTPUT_DIR, "hausa_srs_parallel_silver.csv")

# API Keys
GOOGLE_API_KEY = "    "#input your google cloud API here
GOOGLE_ENDPOINT = "https://translation.googleapis.com/language/translate/v2"

DEVICE = 0 if torch.cuda.is_available() else -1

# 1. LOAD RESOURCES (NESTED GLOSSARY PARSER)

def load_glossary(path):
    """
    Loads and FLATTENS the nested SRS Glossary.
    Extracts {english_term: hausa_translation} from all categories.
    """
    flat_glossary = {}
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Iterate through known categories in your JSON
            categories = [
                "core_concepts", "actors", "modality",
                "functional_actions", "data_entities",
                "quality_attributes", "constraints"
            ]

            for cat in categories:
                if cat in data:
                    for eng_term, details in data[cat].items():
                        # Handle cases where value is a dict (core_concepts) or string (functional_actions)
                        if isinstance(details, dict) and "hausa" in details:
                            ha_term = details["hausa"]
                        elif isinstance(details, str):
                            ha_term = details
                        else:
                            continue

                        flat_glossary[eng_term.lower()] = ha_term

            print(f" Loaded Glossary: {len(flat_glossary)} terms found.")
        except Exception as e:
            print(f" Error loading glossary: {e}")
    else:
        print(f" Glossary file not found at {path}. Using default keywords.")
    return flat_glossary

# Load the Glossary
SRS_GLOSSARY = load_glossary(GLOSSARY_PATH)

# Default Keywords (augmented by Glossary)
SRS_KEYWORDS = {"database", "server", "api", "latency", "bandwidth", "ui", "ux", "admin", "user", "login"}
SRS_KEYWORDS.update(SRS_GLOSSARY.keys())

print("Loading Intelligence Models...")
domain_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=DEVICE)
DOMAIN_LABELS = ["Software Requirements Specification", "General Text","Software Requirements Specification", "SRS",
    "Functional Requirements", "Non-Functional Requirements", "Technical Specification", "System Requirements",
    "Use Case Specification", "Business Requirements", "Product Requirements Document", "User Story Document",
    "Requirements Traceability Matrix", "IEEE 830 SRS", "Requirements Engineering Document"]
tech_ner_classifier = pipeline("ner", model="dslim/distilbert-NER", aggregation_strategy="simple", device=DEVICE)
lang_classifier = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection", device=DEVICE)

# 2. PRE-TRANSLATION INTELLIGENCE (ANCHORING)
def tag_technical_anchors(text):
    # 1. Glossary/Dictionary Lookup
    sorted_keywords = sorted(SRS_KEYWORDS, key=len, reverse=True)

    for term in sorted_keywords:
        pattern = re.compile(r'\b(' + re.escape(term) + r')\b', re.IGNORECASE)
        if pattern.search(text):
             text = pattern.sub(r'<TECH>\1</TECH>', text)

    # 2. DistilBERT NER
    entities = tech_ner_classifier(text)
    for ent in sorted(entities, key=lambda x: x['start'], reverse=True):
        if ent['entity_group'] in ['MISC', 'ORG'] and ent['score'] > 0.85:
            window = text[max(0, ent['start']-6):ent['end']+7]
            if "<TECH>" not in window:
                word = text[ent['start']:ent['end']]
                text = text[:ent['start']] + f"<TECH>{word}</TECH>" + text[ent['end']:]
    return text

def apply_glossary_substitution(text):
    def replace_match(match):
        english_term = match.group(1)
        lower_term = english_term.lower()
        if lower_term in SRS_GLOSSARY:
            return SRS_GLOSSARY[lower_term] # Inject Hausa Translation
        return english_term # Code-Switching

    return re.sub(r'<TECH>(.*?)</TECH>', replace_match, text)

# 3. TEXT CLEANING & NORMALIZATION
def generic_cleaning(text):
    text = re.sub(r'Page \d+ of \d+', '', text, flags=re.I)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'\b(Confidential|Draft|Internal Use Only)\b', '', text, flags=re.I)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_hausa_text(text):
    text = unicodedata.normalize("NFC", text)
    replacements = {'ƙ': 'ƙ', 'ɗ': 'ɗ', 'ɓ': 'ɓ', 'ƴ': 'y', "’": "'", "`": "'"}
    for k, v in replacements.items():
        text = text.replace(k, v)
    text = apply_glossary_substitution(text)
    return text.strip()

# 4. TEMPLATES & BIO TAGGING
# Updated BIO Lookup based on your Glossary content
BIO_LOOKUP = {
    "tsarin": "B-SYS", "manhaja": "B-SYS", "sashin tsari": "B-SYS", "ma’ajiya ta bayanai": "B-SYS",
    "mai amfani": "B-ACTOR", "mai gudanarwa": "B-ACTOR", "mai ruwa da tsaki": "B-ACTOR", "abokin ciniki": "B-ACTOR",
    "bayanai": "B-DATA", "ilimi": "B-DATA", "rikodi": "B-DATA", "rahoto": "B-DATA", "kalmar sirri": "B-DATA",
    "lokacin amsawa": "B-QUAL", "samuwa": "B-QUAL", "tsaro": "B-QUAL", "sauƙin amfani": "B-QUAL",
    "iyaka": "B-CONS", "sharadi": "B-CONS", "ka’ida": "B-CONS"
}

def heuristic_bio_tagger(text):
    tokens = text.split()
    tags = ["O"] * len(tokens)
    for i, token in enumerate(tokens):
        clean_tok = token.strip(".,;:").lower()
        # Check single words
        if clean_tok in BIO_LOOKUP:
            tags[i] = BIO_LOOKUP[clean_tok]
        # Check bigrams (common in your glossary like 'mai amfani')
        elif i > 0:
            bigram = f"{tokens[i-1].strip('.,;:').lower()} {clean_tok}"
            if bigram in BIO_LOOKUP:
                tags[i-1] = BIO_LOOKUP[bigram]
                tags[i] = BIO_LOOKUP[bigram].replace("B-", "I-")
    return " ".join(tags)

def generate_ieee_templates():
    data = []
    # Using terms strictly from your glossary
    actors = [("Mai amfani", "B-ACTOR I-ACTOR"), ("Mai gudanarwa", "B-ACTOR I-ACTOR"), ("Tsarin", "B-SYS")]
    modals = ["zai iya", "dole ne", "ya kamata"]
    actions = ["adana", "turo", "nuna", "sabunta", "goge"]
    objects = [("bayanai", "B-DATA"), ("rahoto", "B-DATA"), ("kalmar sirri", "B-DATA I-DATA")]

    for _ in range(50):
        act_txt, act_tag = random.choice(actors)
        obj_txt, obj_tag = random.choice(objects)
        sent = f"{act_txt} {random.choice(modals)} {random.choice(actions)} {obj_txt}."
        tags = f"{act_tag} O O O {obj_tag} O"
        data.append({"text": sent, "tags": tags, "type": "FR"})
    return data

# 5. TRANSLATION & INGESTION
def google_translate(texts, src, tgt):
    if not texts: return []
    payload = {"q": texts, "source": src, "target": tgt, "key": GOOGLE_API_KEY, "format": "text"}
    try:
        resp = requests.post(GOOGLE_ENDPOINT, data=payload, timeout=20)
        if resp.status_code == 200:
            return [t["translatedText"] for t in resp.json()["data"]["translations"]]
    except: pass
    return ["" for _ in texts]

def extract_text_from_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    chunks = []
    try:
        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    if page.extract_text(): chunks.append(page.extract_text())
        elif ext == ".docx":
            doc = docx.Document(file_path)
            for p in doc.paragraphs:
                if p.text.strip(): chunks.append(p.text)
    except Exception as e: print(f"Error: {e}")
    return chunks

# 6. MAIN PIPELINE
def run_stage_1_and_2():
    RESCUE_KEYWORDS = ["software requirements specification", "SRS", "requirement", "requirements document",
    "requirement id", "use case", "user story", "scenario", "business rule", "assumption", "constraint", "dependency", "stakeholder", "actor", "system",
    "subsystem", "module", "interface", "component", "feature", "function",  "behaviour", "input", "output", "event", "trigger", "precondition",
    "postcondition", "workflow", "process", "state", "status", "transition", "functional requirement", "FR", "the system shall", "the system should", "the system must", "the system will", "the application shall", "the application must",
    "shall allow", "shall enable", "shall support", "shall provide", "shall display", "shall record", "shall log", "shall calculate", "shall validate", "shall notify",
    "shall send", "shall receive", "shall store", "shall retrieve", "shall update", "shall delete", "shall create", "shall generate", "shall export", "shall import",
    "user shall be able to", "must allow", "must enable", "must support", "must provide", "must display", "must record", "must log", "must calculate", "must validate",
    "non-functional requirement", "NFR", "quality requirement", "quality attribute", "performance requirement", "security requirement", "usability requirement",
    "reliability requirement", "availability requirement", "maintainability requirement", "scalability requirement", "portability requirement", "compatibility requirement",
    "response time", "throughput", "latency", "performance", "load", "peak load", "concurrent users", "maximum users", "capacity", "timeout", "caching",
    "authentication", "authorization", "access control", "role-based access control","RBAC", "single sign-on", "SSO", "encryption", "cryptography", "SSL", "TLS",
    "HTTPS", "password policy", "session management", "auditing", "audit trail", "logging", "intrusion", "attack", "vulnerability", "threat", "secure",
    "uptime", "availability", "fault tolerance", "failover", "redundancy", "recovery", "disaster recovery", "backup", "restore", "MTBF", "MTTR",
    "usability", "user experience", "UX", "user-friendly", "intuitive","accessibility", "WCAG", "localization", "internationalization", "i18n", "l10n",
    "shall", "must", "should", "may", "will", "is required to", "is able to", "has to", "needs to", "cannot", "must not", "shall not", "should not",
    "introduction", "scope", "purpose", "definitions", "glossary", "acronyms", "overview", "system overview", "overall description", "specific requirements",
    "functional requirements", "non-functional requirements", "external interface requirements", "system features", "assumptions", "constraints", "appendix",
    "user interface", "UI", "GUI", "screen", "form", "page", "menu", "dialog","button", "field", "input field", "output field", "error message", "notification",
    "API", "endpoint", "REST", "SOAP", "HTTP", "JSON", "XML", "request", "response", "database", "table", "column", "primary key", "foreign key", "transaction",
    "customer", "client", "user", "admin", "operator", "manager", "account","profile", "session", "transaction", "order", "invoice", "payment", "balance",
    "report", "dashboard", "summary", "log", "history", "audit", "change request", "version", "revision", "baseline", "configuration","change control", "impact analysis", "traceability", "requirements traceability matrix",
    "shall", "must", "should", "will", "required", "requirement", "specification", "functional", "non-functional", "constraint", "feature", "module",
    "system", "user", "admin", "client", "server", "database", "interface", "application", "component", "platform", "service", "api", "software",
    "data", "input", "output", "process", "store", "retrieve", "access", "login", "authenticate", "verify", "validate", "generate", "report","security", "performance", "reliability", "availability", "latency",
        "response", "time", "speed", "scale", "maintain", "secure"]

    # --- 1. CHECK PROGRESS (ROBUST VERSION) ---
    processed_ids = set()
    file_exists = os.path.exists(PARALLEL_CORPUS_CSV)

    if file_exists:
        try:
            with open(PARALLEL_CORPUS_CSV, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                # Check if the file is empty or just has a header
                try:
                    first_row = next(reader) # Try reading first row
                    processed_ids.add(first_row['doc_id'])
                    # Read the rest
                    for row in reader:
                        if 'doc_id' in row:
                            processed_ids.add(row['doc_id'])
                except StopIteration:
                    print(" CSV exists but is empty. Will write header.")
                    file_exists = False # Treat as new file
                except KeyError:
                    print(" CSV header mismatch. Starting fresh to avoid corruption.")
                    file_exists = False

            print(f" Resuming... Found {len(processed_ids)} documents already processed.")

        except Exception as e:
            print(f" Error reading progress ({e}). Backup your CSV if needed!")
            # Don't force restart immediately; check if we have IDs loaded
            if not processed_ids:
                print("   Starting fresh.")
                file_exists = False

    # --- 2. SYNTHETIC DATA (Only if not already done) ---
    if "SYNTHETIC_IEEE" not in processed_ids:
        print("Generating IEEE Templates...")
        templates = generate_ieee_templates()
        synth_rows = []
        for t in templates:
            synth_rows.append({
                "doc_id": "SYNTHETIC_IEEE", "src_en": "SYNTHETIC", "tgt_ha": normalize_hausa_text(t['text']),
                "bio_tags": t['tags'], "type": t['type']
            })

        # Save Synthetic Immediately
        keys = synth_rows[0].keys()
        with open(PARALLEL_CORPUS_CSV, 'a' if file_exists else 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            if not file_exists: writer.writeheader()
            writer.writerows(synth_rows)
        print(" Synthetic data saved.")
        file_exists = True # Ensure subsequent writes append
    else:
        print(" Skipping Synthetic Data (Already done).")

    # --- 3. PROCESS REAL FILES ---
    if not os.path.exists(BASE_INPUT_DIR):
        print(f" Input Directory not found: {BASE_INPUT_DIR}")
        return

    all_files = [f for f in os.listdir(BASE_INPUT_DIR) if f.endswith(('.pdf', '.docx'))]
    # Filter out files we have already processed
    files_to_do = [f for f in all_files if f not in processed_ids]

    print(f"Processing {len(files_to_do)} new files (Skipped {len(all_files) - len(files_to_do)})...")

    for filename in files_to_do:
        file_rows = [] # Store rows just for this file

        try:
            raw_chunks = extract_text_from_file(os.path.join(BASE_INPUT_DIR, filename))

            # Step A: Clean
            clean_chunks = []
            for c in raw_chunks:
                cleaned = generic_cleaning(c)
                if len(cleaned) > 20:
                    clean_chunks.append(cleaned)

            if not clean_chunks:
                # Mark as processed even if empty, so we don't retry it forever
                with open(PARALLEL_CORPUS_CSV, 'a', newline='', encoding='utf-8') as f:
                     # Just writing nothing ensures we don't break csv,
                     # but we can't add to processed_ids easily without a row.
                     # We'll just skip for now.
                     pass
                continue

            # Step B: Filter
            # Keyword Filtering (Replaces Domain Classifier)
            '''dom_results = domain_classifier(clean_chunks, candidate_labels=["Technical Documentation", "General Text","Software Requirements Specification",
            "SRS", "Functional Requirements","Non-Functional Requirements", "Technical Specification", "System Requirements", "Use Case Specification", "Business Requirements",
            "Product Requirements Document","User Story Document", "Requirements Traceability Matrix",
            "IEEE 830 SRS", "Requirements Engineering Document","Marketing Content", "Blog Post","News Article", "Email Correspondence","Meeting Minutes",
            "Project Status Report","User Manual","API Documentation","Code Comments","Research Paper","Technical Blog", "Product Description","Legal Document",
            "Financial Report"], batch_size=32, truncation=True)'''
            srs_chunks = []
            for text in clean_chunks:
                # Check if ANY keyword is present in the text
                if any(k in text.lower() for k in RESCUE_KEYWORDS):
                    srs_chunks.append(text)
            if not srs_chunks: continue

            # Step C: Language Check & Translation
            # (We keep Language Check to avoid processing accidentally captured code or non-English text)
            lang_results = lang_classifier(srs_chunks, batch_size=32, truncation=True)
            final_en_chunks = [srs_chunks[i] for i, res in enumerate(lang_results) if res['label'] == 'en']

            # Step C: Translate
            api_batch_size = 8
            for i in range(0, len(final_en_chunks), api_batch_size):
                batch_texts = final_en_chunks[i : i + api_batch_size]
                anchored_batch = [tag_technical_anchors(txt) for txt in batch_texts]
                ha_trans = google_translate(anchored_batch, "en", "ha")

                for src_anchored, ha in zip(anchored_batch, ha_trans):
                    if not ha: continue
                    src_clean = src_anchored.replace("<TECH>", "").replace("</TECH>", "")
                    ha_norm = normalize_hausa_text(ha)
                    silver_tags = heuristic_bio_tagger(ha_norm)

                    file_rows.append({
                        "doc_id": filename,
                        "src_en": src_clean,
                        "tgt_ha": ha_norm,
                        "bio_tags": silver_tags,
                        "type": "translated_anchored"
                    })
            # --- SAVE IMMEDIATELY & SYNC (ROBUST VERSION) ---
            if file_rows:
                with open(PARALLEL_CORPUS_CSV, 'a', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=file_rows[0].keys())
                    writer.writerows(file_rows)

                    # FORCE WRITE TO DISK (Crucial for Colab/Drive)
                    f.flush()
                    os.fsync(f.fileno())

                print(f"   Saved {len(file_rows)} rows from {filename}")

        except Exception as e:
            print(f" Error on {filename}: {e}")
            # Continue to next file so one error doesn't stop the whole script
            continue

    print("\n Batch Processing Complete.")

if __name__ == "__main__":
    run_stage_1_and_2()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
 Loaded Glossary

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Generating IEEE Templates...
 Synthetic data saved.
Processing 309 new files (Skipped 0)...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


   Saved 13 rows from Pharmacy Management System.pdf
   Saved 13 rows from requirements.pdf
   Saved 93 rows from CS 250 - Group 10 - Software Requirements.docx
   Saved 93 rows from CS 250 - Group 10 - Software Requirements_1.docx
   Saved 120 rows from SRS group 8.docx
   Saved 120 rows from SRS group 8_1.docx
   Saved 25 rows from TICKET RESERVATION.PDF.docx
   Saved 25 rows from TICKET RESERVATION.PDF_1.docx
   Saved 125 rows from ORMS_AWP_SRS[2].docx
   Saved 125 rows from ORMS_AWP_SRS[2]_1.docx
   Saved 219 rows from SRS DOCUMENTATION v3.0 OF SOCCER LIVE_1.docx
   Saved 219 rows from SRS DOCUMENTATION v3.0 OF SOCCER LIVE.docx
   Saved 150 rows from SRS Doc-Home Loan(2).docx
   Saved 150 rows from SRS Doc-Home Loan(2)_1.docx
   Saved 131 rows from Software Specification Requirments - v 1.2_1.docx
   Saved 131 rows from Software Specification Requirments - v 1.2.docx
   Saved 37 rows from Business_SRS_v2_1.docx
   Saved 37 rows from Business_SRS_v2.docx
   Saved 136 rows from [SP1]



   Saved 20 rows from SDP PROJECT PPT 1.pdf




   Saved 45 rows from SE Diva Drapes SRS.pdf
   Saved 34 rows from Software Requirement and Design Specification for Bank Management System.pdf
   Saved 30 rows from Software Requirements and Design Document.pdf
   Saved 47 rows from Software Requirements Specification Fall 2019.pdf
   Saved 20 rows from SoftwareRequirementSpecification.pdf
   Saved 123 rows from SportsFacilityBooking.pdf
   Saved 29 rows from SRS (1).pdf
   Saved 34 rows from SRS (3).pdf
   Saved 12 rows from SRS (7).pdf
   Saved 22 rows from SRS (10).pdf
   Saved 79 rows from SRS document (1).pdf




   Saved 49 rows from SRS Document for Home Store Project.pdf
   Saved 34 rows from SRS Document For Practo11702545.pdf
   Saved 34 rows from SRS document of BrainBoost (CLMS).pdf
   Saved 17 rows from SRS Documnet for Learn&Joy.pdf
   Saved 26 rows from SRS E-Learning Platform by FA21-BSE-172, 57.pdf




   Saved 221 rows from SRS Final.pdf
   Saved 15 rows from SRS REPORT.pdf
   Saved 23 rows from SRS.1.3.pdf
   Saved 14 rows from SRS_OPP.pdf
   Saved 130 rows from SRS_ZenithZone (Final).pdf
   Saved 48 rows from SRS-BECS-2007 (1).pdf
   Saved 48 rows from SRS-BECS-2007.pdf
   Saved 30 rows from srs-document.pdf
   Saved 20 rows from SRS-for-CAMPUS-RECRUITMENT-SYSTEM.pdf
   Saved 45 rows from TutorFlowSRSDocument.pdf
   Saved 17 rows from UNIVERSITY MANAGEMENT SYPSTEM  .pdf
   Saved 17 rows from (Fatima Nasir Awan 212015 - Samiya Imtiaz 211322)SRS document.pdf
   Saved 1 rows from 1. SRS_Registration_&_Login.pdf
   Saved 10 rows from 19MIS0138_VL2019205004856_DA03.pdf
   Saved 86 rows from 20F-33-SRS.pdf
Error: No /Root object! - Is this really a PDF?
   Saved 18 rows from movies4u_srs.pdf
   Saved 18 rows from Kuliah_SRS Example.pdf
   Saved 1 rows from SEISDL_SRS-Cover.pdf
   Saved 11 rows from IITK_Resources_Semester_4_SRSTemplate-CS253.pdf
Error: No /Root object! - Is this really 



   Saved 768 rows from Courses_SRS Book.pdf
Error: No /Root object! - Is this really a PDF?
Error: No /Root object! - Is this really a PDF?
   Saved 25 rows from SEISDL_SRS_Document.pdf
Error: No /Root object! - Is this really a PDF?
   Saved 26 rows from IITK_Resources_Semester_4_SRSv1.0.pdf
   Saved 5 rows from IITK_Resources_Semester_4_SRS_DOC_13.pdf
   Saved 58 rows from random-pdf_Reporting Standard SRS 101.0 Definitions Clean_0.pdf
   Saved 33 rows from C_Programming_AUTOSAR_SRS_LIN.pdf




   Saved 23 rows from IITK_Resources_Semester_4_srs.pdf




   Saved 94 rows from ticket-agent_Sony_SRS-XG300.pdf
   Saved 2 rows from GunnMaths_C4D2 Practice with SRS 2223.docx.pdf
Error: No /Root object! - Is this really a PDF?
Error: No /Root object! - Is this really a PDF?
   Saved 24 rows from IITK_Resources_Semester_4_SRS-PedalPal-CS253.pdf
Error: No /Root object! - Is this really a PDF?
   Saved 5 rows from random-pdf_20250917-laap-response-to-dbt-srs-cp1-public_1.pdf
Error: No /Root object! - Is this really a PDF?
Error: No /Root object! - Is this really a PDF?
Error: No /Root object! - Is this really a PDF?
   Saved 78 rows from Movie-Recommendation-System-based-on-User-Preferences_SRS_MRS.docx
Error: No /Root object! - Is this really a PDF?
Error: Package not found at '/content/drive/MyDrive/Thesis/my_srs_docs/Group-G_SRS_3_6,7.docx'
Error: No /Root object! - Is this really a PDF?
Error: Package not found at '/content/drive/MyDrive/Thesis/my_srs_docs/Group2-repo-projects_Selenium SRS.docx'
Error: No /Root object! - Is this really a PD

**Script 2: Stage 3 – Silver Corpus Finalisation & Task Datasets**
Description: This script functions as the Data Preparation Layer. It takes the raw, parallel corpus generated in Script 1 and transforms it into the specific file formats required for the three distinct AI models (Translation, Classification, and NER). It ensures that no data leakage occurs between training and testing sets.

Key Technical Functions:

Robust Data Loading: Implements error-handling logic to read the corpus CSV, fixing common issues like missing headers or duplicate rows caused by interrupted runs.

Stratified Splitting: Divides the dataset into Training (70%), Validation (10%), and Testing (20%) sets, ensuring a consistent distribution of data across all tasks.

Multi-Task Formatting: Translation: Exports .jsonl files containing {en, ha} pairs for the NLLB Seq2Seq model.

Classification: Auto-labels rows as FR (Functional) or NFR (Non-Functional) based on glossary keywords and exports them to .csv for the Fusion Classifier.

NER: Aligns tokens with their BIO tags and exports to .jsonl for Named Entity Recognition training.

MLM: Extracts raw Hausa text into a .txt file for domain-adaptive pre-training (Masked Language Modeling).

In [None]:
# STAGE 3: DATASET SPLITTING

import os
import csv
import json
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

BASE_DIR = "/content/drive/MyDrive/Thesis"
INPUT_CORPUS = os.path.join(BASE_DIR, "my_ha_srs_corpus/hausa_srs_parallel_silver.csv")
OUTPUT_DIR = os.path.join(BASE_DIR, "final_datasets")
GLOSSARY_PATH = os.path.join(BASE_DIR, "srs_glossary.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# 1. LOAD GLOSSARY
def get_glossary_keywords():
    nfr_words = []
    fr_words = []
    if os.path.exists(GLOSSARY_PATH):
        with open(GLOSSARY_PATH, 'r', encoding='utf-8') as f:
            data = json.load(f)
            if "quality_attributes" in data:
                nfr_words.extend([v.lower() for v in data["quality_attributes"].values()])
            if "constraints" in data:
                nfr_words.extend([v.lower() for v in data["constraints"].values()])
            if "modality" in data:
                fr_words.extend([v["hausa"].lower() for v in data["modality"].values()])
            if "functional_actions" in data:
                fr_words.extend([v.lower() for v in data["functional_actions"].values()])
    nfr_words += ['tsaro', 'sauri', 'dakika', 'kashi', 'inganci']
    fr_words += ['zai iya', 'dole ne', 'ya kamata', 'adana', 'turo']
    return set(nfr_words), set(fr_words)

NFR_KEYWORDS, FR_KEYWORDS = get_glossary_keywords()
print(f"Loaded {len(NFR_KEYWORDS)} NFR keywords and {len(FR_KEYWORDS)} FR keywords.")

# 2. ROBUST CSV LOADING
if not os.path.exists(INPUT_CORPUS):
    print(" Error: Stage 1 CSV not found.")
    exit()

print(f"Loading corpus from: {INPUT_CORPUS}")

# 1. Try loading normally
df = pd.read_csv(INPUT_CORPUS)

# 2. Check if headers are missing (The Error Fix)
expected_cols = ['src_en', 'tgt_ha']
if not all(col in df.columns for col in expected_cols):
    print(" Warning: Column headers missing. Reloading with manual headers...")
    # Reload with explicit column names
    df = pd.read_csv(INPUT_CORPUS, names=['doc_id', 'src_en', 'tgt_ha', 'bio_tags', 'type'])

# 3. Clean Garbage (Remove rows that are accidentally repeated headers)
df = df[df['src_en'] != 'src_en']

# 4. Standard Cleaning
df.dropna(subset=['src_en', 'tgt_ha'], inplace=True)
df.drop_duplicates(subset=['tgt_ha'], inplace=True)

print(f" Success! Loaded {len(df)} unique samples.")
print(f" Columns: {list(df.columns)}")

# 3. SPLIT LOGIC
train_val_df, test_df = train_test_split(df, test_size=0.20, random_state=RANDOM_SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.125, random_state=RANDOM_SEED)

# 4. GENERATE FILES
def save_translation_jsonl(dataframe, filename):
    path = os.path.join(OUTPUT_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        for _, row in dataframe.iterrows():
            entry = {"translation": {"en": str(row['src_en']).strip(), "ha": str(row['tgt_ha']).strip()}}
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

save_translation_jsonl(train_df, "translation_train.jsonl")
save_translation_jsonl(val_df, "translation_val.jsonl")
save_translation_jsonl(test_df, "translation_test.jsonl")

# --- CLASSIFICATION ---
def heuristic_label(row):
    if row['type'] in ['FR', 'NFR']: return row['type']
    text = str(row['tgt_ha']).lower()
    for k in NFR_KEYWORDS:
        if k in text: return "NFR"
    for k in FR_KEYWORDS:
        if k in text: return "FR"
    return "FR"

# Fix SettingWithCopyWarning by using .copy()
train_df = train_df.copy()
val_df = val_df.copy()
test_df = test_df.copy()

for dset in [train_df, val_df, test_df]:
    dset['label'] = dset.apply(heuristic_label, axis=1)

train_df[['tgt_ha', 'label']].to_csv(os.path.join(OUTPUT_DIR, "class_train.csv"), index=False)
val_df[['tgt_ha', 'label']].to_csv(os.path.join(OUTPUT_DIR, "class_val.csv"), index=False)
test_df[['tgt_ha', 'label']].to_csv(os.path.join(OUTPUT_DIR, "class_test.csv"), index=False)

# --- NER/BIO ---
def format_ner_data(dataframe, filename):
    path = os.path.join(OUTPUT_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        for _, row in dataframe.iterrows():
            text = str(row['tgt_ha'])
            tokens = text.split()

            if pd.isna(row['bio_tags']) or str(row['bio_tags']).strip() == "":
                tags = ["O"] * len(tokens)
            else:
                tags = str(row['bio_tags']).split()

            min_len = min(len(tokens), len(tags))
            if min_len > 0:
                entry = {"tokens": tokens[:min_len], "ner_tags": tags[:min_len]}
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")

format_ner_data(train_df, "ner_train.jsonl")
format_ner_data(val_df, "ner_val.jsonl")
format_ner_data(test_df, "ner_test.jsonl")

# --- MLM ---
mlm_path = os.path.join(OUTPUT_DIR, "mlm_hausa_srs.txt")
with open(mlm_path, 'w', encoding='utf-8') as f:
    for text in pd.concat([train_df, val_df])['tgt_ha']:
        text = str(text)
        if len(text.split()) > 3:
            f.write(text.strip() + "\n")

print(f"\n STAGE 3 COMPLETE. Datasets saved to: {OUTPUT_DIR}")

Mounted at /content/drive
Loaded 20 NFR keywords and 17 FR keywords.
Loading corpus from: /content/drive/MyDrive/Thesis/my_ha_srs_corpus/hausa_srs_parallel_silver.csv
 Success! Loaded 25089 unique samples.
 Columns: ['doc_id', 'src_en', 'tgt_ha', 'bio_tags', 'type']

 STAGE 3 COMPLETE. Datasets saved to: /content/drive/MyDrive/Thesis/final_datasets


**Script 3**: Expert Annotation Kit & Quality Filtering
Description: This script functions as the Human-in-the-Loop Validation and Data Quality Assurance layer. It bridges the gap between machine generation and human expertise by creating tools for manual review, while also implementing an automated filter to remove low-quality data.

Key Technical Functions:

BIO-to-Visual Conversion: Transforms abstract NER tags (e.g., B-ACTOR, I-ACTOR) into human-readable bracket notation (e.g., [ACTOR] Mai amfani), making it easier for human linguists to spot errors.

Automated Excel Generation: Uses xlsxwriter to dynamically generate an interactive Excel spreadsheet. It includes separate sheets for Classification and NER, complete with Data Validation (dropdown menus) to restrict human input to valid labels (FR, NFR, Ambiguous).

Semantic Quality Filtering: Implements the LaBSE (Language-Agnostic BERT Sentence Embedding) model to audit the dataset. It calculates the cosine similarity between every English-Hausa pair and automatically discards pairs with a semantic alignment score below 0.65, ensuring the model is not trained on "noisy" or mismatched translations.

In [None]:
!pip install -q xlsxwriter

import pandas as pd
import json
import os

# CONFIGURATION
BASE_DIR = "/content/drive/MyDrive/Thesis/final_datasets"
OUTPUT_FILE = os.path.join(BASE_DIR, "Expert_Annotation_Kit_v1.xlsx")

# Load your Test Data
class_df = pd.read_csv(os.path.join(BASE_DIR, "class_test.csv"))
ner_data = []
with open(os.path.join(BASE_DIR, "ner_test.jsonl"), 'r') as f:
    for line in f:
        ner_data.append(json.loads(line))

# HELPER: BIO TO BRACKETS

def bio_to_brackets(tokens, tags):
    """
    Converts: ["Mai", "amfani"], ["B-ACTOR", "I-ACTOR"]
    To: "[ACTOR] Mai amfani"
    This is much easier for humans to correct in Excel.
    """
    output = []
    current_entity = []
    current_label = None

    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            # If we were building an entity, flush it
            if current_entity:
                output.append(f"[{current_label}] {' '.join(current_entity)}")
                current_entity = []
            current_label = tag[2:] # Remove B-
            current_entity.append(token)
        elif tag.startswith("I-") and current_label == tag[2:]:
            current_entity.append(token)
        else:
            # O tag or broken sequence
            if current_entity:
                output.append(f"[{current_label}] {' '.join(current_entity)}")
                current_entity = []
                current_label = None
            output.append(token)

    # Flush remaining
    if current_entity:
        output.append(f"[{current_label}] {' '.join(current_entity)}")

    return " ".join(output)

# Prepare NER DataFrame
ner_rows = []
for i, entry in enumerate(ner_data):
    readable = bio_to_brackets(entry['tokens'], entry['ner_tags'])
    ner_rows.append({
        "ID": i+1,
        "Original_Hausa": " ".join(entry['tokens']),
        "Model_Prediction": readable,
        "Expert_Correction": readable # Pre-fill for easy editing
    })
ner_df = pd.DataFrame(ner_rows)

# EXCEL GENERATION (XlsxWriter)

writer = pd.ExcelWriter(OUTPUT_FILE, engine='xlsxwriter')
workbook = writer.book

# --- SHEET 1: FR/NFR CLASSIFICATION ---
class_df['Expert_Label'] = class_df['label'] # Pre-fill
class_df = class_df[['tgt_ha', 'label', 'Expert_Label']]
class_df.columns = ['Hausa Requirement', 'AI Predicted Label', 'Expert Correction']
class_df.to_excel(writer, sheet_name='Task 1 - Classification', index=False)

worksheet1 = writer.sheets['Task 1 - Classification']
# Add Dropdown for Column C (Expert Correction)
worksheet1.data_validation('C2:C1000', {
    'validate': 'list',
    'source': ['FR', 'NFR', 'Ambiguous', 'Not a Requirement'],
    'input_title': 'Select Label',
    'input_message': 'Is this Functional (FR) or Non-Functional (NFR)?'
})

# Formatting
header_fmt = workbook.add_format({'bold': True, 'bg_color': '#D7E4BC', 'border': 1})
worksheet1.set_column('A:A', 60) # Wide column for text
worksheet1.set_column('B:C', 15)
worksheet1.set_row(0, 20, header_fmt)

# --- SHEET 2: NER ENTITY REVIEW ---
ner_df.to_excel(writer, sheet_name='Task 2 - Entity Tagging', index=False)

worksheet2 = writer.sheets['Task 2 - Entity Tagging']
worksheet2.set_column('B:B', 50) # Original Text
worksheet2.set_column('C:C', 50) # Prediction (Visual)
worksheet2.set_column('D:D', 50) # Correction Space
worksheet2.set_row(0, 20, header_fmt)

# Add instructional text
worksheet2.write_comment('D1', 'Edit the text in this column. Ensure entities are wrapped in brackets like [ACTOR] User.')

writer.close()
print(f" Expert Package created: {OUTPUT_FILE}")

'''The Problem: Your dataset likely contains "noisy" pairs where the English and Hausa don't quite
match perfectly. Bad data hurts the model more than small data. The Fix: Use a lightweight model (LaBSE)
 to score the similarity of your sentence pairs before training.
Throw away anything below a certain score (e.g., 60%).'''

from sentence_transformers import SentenceTransformer, util

# Load a model designed for alignment
model = SentenceTransformer('sentence-transformers/LaBSE')

def filter_dataset(dataset, threshold=0.65):
    clean_data = []
    print(f" Filtering {len(dataset)} pairs...")

    # Batch processing is faster, but simple loop for clarity:
    for item in dataset:
        eng = item['en']
        hau = item['ha']

        # Calculate similarity
        embeddings = model.encode([eng, hau])
        score = util.cos_sim(embeddings[0], embeddings[1]).item()

        if score >= threshold:
            clean_data.append(item)

    print(f" Kept {len(clean_data)} pairs (Removed {len(dataset) - len(clean_data)})")
    return clean_data

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h Expert Package created: /content/drive/MyDrive/Thesis/final_datasets/Expert_Annotation_Kit_v1.xlsx


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

**Script 5: NLLB Translation Model Training (NMT + Glossary Injection) Stage 5**
Description: This script implements Objective 2 (Translation). It fine-tunes the NLLB-200 (No Language Left Behind) model specifically for the Hausa-to-English direction. Unlike standard translation, this script injects your specific software engineering glossary directly into the training loop to force the model to learn that "tsaro" means "Security" (technical) and not just "protection" (general).

Key Technical Functions:

Low-Rank Adaptation (LoRA): Instead of retraining the entire 600M parameter model (which would crash Colab), it attaches small, trainable adapter matrices to the attention layers (q_proj, v_proj), making training 90% more memory-efficient.

Glossary Oversampling: It loads your srs_glossary.json, converts term pairs (e.g., {"ha": "saba", "en": "server"}) into synthetic training sentences, and oversamples them. This creates a "dictionary bias" in the neural network.

Gradient Accumulation: Solves the GPU memory bottleneck by processing small mini-batches (4 samples) but accumulating their mathematical gradients to simulate a larger, stable batch size (16 samples) before updating the model weights.

Directional Fine-Tuning: Explicitly sets the source language to hau_Latn and target to eng_Latn, optimizing the cross-entropy loss function specifically for translating technical Hausa into English.

In [None]:
# STAGE 5: NLLB-200 TRANSLATION + GLOSSARY INJECTION
# Training Direction: HAUSA (Input) -> ENGLISH (Target)

!pip install -q transformers peft datasets sacrebleu accelerate bitsandbytes

import os
import json
import torch
import gc
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType
from google.colab import drive

# --- 1. MEMORY CLEANUP ---
torch.cuda.empty_cache()
gc.collect()

# --- CONFIGURATION ---
drive.mount("/content/drive", force_remount=True)
BASE_DIR = "/content/drive/MyDrive/Thesis"
DATA_DIR = os.path.join(BASE_DIR, "final_datasets")
OUTPUT_DIR = os.path.join(BASE_DIR, "models/nllb_lora_srs")
GLOSSARY_PATH = "/content/drive/MyDrive/Thesis/srs_glossary.json"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_ID = "facebook/nllb-200-distilled-600M"

# --- THE CRITICAL FIX: FLIPPED LANGUAGES ---
SRC_LANG = "hau_Latn"  # Source is HAUSA
TGT_LANG = "eng_Latn"  # Target is ENGLISH

def inject_glossary_terms(glossary_path, num_repeats=5):
    """
    Loads glossary and converts it into training samples.
    """
    if not os.path.exists(glossary_path):
        print(f" Glossary not found at {glossary_path}. Skipping injection.")
        return None

    print(f" Loading Glossary from {glossary_path}...")
    with open(glossary_path, 'r') as f:
        glossary = json.load(f)

    new_data = []
    for item in glossary:
        # Check glossary structure (some are list of dicts, some dict of dicts)
        # We handle the standard list format here: [{"en": "server", "ha": "saba"}, ...]
        if "en" in item and "ha" in item:
            # FIX: Training sample must match HAUSA -> ENGLISH direction
            sample = {"translation": {"ha": item["ha"], "en": item["en"]}}
            for _ in range(num_repeats):
                new_data.append(sample)

    print(f" Created {len(new_data)} glossary samples (Direction: Hausa->English)")
    return Dataset.from_list(new_data)

def train_translation_model():
    print(f"\n>>> LOADING MODEL: {MODEL_ID}")

    # 1. Load Tokenizer & Model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, src_lang=SRC_LANG, tgt_lang=TGT_LANG)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

    # 2. Apply LoRA
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj", "k_proj", "out_proj"]
    )
    model = get_peft_model(model, peft_config)
    print(f">>> LoRA APPLIED. Trainable Parameters:")
    model.print_trainable_parameters()

    model.enable_input_require_grads()
    model.config.use_cache = False

    # 3. Load Datasets
    data_files = {
        "train": os.path.join(DATA_DIR, "translation_train.jsonl"),
        "validation": os.path.join(DATA_DIR, "translation_val.jsonl")
    }
    dataset = load_dataset("json", data_files=data_files)
    train_dataset = dataset["train"]

    # 4. INJECT GLOSSARY
    glossary_dataset = inject_glossary_terms(GLOSSARY_PATH, num_repeats=5)

    if glossary_dataset:
        print(">>> MERGING GLOSSARY INTO TRAINING DATA...")
        train_dataset = concatenate_datasets([train_dataset, glossary_dataset])
        train_dataset = train_dataset.shuffle(seed=42)
        print(f">>> New Training Size: {len(train_dataset)} examples")

    # 5. Preprocessing (THE FIX IS HERE TOO)
    def preprocess_function(examples):
        # We want to translate HAUSA -> ENGLISH
        inputs = [ex["ha"] for ex in examples["translation"]] # Input: Hausa
        targets = [ex["en"] for ex in examples["translation"]] # Target: English

        model_inputs = tokenizer(inputs, max_length=128, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print(">>> TOKENIZING DATASETS...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = dataset["validation"].map(preprocess_function, batched=True)

    # 6. Training Arguments
    args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-4,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        gradient_checkpointing=False,
        num_train_epochs=5,
        weight_decay=0.01,
        save_total_limit=2,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
        load_best_model_at_end=True,
        dataloader_num_workers=2
    )

    # 7. Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    )

    # 8. Train & Save
    print("\n>>> STARTING TRAINING (HAUSA -> ENGLISH)...")
    trainer.train()

    print(f"\n>>> SAVING ADAPTER TO {OUTPUT_DIR}")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(" Stage 5 Complete: Translation Model Ready (Hausa->English).")

if __name__ == "__main__":
    train_translation_model()
'''Understanding Gradient Accumulation is a great "Aha!" moment in deep learning because it lets you train massive
models on small GPUs.The Problem: The "Big Batch" BottleneckTo train a model effectively, we usually want to show
it a "Batch" of 16 or 32 sentences at once. This gives the model a stable average of errors to learn from.Scenario:
You want to train with a Batch Size of 16.The Issue: Your GPU memory (RAM) is a physical box. Fitting 16 sentences
 + the model + the calculations all at once might be too big for the box.Result: CUDA Out Of Memory
(The box explodes).The Solution: Gradient AccumulationInstead of trying to stuff all 16 sentences into the box
at once, we cheat. We cut the batch into smaller pieces (mini-batches) and process them one by one, but we wait
to update the model until we've seen all of them.Think of it like Grocery Shopping:Method A (Standard Training -
Crashes your GPU):You try to carry 16 watermelons to the checkout counter all at once.Result: You drop them.
You can't carry that much weight.Method B (Gradient Accumulation - Saves your GPU):You carry 4 watermelons to
the counter. You put them down.You go back and get 4 more. You put them down.You do this 4 times total
($4 \times 4 = 16$).Crucial Step: You do not pay (update the model weights) after every trip. You wait until
all 16 are on the counter, then you pay for everything at once.How it works mathematicallyForward Pass:
The model processes 4 sentences. It calculates the error (Gradient).Accumulate: Instead of applying that error
to change the brain immediately, it just adds that error to a temporary "pile" of errors.Repeat: It does this 4
times. RAM usage stays low because it clears the previous 4 sentences from memory before loading the next 4.Step:
Once the "pile" represents 16 sentences worth of error, the model updates its weights once.Why this fixed your
errorIn your script, I changed:per_device_train_batch_size = 4 (Carry only 4 watermelons at a time -> Low RAM usage)
gradient_accumulation_steps = 4 (Do 4 trips before paying)Effective Batch Size: $4 \times 4 = 16$.You get the
mathematical stability of a large batch (16) with the low memory footprint of a small batch (4).
The error RuntimeError: element 0 of tensors does not require grad is happening because of a conflict between Gradient Checkpointing and LoRA.

The Problem
LoRA freezes the main model (to save memory), so the input embeddings don't "require gradients."

Gradient Checkpointing (which you enabled to save memory) needs the inputs to require gradients so it can "re-play" the forward pass during training.

Result: The chain is broken. The model tries to learn, but the connection to the inputs is cut off.'''

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive

>>> LOADING MODEL: facebook/nllb-200-distilled-600M


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

>>> LoRA APPLIED. Trainable Parameters:
trainable params: 4,718,592 || all params: 619,792,384 || trainable%: 0.7613


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

 Loading Glossary from /content/drive/MyDrive/Thesis/srs_glossary.json...
 Created 0 glossary samples (Direction: Hausa->English)
>>> TOKENIZING DATASETS...


Map:   0%|          | 0/17562 [00:00<?, ? examples/s]



Map:   0%|          | 0/2509 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(



>>> STARTING TRAINING (HAUSA -> ENGLISH)...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maibrahim883[0m ([33maibrahim883-nile-university-of-nigeria[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.1513,1.108679
2,1.0777,1.063812
3,1.0456,1.038067
4,1.067,1.026268
5,1.0527,1.021458



>>> SAVING ADAPTER TO /content/drive/MyDrive/Thesis/models/nllb_lora_srs
 Stage 5 Complete: Translation Model Ready (Hausa->English).


**Script 6**: Benchmark Model Training (Baseline)

This script establishes the Baseline Performance for Objective 1. It independently trains two standard industry models—mBERT and XLM-R—on your Classification and NER tasks. These serve as the "control group" to prove that your advanced Fusion Model (Script B) actually provides an improvement over standard techniques.

Key Technical Functions:

Multi-Task Architecture: It attaches two distinct prediction heads to a single transformer backbone:

NER Head: A linear layer for token-level prediction (identifying actors/entities).

Classification Head: A linear layer for sentence-level prediction (FR vs. NFR).

Memory-Safe Optimization: Implements Gradient Accumulation (simulating large batches by adding up gradients from small steps) and LoRA to allow training two massive models back-to-back on a single Colab GPU without crashing.

Custom Data Alignment: The SRSDataset class manages the complex logic of aligning token-level labels (for NER) with document-level labels (for Classification) in a single training loop.

In [None]:
# SCRIPT 6: BENCHMARK MODEL (MEMORY SAFE + METRICS)
# Independent mBERT & XLM-R Multi-Task Learning with EVALUATION

!pip install -q transformers datasets peft accelerate scikit-learn seqeval

import os
import json
import torch
import torch.nn as nn
import gc
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from peft import get_peft_model, LoraConfig, TaskType
from google.colab import drive
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# --- 1. MEMORY CLEANUP ---
torch.cuda.empty_cache()
gc.collect()

# --- CONFIGURATION ---
drive.mount("/content/drive", force_remount=True)
BASE_DIR = "/content/drive/MyDrive/Thesis/final_datasets"
MODELS_TO_TEST = ["bert-base-multilingual-cased", "xlm-roberta-base"]
MAX_LEN = 128

# --- MEMORY FIXES ---
BATCH_SIZE = 4              # Reduced to prevent OOM
ACCUMULATION_STEPS = 4      # 4 * 4 = 16 Effective Batch Size
EPOCHS = 5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. DATASET CLASS ---
class SRSDataset(Dataset):
    def __init__(self, ner_path, class_path, tokenizer, label2id_ner, label2id_class):
        self.tokenizer = tokenizer
        self.label2id_ner = label2id_ner
        self.label2id_class = label2id_class

        # Load NER Data
        self.ner_data = []
        if os.path.exists(ner_path):
            with open(ner_path, 'r') as f:
                for line in f:
                    self.ner_data.append(json.loads(line))
        else:
            print(f" Warning: NER file not found at {ner_path}")

        # Load Classification Data
        if os.path.exists(class_path):
            self.class_df = pd.read_csv(class_path)
        else:
            print(f" Warning: Class file not found at {class_path}")
            self.class_df = pd.DataFrame(columns=['label'])

    def __len__(self):
        return len(self.ner_data)

    def __getitem__(self, idx):
        # NER Item
        ner_item = self.ner_data[idx]
        tokens = ner_item['tokens']
        # Handle tags that might be missing from map
        ner_tags = [self.label2id_ner.get(t, 0) for t in ner_item['ner_tags']]

        # Classification Item (Align by Index)
        class_label = self.class_df.iloc[idx]['label'] if not self.class_df.empty else "FR"
        class_id = self.label2id_class.get(class_label, 0)

        # Tokenization
        encoding = self.tokenizer(
            tokens, is_split_into_words=True,
            padding='max_length', truncation=True, max_length=MAX_LEN,
            return_tensors='pt'
        )

        word_ids = encoding.word_ids()
        aligned_ner_labels = []
        for word_id in word_ids:
            if word_id is None:
                aligned_ner_labels.append(-100)
            else:
                aligned_ner_labels.append(ner_tags[word_id] if word_id < len(ner_tags) else -100)

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'ner_labels': torch.tensor(aligned_ner_labels),
            'class_labels': torch.tensor(class_id, dtype=torch.long)
        }

# --- 3. MULTI-TASK MODEL CLASS ---
class MultiTaskBaseline(nn.Module):
    def __init__(self, model_name, num_ner_labels, num_class_labels):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.encoder.config.hidden_size

        # Task Heads
        self.ner_head = nn.Linear(self.hidden_size, num_ner_labels)
        self.class_head = nn.Linear(self.hidden_size, num_class_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        # Robust pooling
        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
             pooler_output = outputs.pooler_output
        else:
             pooler_output = sequence_output[:, 0]

        ner_logits = self.ner_head(sequence_output)
        class_logits = self.class_head(pooler_output)

        return ner_logits, class_logits

# --- 4. EVALUATION FUNCTION (Must be defined BEFORE training loop) ---
def evaluate_model(model, dataloader, device, label2id_ner, label2id_class):
    model.eval()

    # Storage for predictions
    all_class_preds, all_class_labels = [], []
    all_ner_preds, all_ner_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            ner_labels = batch['ner_labels'].to(device)
            class_labels = batch['class_labels'].to(device)

            ner_logits, class_logits = model(input_ids, mask)

            # 1. Process Classification
            class_preds = torch.argmax(class_logits, dim=1).cpu().numpy()
            all_class_preds.extend(class_preds)
            all_class_labels.extend(class_labels.cpu().numpy())

            # 2. Process NER
            ner_preds = torch.argmax(ner_logits, dim=2).cpu().numpy()
            ner_labels_np = ner_labels.cpu().numpy()

            for i in range(len(ner_labels_np)):
                for j in range(len(ner_labels_np[i])):
                    if ner_labels_np[i][j] != -100: # Ignore padding
                        all_ner_labels.append(ner_labels_np[i][j])
                        all_ner_preds.append(ner_preds[i][j])

    # --- METRICS ---
    # Classification
    acc = accuracy_score(all_class_labels, all_class_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_class_labels, all_class_preds, average='weighted', zero_division=0)

    print("\n" + "="*40)
    print(f" BENCHMARK RESULTS")
    print(f"CLASS | Accuracy: {acc:.4f} | F1: {f1:.4f}")

    # NER
    ner_acc = accuracy_score(all_ner_labels, all_ner_preds)
    ner_p, ner_r, ner_f1, _ = precision_recall_fscore_support(all_ner_labels, all_ner_preds, average='weighted', zero_division=0)
    print(f"NER   | Accuracy: {ner_acc:.4f} | F1: {ner_f1:.4f}")
    print("="*40 + "\n")

    return f1

# --- 5. TRAINING LOOP ---
def train_baseline(model_name):
    print(f"\n>>> TRAINING BASELINE: {model_name}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

    label2id_ner = {"O": 0, "B-ACTOR": 1, "I-ACTOR": 2, "B-SYS": 3, "I-SYS": 4, "B-DATA": 5, "I-DATA": 6, "B-QUAL": 7, "I-QUAL": 8, "B-CONS": 9, "I-CONS": 10}
    label2id_class = {"FR": 0, "NFR": 1}

    # Load Data
    train_ds = SRSDataset(f"{BASE_DIR}/ner_train.jsonl", f"{BASE_DIR}/class_train.csv", tokenizer, label2id_ner, label2id_class)
    val_ds = SRSDataset(f"{BASE_DIR}/ner_val.jsonl", f"{BASE_DIR}/class_val.csv", tokenizer, label2id_ner, label2id_class)

    if len(train_ds) == 0: return

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    model = MultiTaskBaseline(model_name, len(label2id_ner), len(label2id_class))

    # Apply LoRA
    peft_config = LoraConfig(inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.1, target_modules=["query", "value"])
    model.encoder = get_peft_model(model.encoder, peft_config)
    model.to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        optimizer.zero_grad()

        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            ner_labels = batch['ner_labels'].to(DEVICE)
            class_labels = batch['class_labels'].to(DEVICE)

            ner_logits, class_logits = model(input_ids, mask)

            loss_ner = nn.CrossEntropyLoss(ignore_index=-100)(ner_logits.view(-1, len(label2id_ner)), ner_labels.view(-1))
            loss_class = nn.CrossEntropyLoss()(class_logits, class_labels)

            loss = (loss_ner + loss_class) / ACCUMULATION_STEPS
            loss.backward()

            if (i + 1) % ACCUMULATION_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

            total_loss += loss.item() * ACCUMULATION_STEPS

        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

        # VALIDATE AFTER EACH EPOCH
        evaluate_model(model, val_loader, DEVICE, label2id_ner, label2id_class)
        model.train()

    # Save
    save_path = f"{BASE_DIR}/models/baseline_{model_name.replace('/', '_')}"
    os.makedirs(save_path, exist_ok=True)
    torch.save(model.state_dict(), f"{save_path}/model.pt")
    print(f" Saved to {save_path}")

# --- 6. EXECUTION ---
if __name__ == "__main__":
    for m in MODELS_TO_TEST:
        try:
            train_baseline(m)
            torch.cuda.empty_cache()
        except Exception as e:
            print(f" Error: {e}")

'''
The benefit of adding Memory Cleanup is simple: It prevents your script from crashing when you
switch from one huge AI model to another.In the script I provided, you are looping through two
different models (mBERT and XLM-R). Without cleanup, the first model stays in the GPU memory while
you try to load the second one. Since a Google Colab GPU (T4) typically has 15GB of memory and these
models take up 4–8GB each (plus data), trying to hold both at once triggers the CUDA Out Of Memory error.
'''

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Mounted at /content/drive

>>> TRAINING BASELINE: bert-base-multilingual-cased


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Epoch 1 Loss: 0.9645

 BENCHMARK RESULTS
CLASS | Accuracy: 0.7919 | F1: 0.7769
NER   | Accuracy: 0.9521 | F1: 0.9287

Epoch 2 Loss: 0.6991

 BENCHMARK RESULTS
CLASS | Accuracy: 0.8302 | F1: 0.8235
NER   | Accuracy: 0.9608 | F1: 0.9456

Epoch 3 Loss: 0.5254

 BENCHMARK RESULTS
CLASS | Accuracy: 0.8880 | F1: 0.8876
NER   | Accuracy: 0.9686 | F1: 0.9604

Epoch 4 Loss: 0.4280

 BENCHMARK RESULTS
CLASS | Accuracy: 0.9087 | F1: 0.9082
NER   | Accuracy: 0.9732 | F1: 0.9692

Epoch 5 Loss: 0.3696

 BENCHMARK RESULTS
CLASS | Accuracy: 0.9191 | F1: 0.9179
NER   | Accuracy: 0.9771 | F1: 0.9751

 Saved to /content/drive/MyDrive/Thesis/final_datasets/models/baseline_bert-base-multilingual-cased

>>> TRAINING BASELINE: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1 Loss: 0.9226

 BENCHMARK RESULTS
CLASS | Accuracy: 0.7923 | F1: 0.7772
NER   | Accuracy: 0.9646 | F1: 0.9471

Epoch 2 Loss: 0.6967

 BENCHMARK RESULTS
CLASS | Accuracy: 0.7935 | F1: 0.7744
NER   | Accuracy: 0.9699 | F1: 0.9598

Epoch 3 Loss: 0.6487

 BENCHMARK RESULTS
CLASS | Accuracy: 0.7991 | F1: 0.7832
NER   | Accuracy: 0.9768 | F1: 0.9713

Epoch 4 Loss: 0.5988

 BENCHMARK RESULTS
CLASS | Accuracy: 0.8274 | F1: 0.8194
NER   | Accuracy: 0.9795 | F1: 0.9764

Epoch 5 Loss: 0.5271

 BENCHMARK RESULTS
CLASS | Accuracy: 0.8701 | F1: 0.8671
NER   | Accuracy: 0.9816 | F1: 0.9790

 Saved to /content/drive/MyDrive/Thesis/final_datasets/models/baseline_xlm-roberta-base


'\nThe benefit of adding Memory Cleanup is simple: It prevents your script from crashing when you\nswitch from one huge AI model to another.In the script I provided, you are looping through two\ndifferent models (mBERT and XLM-R). Without cleanup, the first model stays in the GPU memory while\nyou try to load the second one. Since a Google Colab GPU (T4) typically has 15GB of memory and these\nmodels take up 4–8GB each (plus data), trying to hold both at once triggers the CUDA Out Of Memory error.\n'

**Script 7: The Advanced Multi-View Framework**

This is the core innovation of your research—the "Novel Framework" you promised in your proposal. Unlike the benchmark (Script 6) which trains models separately, this script implements a Multi-View Learning strategy. It treats mBERT and XLM-R as two different "eyes" looking at the same Hausa text. It uses a custom Attention Mechanism to fuse their insights dynamically, ensuring that if one model is confused about a word, the other can correct it.

Key Technical Functions:

Domain-Adaptive Pretraining (DAPT): Before learning to classify requirements, the models (mBERT & XLM-R) first "read" your raw Hausa text (mlm_hausa_srs.txt) using Masked Language Modeling (MLM). This creates a specialized "Hausa-Software" vocabulary, preventing the model from confusing technical terms like "Tsaro" (Security) with general terms like "Protection".

Dual-View Tokenization: The DualViewDataset class is a custom data loader that tokenizes every sentence twice simultaneously—once for mBERT and once for XLM-R. This allows the model to process two different linguistic perspectives of the exact same input in parallel.

Multiple Instance Learning (MIL) Fusion: This is the "brain" of your framework. The MILAttentionAggregator does not just average the two models; it calculates an Attention Score for each.

Scenario: If mBERT understands the grammar but misses the technical term, and XLM-R catches the technical term but misses the grammar, the MIL layer assigns higher weight to the "smarter" model for that specific token.

Consistency Regularization Loss: To stabilize training, this script adds a mathematical penalty called l_cons. It forces the individual outputs of mBERT and XLM-R to match the final "Fused" output.

Effect: This acts like a teacher (Fused Model) correcting the students (Individual Models), ensuring robust learning even with your small dataset.

In [None]:
# SCRIPT B: ADVANCED FRAMEWORK (FULL & RESOLVED)
# DAPT + Multi-View Fusion (MIL) + Consistency Loss
!pip install -q transformers datasets peft accelerate scikit-learn

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import gc
import json
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForMaskedLM,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, TextDataset
)
from peft import get_peft_model, LoraConfig
from sklearn.metrics import accuracy_score
from google.colab import drive

# --- 1. MEMORY CLEANUP & SETUP ---
torch.cuda.empty_cache()
gc.collect()

drive.mount("/content/drive", force_remount=True)

# --- CONFIGURATION ---
BASE_DIR = "/content/drive/MyDrive/Thesis"
DATA_DIR = os.path.join(BASE_DIR, "final_datasets")
MODELS_DIR = os.path.join(BASE_DIR, "models")
os.makedirs(MODELS_DIR, exist_ok=True)

DAPT_EPOCHS = 3
FUSION_EPOCHS = 5
BATCH_SIZE = 4       # Small batch for safety
ACCUM_STEPS = 4      # Effective Batch Size = 16
LAMBDA_CONSISTENCY = 0.5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f" Running on {DEVICE}")

# PHASE 1: DOMAIN ADAPTIVE PRETRAINING (DAPT)
def run_dapt(base_model_name, save_name):
    save_path = os.path.join(MODELS_DIR, f"dapt_{save_name}")

    # Skip if already exists
    if os.path.exists(save_path):
        print(f" DAPT model found at {save_path}, skipping training.")
        return save_path

    print(f"\n>>> RUNNING DAPT: {base_model_name}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    model = AutoModelForMaskedLM.from_pretrained(base_model_name)

    # Load MLM Data
    mlm_file = os.path.join(DATA_DIR, "mlm_hausa_srs.txt")
    if not os.path.exists(mlm_file):
        print(f" MLM file missing at {mlm_file}. Please ensure Stage 3 ran correctly.")
        return base_model_name

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=mlm_file,
        block_size=128
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir=f"./temp_dapt_{save_name}",
        overwrite_output_dir=True,
        num_train_epochs=DAPT_EPOCHS,
        per_device_train_batch_size=8,
        save_steps=1500,
        save_total_limit=1, # Keep only latest checkpoint to save space
        learning_rate=5e-5,
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f" DAPT Model saved to {save_path}")
    return save_path

# Run DAPT
mbert_path = run_dapt("bert-base-multilingual-cased", "mbert")
xlmr_path = run_dapt("xlm-roberta-base", "xlmr")

# PHASE 2: DUAL-VIEW DATASET

class DualViewDataset(Dataset):
    def __init__(self, ner_path, class_path, tok_m, tok_x, label2id_ner, label2id_class):
        self.tok_m = tok_m
        self.tok_x = tok_x
        self.label2id_ner = label2id_ner
        self.label2id_class = label2id_class

        self.ner_data = []
        if os.path.exists(ner_path):
            with open(ner_path, 'r') as f:
                for line in f: self.ner_data.append(json.loads(line))

        if os.path.exists(class_path):
            self.class_df = pd.read_csv(class_path)
        else:
            self.class_df = pd.DataFrame() # Handle missing file gracefully

    def __len__(self):
        return len(self.ner_data)

    def align_labels(self, tokenizer, tokens, ner_tags):
        encoding = tokenizer(tokens, is_split_into_words=True, padding='max_length',
                             truncation=True, max_length=128, return_tensors='pt')
        word_ids = encoding.word_ids()
        aligned = []
        for wid in word_ids:
            if wid is None: aligned.append(-100)
            else: aligned.append(ner_tags[wid] if wid < len(ner_tags) else -100)
        return encoding, torch.tensor(aligned)

    def __getitem__(self, idx):
        item = self.ner_data[idx]
        tokens = item['tokens']
        # Safe get for tags
        ner_tags = [self.label2id_ner.get(t, 0) for t in item.get('ner_tags', [])]

        # Safe get for class label
        if not self.class_df.empty and idx < len(self.class_df):
            class_label = self.class_df.iloc[idx]['label']
        else:
            class_label = "FR" # Default fallback

        class_id = self.label2id_class.get(class_label, 0)

        # Tokenize for BOTH views
        enc_m, labels_m = self.align_labels(self.tok_m, tokens, ner_tags)
        enc_x, labels_x = self.align_labels(self.tok_x, tokens, ner_tags)

        return {
            'ids_m': enc_m['input_ids'].squeeze(), 'mask_m': enc_m['attention_mask'].squeeze(),
            'ids_x': enc_x['input_ids'].squeeze(), 'mask_x': enc_x['attention_mask'].squeeze(),
            'ner_labels': labels_m,
            'class_labels': torch.tensor(class_id, dtype=torch.long)
        }

# PHASE 3: FUSION MODEL ARCHITECTURE
class MILAttentionAggregator(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

    def forward(self, view1, view2):
        # Stack views: (Batch, Seq, 2, Hidden)
        stacked = torch.stack([view1, view2], dim=2)
        scores = self.attn(stacked)
        weights = F.softmax(scores, dim=2)
        fused = torch.sum(stacked * weights, dim=2)
        return fused

class MultiViewFusionModel(nn.Module):
    def __init__(self, mbert_path, xlmr_path, num_ner, num_class):
        super().__init__()
        # Load Encoders
        self.mbert = AutoModel.from_pretrained(mbert_path)
        self.xlmr = AutoModel.from_pretrained(xlmr_path)

        self.hidden_dim = 768
        self.mil = MILAttentionAggregator(self.hidden_dim)

        self.ner_head = nn.Linear(self.hidden_dim, num_ner)
        self.class_head = nn.Linear(self.hidden_dim, num_class)

    def forward(self, ids_m, mask_m, ids_x, mask_x):
        # 1. Get Representations
        out_m = self.mbert(ids_m, attention_mask=mask_m).last_hidden_state
        out_x = self.xlmr(ids_x, attention_mask=mask_x).last_hidden_state

        # 2. Fuse (Token Level)
        fused_seq = self.mil(out_m, out_x)

        # 3. Fuse (Sentence Level - via CLS pooling)
        fused_cls = fused_seq[:, 0, :]

        # 4. Predictions
        ner_logits = self.ner_head(fused_seq)
        class_logits = self.class_head(fused_cls)

        # 5. Aux Views (for Consistency Loss)
        ner_m = self.ner_head(out_m)
        ner_x = self.ner_head(out_x)

        return ner_logits, class_logits, ner_m, ner_x

# PHASE 4: TRAINING LOOP (WITH VALIDATION)
def train_fusion_model():
    print(f"\n>>> TRAINING ADVANCED FUSION MODEL")

    # Tokenizers
    tok_m = AutoTokenizer.from_pretrained(mbert_path)
    tok_x = AutoTokenizer.from_pretrained(xlmr_path)

    # Labels
    label2id_ner = {"O": 0, "B-ACTOR": 1, "I-ACTOR": 2, "B-SYS": 3, "I-SYS": 4,
                    "B-DATA": 5, "I-DATA": 6, "B-QUAL": 7, "I-QUAL": 8, "B-CONS": 9, "I-CONS": 10}
    label2id_class = {"FR": 0, "NFR": 1}

    # Datasets
    train_ds = DualViewDataset(f"{DATA_DIR}/ner_train.jsonl", f"{DATA_DIR}/class_train.csv",
                               tok_m, tok_x, label2id_ner, label2id_class)
    val_ds = DualViewDataset(f"{DATA_DIR}/ner_val.jsonl", f"{DATA_DIR}/class_val.csv",
                             tok_m, tok_x, label2id_ner, label2id_class)

    if len(train_ds) == 0:
        print(" Dataset is empty. Cannot train.")
        return

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    # Model
    model = MultiViewFusionModel(mbert_path, xlmr_path, len(label2id_ner), len(label2id_class))

    # Apply LoRA
    peft_config = LoraConfig(inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.1,
                             target_modules=["query", "value"])
    model.mbert = get_peft_model(model.mbert, peft_config)
    model.xlmr = get_peft_model(model.xlmr, peft_config)

    model.to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    # Loop
    for epoch in range(FUSION_EPOCHS):
        model.train()
        total_loss = 0

        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            ner_fused, class_fused, ner_m, ner_x = model(
                batch['ids_m'].to(DEVICE), batch['mask_m'].to(DEVICE),
                batch['ids_x'].to(DEVICE), batch['mask_x'].to(DEVICE)
            )

            # Losses
            labels_ner = batch['ner_labels'].to(DEVICE)
            labels_class = batch['class_labels'].to(DEVICE)

            l_ner = nn.CrossEntropyLoss(ignore_index=-100)(ner_fused.view(-1, len(label2id_ner)), labels_ner.view(-1))
            #l_class = nn.CrossEntropyLoss()(class_fused, labels_class)
            # Calculate weights: Inverse of frequency
            # If FR is 90% and NFR is 10%, weight NFR 9x more.
            class_weights = torch.tensor([1.0, 5.0]).to(DEVICE) # Assuming [FR, NFR]

            l_class = nn.CrossEntropyLoss(weight=class_weights)(class_fused, labels_class)
            '''
            The Problem above: In SRS documents, FR are very common (90%),
            while NFR are rare (10%). The model might get lazy and
            just guess "FR" for everything to get 90% accuracy. The Fix: Change the Loss Function
            from standard CrossEntropyLoss to Weighted Cross Entropy or Focal Loss.
            It forces the model to pay strict attention to NFRs (Security, Performance),
            which are usually the most critical parts of a spec.
            '''
            # Consistency Loss
            l_cons = F.mse_loss(ner_m, ner_fused.detach()) + F.mse_loss(ner_x, ner_fused.detach())

            loss = (l_ner + l_class + LAMBDA_CONSISTENCY * l_cons) / ACCUM_STEPS
            loss.backward()

            if (i + 1) % ACCUM_STEPS == 0:
                optimizer.step()

            total_loss += loss.item() * ACCUM_STEPS

        # --- VALIDATION PHASE ---
        model.eval()
        val_loss = 0
        correct_class = 0
        with torch.no_grad():
            for batch in val_loader:
                nf, cf, nm, nx = model(
                    batch['ids_m'].to(DEVICE), batch['mask_m'].to(DEVICE),
                    batch['ids_x'].to(DEVICE), batch['mask_x'].to(DEVICE)
                )

                v_ln = nn.CrossEntropyLoss(ignore_index=-100)(nf.view(-1, len(label2id_ner)), batch['ner_labels'].to(DEVICE).view(-1))
                v_lc = nn.CrossEntropyLoss()(cf, batch['class_labels'].to(DEVICE))
                val_loss += (v_ln + v_lc).item()

                preds = torch.argmax(cf, dim=1)
                correct_class += (preds == batch['class_labels'].to(DEVICE)).sum().item()

        avg_val_loss = val_loss / len(val_loader)
        acc = correct_class / len(val_ds)

        print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {avg_val_loss:.4f} | Class Acc: {acc:.4f}")

    # Save
    save_final = os.path.join(MODELS_DIR, "advanced_fusion_model.pt")
    torch.save(model.state_dict(), save_final)
    print(f" Model Saved to {save_final}")

if __name__ == "__main__":
    train_fusion_model()

Mounted at /content/drive
 Running on cuda
 DAPT model found at /content/drive/MyDrive/Thesis/models/dapt_mbert, skipping training.
 DAPT model found at /content/drive/MyDrive/Thesis/models/dapt_xlmr, skipping training.

>>> TRAINING ADVANCED FUSION MODEL


The tokenizer you are loading from '/content/drive/MyDrive/Thesis/models/dapt_mbert' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
The tokenizer you are loading from '/content/drive/MyDrive/Thesis/models/dapt_xlmr' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Thesis/models/dapt_mbert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to 

Epoch 1 | Train Loss: 1.3582 | Val Loss: 0.8504 | Class Acc: 0.7605
Epoch 2 | Train Loss: 0.8661 | Val Loss: 0.8412 | Class Acc: 0.7361
Epoch 3 | Train Loss: 0.8410 | Val Loss: 0.8207 | Class Acc: 0.7342
Epoch 4 | Train Loss: 0.8201 | Val Loss: 0.7510 | Class Acc: 0.7712
Epoch 5 | Train Loss: 0.7963 | Val Loss: 0.7610 | Class Acc: 0.7286
 Model Saved to /content/drive/MyDrive/Thesis/models/advanced_fusion_model.pt


Script 8: Final Evaluation Pipeline (The "Report Card")

This script serves as the Independent Examiner for your thesis. It does not train the models; instead, it subjects them to the "Final Exam" using the unseen test datasets (translation_test.jsonl and class_test.csv). It calculates the precise mathematical metrics (BLEU, Accuracy, F1-Score) that you will present in your "Results and Discussion" chapter to prove your framework works.

Key Technical Functions:

Automated Translation Scoring (BLEU & chrF): It loads your fine-tuned NLLB model and translates the hidden Hausa test set into English. It then compares these machine translations against the human reference using BLEU (Word Overlap) and chrF (Character Overlap).

Why chrF? As discussed, Hausa has complex morphology. chrF gives you credit if you get the root word right (e.g., Sabar vs Sabobin), whereas BLEU would mark it as completely wrong.

Fusion Model Reconstruction: To evaluate the Classification model, the script must "rebuild the brain" from scratch. It initializes the empty mBERT and XLM-R architectures and then loads your saved weights (advanced_fusion_model.pt) into them. This proves the model is portable and can be reused.

The "Padding Mismatch" Fix: This script solves the critical "Stacking Error" you encountered. Since mBERT and XLM-R use different vocabularies, they naturally produce different sentence lengths for the same text.

The Fix: The script forces padding='max_length' (128 tokens) for both tokenizers. This ensures that the output tensors are identical geometric shapes, allowing them to be stacked and fused without crashing.

Performance Reporting: It generates a standard Classification Report, detailing Precision, Recall, and F1-Score for both "Functional" (FR) and "Non-Functional" (NFR) classes.

This provides the granular data needed to say: "My model identifies Security requirements (NFR) with 92% precision."

In [None]:
# STAGE 8: THESIS RESULTS GENERATOR
# Purpose: Benchmarking (Quantitative) - F1, BLEU, Confusion Matrix
'''
This Script is my Quantitative Proof.

What it proves: "My system is 85% accurate."

Data used: It loads the Real Test Dataset (translation_test.jsonl) and calculates the real scores.
'''
!pip install -q transformers peft datasets sacrebleu scikit-learn accelerate bitsandbytes

import os
import torch
import json
import pandas as pd
import sacrebleu
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
from peft import PeftModel, LoraConfig, get_peft_model
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

# --- CONFIGURATION ---
BASE_DIR = "/content/drive/MyDrive/Thesis"
DATA_DIR = os.path.join(BASE_DIR, "final_datasets")
MODELS_DIR = os.path.join(BASE_DIR, "models")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f" STARTING FINAL EVALUATION ON {DEVICE}...")

# PART 1: EVALUATE TRANSLATION
def evaluate_translation():
    print("\n>>> 1. EVALUATING TRANSLATION MODEL...")

    test_path = os.path.join(DATA_DIR, "translation_test.jsonl")
    if not os.path.exists(test_path):
        print(" Test file not found. Run Script 3 first.")#stage 3
        return

    src_texts = []
    ref_texts = []

    # Load Data (Hausa -> English)
    with open(test_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            src_texts.append(data['translation']['ha'])
            ref_texts.append(data['translation']['en'])

    # Load Model
    model_id = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_id, src_lang="hau_Latn")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    adapter_path = os.path.join(MODELS_DIR, "nllb_lora_srs")
    try:
        model = PeftModel.from_pretrained(model, adapter_path)
        print(" Loaded LoRA Adapter.")
    except:
        print(" LoRA Adapter not found. Evaluating Base NLLB.")

    model.to(DEVICE)
    model.eval()

    # Generate
    hypotheses = []
    print(f"   Translating {len(src_texts)} sentences...")

    batch_size = 8
    for i in range(0, len(src_texts), batch_size):
        batch = src_texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(DEVICE)

        # Tokenizer Fix
        target_lang_id = tokenizer.convert_tokens_to_ids("eng_Latn")

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=target_lang_id,
                max_length=128
            )

        decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        hypotheses.extend(decoded)

    # Metrics
    bleu = sacrebleu.corpus_bleu(hypotheses, [ref_texts])
    chrf = sacrebleu.corpus_chrf(hypotheses, [ref_texts])

    print("\n TRANSLATION RESULTS:")
    print(f"   BLEU Score: {bleu.score:.2f}")
    print(f"   chrF Score: {chrf.score:.2f}")

    # Save output
    with open(os.path.join(BASE_DIR, "outputs", "translation_examples.txt"), "w") as f:
        for src, ref, hyp in zip(src_texts[:10], ref_texts[:10], hypotheses[:10]):
            f.write(f"Source: {src}\nRef:    {ref}\nHyp:    {hyp}\n{'-'*30}\n")

# PART 2: EVALUATE CLASSIFICATION (FUSION MODEL)
class MILAttentionAggregator(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(hidden_dim, 64), nn.Tanh(), nn.Linear(64, 1))
    def forward(self, view1, view2):
        stacked = torch.stack([view1, view2], dim=2)
        scores = self.attn(stacked)
        weights = F.softmax(scores, dim=2)
        return torch.sum(stacked * weights, dim=2)

class MultiViewFusionModel(nn.Module):
    def __init__(self, mbert_path, xlmr_path, num_ner, num_class):
        super().__init__()
        self.mbert = AutoModel.from_pretrained(mbert_path)
        self.xlmr = AutoModel.from_pretrained(xlmr_path)
        self.mil = MILAttentionAggregator(768)
        self.ner_head = nn.Linear(768, num_ner)
        self.class_head = nn.Linear(768, num_class)
    def forward(self, ids_m, mask_m, ids_x, mask_x):
        out_m = self.mbert(ids_m, attention_mask=mask_m).last_hidden_state
        out_x = self.xlmr(ids_x, attention_mask=mask_x).last_hidden_state
        fused_seq = self.mil(out_m, out_x)
        fused_cls = fused_seq[:, 0, :]
        return self.class_head(fused_cls)

'''Classification Failed: stack expects each tensor to be equal size, but got [16, 92, 768] at entry 0 and [16, 94, 768] at entry 1.
    This error is happening inside your MIL Fusion Layer.
    The Problem: You are trying to fuse two models (mBERT and XLM-R) by stacking them on top of each other.
    mBERT output: [16, 92, 768] (Batch 16, 92 tokens, 768 features)
    XLM-R output: [16, 94, 768] (Batch 16, 94 tokens, 768 features)
    Because they use different tokenizers, they produce different lengths. torch.stack fails because it requires exact shape matches (92 is not equal to 94).
    The Fix: You need to force Strict Padding. Instead of padding to the "longest sentence in the batch" (which varies between tokenizers), you must pad to the fixed max_length (128).
    '''
# PART 2: EVALUATE CLASSIFICATION (FUSION MODEL)
def evaluate_classification():
    print("\n>>> 2. EVALUATING CLASSIFICATION MODEL...")

    test_df = pd.read_csv(os.path.join(DATA_DIR, "class_test.csv"))
    if test_df.empty: return

    print("   Initializing Base Models...")
    # 11 NER Labels, 2 Class Labels
    model = MultiViewFusionModel("bert-base-multilingual-cased", "xlm-roberta-base", 11, 2)

    # 2. APPLY PEFT STRUCTURE
    print("   Applying LoRA Wrappers to match training structure...")
    peft_config = LoraConfig(inference_mode=True, r=16, lora_alpha=32, lora_dropout=0.1, target_modules=["query", "value"])
    model.mbert = get_peft_model(model.mbert, peft_config)
    model.xlmr = get_peft_model(model.xlmr, peft_config)

    # 3. Load Weights
    model_path = os.path.join(MODELS_DIR, "advanced_fusion_model.pt")
    if os.path.exists(model_path):
        print(f"   Loading weights from {model_path}...")
        try:
            # Load weights
            model.load_state_dict(torch.load(model_path), strict=False)
            print("    Weights loaded successfully.")
        except Exception as e:
            print(f"    Loading Warning: {e}")
    else:
        print("    Model file not found.")
        return

    model.to(DEVICE)
    model.eval()

    # 4. Tokenizers
    tok_m = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    tok_x = AutoTokenizer.from_pretrained("xlm-roberta-base")

    preds = []
    truth = []
    label2id = {"FR": 0, "NFR": 1}

    print(f"   Classifying {len(test_df)} requirements...")

    texts = test_df['tgt_ha'].astype(str).tolist()
    labels = [label2id.get(l, 0) for l in test_df['label']]

    batch_size = 16
    for i in range(0, len(texts), batch_size):
        batch_text = texts[i:i+batch_size]

        # Force fixed length
        in_m = tok_m(batch_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128).to(DEVICE)
        in_x = tok_x(batch_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128).to(DEVICE)

        with torch.no_grad():
            logits = model(in_m['input_ids'], in_m['attention_mask'], in_x['input_ids'], in_x['attention_mask'])
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()

        preds.extend(batch_preds)

    truth = labels

    print("\n CLASSIFICATION RESULTS:")
    print(classification_report(truth, preds, target_names=["FR", "NFR"], digits=4))
    print(f"   Global Accuracy: {accuracy_score(truth, preds):.4f}")

# MAIN RUNNER
if __name__ == "__main__":
    try: evaluate_translation()
    except Exception as e: print(f" Translation Failed: {e}")

    try: evaluate_classification()
    except Exception as e: print(f" Classification Failed: {e}")

    print("\n Evaluation Complete.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h STARTING FINAL EVALUATION ON cuda...

>>> 1. EVALUATING TRANSLATION MODEL...


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

 Loaded LoRA Adapter.
   Translating 5018 sentences...

 TRANSLATION RESULTS:
   BLEU Score: 14.14
   chrF Score: 35.30

>>> 2. EVALUATING CLASSIFICATION MODEL...
   Initializing Base Models...


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

   Applying LoRA Wrappers to match training structure...
   Loading weights from /content/drive/MyDrive/Thesis/models/advanced_fusion_model.pt...




    Weights loaded successfully.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

   Classifying 5018 requirements...

 CLASSIFICATION RESULTS:
              precision    recall  f1-score   support

          FR     0.8284    0.7458    0.7850      3289
         NFR     0.5936    0.7062    0.6450      1729

    accuracy                         0.7322      5018
   macro avg     0.7110    0.7260    0.7150      5018
weighted avg     0.7475    0.7322    0.7367      5018

   Global Accuracy: 0.7322

 Evaluation Complete.


Script 8B Baseline Test

In [None]:
# SCRIPT 8b: FULL BASELINE EVALUATION (mBERT & XLM-R) ON TEST SET
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from sklearn.metrics import classification_report, accuracy_score
from google.colab import drive

# 1. SETUP
drive.mount("/content/drive", force_remount=True)
BASE_DIR = "/content/drive/MyDrive/Thesis/final_datasets"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 2. DEFINE THE CLASS
class MultiTaskBaseline(nn.Module):
    def __init__(self, model_name, num_ner_labels, num_class_labels):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.encoder.config.hidden_size
        self.ner_head = nn.Linear(self.hidden_size, num_ner_labels)
        self.class_head = nn.Linear(self.hidden_size, num_class_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Robust pooling logic
        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
             pooler_output = outputs.pooler_output
        else:
             pooler_output = outputs.last_hidden_state[:, 0]

        ner_logits = self.ner_head(outputs.last_hidden_state)
        class_logits = self.class_head(pooler_output)
        return ner_logits, class_logits

# 3. EVALUATION FUNCTION
def evaluate_model(model_name, save_name):
    print(f"\n>>> TESTING {save_name} ON UNSEEN DATA...")

    path = f"{BASE_DIR}/models/baseline_{save_name}/model.pt"
    if not os.path.exists(path):
        print(f" Model file not found at {path}. Skipping.")
        return

    test_df = pd.read_csv(f"{BASE_DIR}/class_test.csv")
    if test_df.empty: return

    print(f" Loading {save_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = MultiTaskBaseline(model_name, 11, 2) # 11 NER, 2 Class

    try:
        model.load_state_dict(torch.load(path), strict=False)
        print(" Weights loaded successfully.")
    except Exception as e:
        print(f" Error loading weights: {e}")
        return

    model.to(DEVICE)
    model.eval()

    preds = []
    truth = []
    label2id = {"FR": 0, "NFR": 1}

    print(f" Classifying {len(test_df)} test items...")
    texts = test_df['tgt_ha'].astype(str).tolist()
    labels = [label2id.get(l, 0) for l in test_df['label']]

    batch_size = 16
    for i in range(0, len(texts), batch_size):
        batch_text = texts[i:i+batch_size]
        inputs = tokenizer(batch_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(DEVICE)

        with torch.no_grad():
            _, class_logits = model(inputs['input_ids'], inputs['attention_mask'])
            batch_preds = torch.argmax(class_logits, dim=1).cpu().numpy()

        preds.extend(batch_preds)

    truth = labels

    print("\n" + "="*40)
    print(f" {save_name} TEST RESULTS")
    print("="*40)
    print(classification_report(truth, preds, target_names=["FR", "NFR"], digits=4))
    print(f" Global Accuracy: {accuracy_score(truth, preds):.4f}")

if __name__ == "__main__":
    # Test mBERT
    evaluate_model("bert-base-multilingual-cased", "bert-base-multilingual-cased")

    # Test XLM-R
    evaluate_model("xlm-roberta-base", "xlm-roberta-base")

Mounted at /content/drive

>>> TESTING bert-base-multilingual-cased ON UNSEEN DATA...
 Loading bert-base-multilingual-cased...
 Weights loaded successfully.
 Classifying 5018 test items...

 bert-base-multilingual-cased TEST RESULTS
              precision    recall  f1-score   support

          FR     0.6783    0.9872    0.8041      3289
         NFR     0.8182    0.1093    0.1929      1729

    accuracy                         0.6847      5018
   macro avg     0.7482    0.5483    0.4985      5018
weighted avg     0.7265    0.6847    0.5935      5018

 Global Accuracy: 0.6847

>>> TESTING xlm-roberta-base ON UNSEEN DATA...
 Loading xlm-roberta-base...
 Weights loaded successfully.
 Classifying 5018 test items...

 xlm-roberta-base TEST RESULTS
              precision    recall  f1-score   support

          FR     0.6554    1.0000    0.7919      3289
         NFR     0.0000    0.0000    0.0000      1729

    accuracy                         0.6554      5018
   macro avg     0.3277   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Script 9: End-to-End SRS Generation Pipeline (The "Product")

This script represents the Application Layer of your thesis. It is the actual software tool that a user would interact with. It combines your fine-tuned Translation Model (Script 5) with a rule-based Logic Engine to automatically generate a professional IEEE 830 Software Requirements Specification (SRS) document from raw Hausa inputs.

Key Technical Functions:

Real-Time Inference: It loads the NLLB + LoRA adapter you trained and runs live translations on new, unseen Hausa sentences. This demonstrates that your model is not just for research metrics but for practical use.

Semantic Guardrails (LaBSE): This is a critical Quality Assurance (QA) feature. Before adding a requirement to the final document, the system uses the LaBSE model to calculate a "Meaning Score."

Logic: If the translation is grammatically correct but semantically wrong (Score < 0.60), the system flags it as "LOW CONFIDENCE." This prevents dangerous mistranslations in safety-critical software.

IEEE 830 Logic Engine: This function (apply_ieee_template) acts as a "Digital Business Analyst." It enforces the strict syntax of requirements engineering:

The "Shall" Constraint: It automatically converts weak words like "should" or "must" into the legally binding "shall."

Actor Injection: It ensures every Functional Requirement starts with a clear actor (e.g., "The Administrator shall...").

NFR Sub-Categorization: It uses keyword heuristics to sort Non-Functional Requirements into their correct sub-types (Security, Performance, Usability) automatically, creating a structured and readable document.

In [2]:
# STAGE 7-9: GENERATION, EVALUATION & DEPLOYMENT
'''
This Script is the Qualitative Proof.

What it proves: "I built a system that works. Here is a document it generated."

Data used: It uses Mock Data (test_inputs, y_true lists) in the if __name__ block. This is perfect for a live demo or screenshot.
'''
# STAGE 7-9: GENERATION, EVALUATION & DEPLOYMENT (FINAL)
!pip install -q sentence-transformers python-docx sacrebleu scikit-learn flask

import os
import json
import torch
import sacrebleu
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from docx import Document
from docx.shared import Pt
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# CONFIGURATION
BASE_DIR = "/content/drive/MyDrive/Thesis"
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load LaBSE Model (Supports Hausa <-> English Alignment)
print(" Loading Semantic Verification Model (LaBSE)...")
semantic_model = SentenceTransformer('sentence-transformers/LaBSE')

# CLASS 1: SRS DOCUMENT GENERATOR
class SRSGenerator:
    def __init__(self):
        self.doc = Document()

        # --- TITLE PAGE ---
        title = self.doc.add_heading('Software Requirements Specification', 0)
        title.alignment = 1  # Center

        self.doc.add_paragraph('\n\n')
        p = self.doc.add_paragraph('Generated by: Hausa-to-English NLP Framework')
        p.alignment = 1

        p2 = self.doc.add_paragraph('Thesis Project: 2026')
        p2.alignment = 1

        p3 = self.doc.add_paragraph('Abdullah Ibrahim')
        p3.alignment = 1

        p4 = self.doc.add_paragraph('MSc Software Engineering')
        p4.alignment = 1

        self.doc.add_page_break() # Start real content on Page 2
        # ------------------

    def verify_alignment(self, hausa_text, eng_translation):
        """
        Checks if the translation preserves meaning using Cross-Lingual Embeddings.
        """
        embeddings = semantic_model.encode([hausa_text, eng_translation])
        score = util.cos_sim(embeddings[0], embeddings[1]).item()
        return score, score > 0.60  # Threshold (LaBSE is robust, >0.6 is a solid match)

    def apply_ieee_template(self, req_type, translation, entities):
        """
        Advanced injection of entities into IEEE 830 syntax.
        Includes 'Shall' enforcement and NFR sub-categorization.
        """
        import re

        # 1. CLEANUP & NORMALIZE
        clean_text = translation.strip().rstrip('.')

        # 2. ENFORCE IEEE 'SHALL'
        # Replace weak modals with binding "shall"
        ieee_text = re.sub(r'\b(should|must|will|can)\b', 'shall', clean_text, flags=re.IGNORECASE)

        # 3. ACTOR HANDLING
        # We simplify the actor to just "The User" for English readability,
        # or keep your specific "Mai Amfani" if strictly required.
        actor = entities.get('ACTOR', ['The System'])[0]
        if isinstance(actor, list): actor = actor[0]
        actor = actor.strip().title()

        # --- LOGIC FOR FUNCTIONAL REQUIREMENTS (FR) ---
        if req_type == "FR":
            lower_text = ieee_text.lower()

            # FIX: Check for English synonyms of the actor to prevent duplication
            # If the sentence starts with "User", "The User", "System", "The System"...
            if any(lower_text.startswith(x) for x in ["user", "the user", "system", "the system", "admin"]):
                # Capitalize first letter just in case
                final_text = ieee_text[0].upper() + ieee_text[1:]
                return f"FR: {final_text}."

            # If no subject found, THEN prepend the Actor
            # Example: "Calculate total cost" -> "The System shall calculate total cost."
            merged_text = ieee_text[0].lower() + ieee_text[1:]
            return f"FR: {actor} shall {merged_text}."

        # --- LOGIC FOR NON-FUNCTIONAL REQUIREMENTS (NFR) ---
        elif req_type == "NFR":
            text_lower = ieee_text.lower()
            category = "General Quality"
            prefix = "demonstrate properties of"

            # Expanded Keyword Search
            if any(w in text_lower for w in ['secur', 'encrypt', 'auth', 'login', 'protect', 'firewall', 'password']):
                category = "Security"
                prefix = "enforce security measures to"
            elif any(w in text_lower for w in ['fast', 'speed', 'second', 'ms', 'time', 'latency', 'response', 'under']):
                category = "Performance"
                prefix = "adhere to performance constraints regarding"
            elif any(w in text_lower for w in ['easy', 'friendly', 'interface', 'ux', 'gui', 'learn']):
                category = "Usability"
                prefix = "provide a user interface that allows"
            elif any(w in text_lower for w in ['avail', 'reliab', 'fail', 'uptime', 'recover', 'crash', 'operate']):
                category = "Reliability"
                prefix = "ensure system reliability by"
            elif any(w in text_lower for w in ['sql', 'database', 'store', 'archive', 'data', 'integration', 'support']):
                category = "Data Integrity"
                prefix = "ensure data support for"

            # Trust the translation if it looks complete
            if "shall" in text_lower and len(text_lower.split()) > 4:
                # Capitalize first letter
                final_text = ieee_text[0].upper() + ieee_text[1:]
                return f"NFR [{category}]: {final_text}."

            return f"NFR [{category}]: The system shall {prefix} {ieee_text}."

        return translation
    def generate_docx(self, requirements_list, filename="Generated_SRS.docx"):
        """
        Groups requirements and writes to DOCX.
        """
        # Section 1: Functional
        h1 = self.doc.add_heading('1. Functional Requirements', level=1)
        if not any(r['type'] == 'FR' for r in requirements_list):
            self.doc.add_paragraph("No functional requirements identified.")

        for req in requirements_list:
            if req['type'] == 'FR':
                p = self.doc.add_paragraph(req['formatted_text'])
                p.style = 'List Bullet'

        # Section 2: Non-Functional
        h2 = self.doc.add_heading('2. Non-Functional Requirements', level=1)
        if not any(r['type'] == 'NFR' for r in requirements_list):
            self.doc.add_paragraph("No non-functional requirements identified.")

        for req in requirements_list:
            if req['type'] == 'NFR':
                p = self.doc.add_paragraph(req['formatted_text'])
                p.style = 'List Bullet'

        path = os.path.join(OUTPUT_DIR, filename)
        self.doc.save(path)
        print(f"\n SRS Document Successfully Saved: {path}")

# CLASS 2: REAL INFERENCE PIPELINE
class RealSRSPipeline:
    def __init__(self):
        print(" Loading NLLB Translation Model... This may take a minute.")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load Translation Model (NLLB + LoRA)
        base_nllb = "facebook/nllb-200-distilled-600M"
        self.trans_tokenizer = AutoTokenizer.from_pretrained(base_nllb, src_lang="hau_Latn")
        self.trans_model = AutoModelForSeq2SeqLM.from_pretrained(base_nllb)

        # Load Adapter
        adapter_path = "/content/drive/MyDrive/Thesis/models/nllb_lora_srs"
        if os.path.exists(adapter_path):
            self.trans_model = PeftModel.from_pretrained(self.trans_model, adapter_path)
            print(" LoRA Adapter Loaded Successfully.")
        else:
            print(" LoRA Adapter not found. Using base NLLB model.")

        self.trans_model.to(self.device)
        self.generator = SRSGenerator()
        print(" Service Ready.")

    def translate(self, text):
        self.trans_tokenizer.src_lang = "hau_Latn"
        inputs = self.trans_tokenizer(text, return_tensors="pt").to(self.device)
        target_lang_id = self.trans_tokenizer.convert_tokens_to_ids("eng_Latn")

        translated_tokens = self.trans_model.generate(
            **inputs,
            forced_bos_token_id=target_lang_id,
            max_length=128
        )
        return self.trans_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    def process(self, hausa_text):
        # 1. Classification Shortcut (Heuristic)
        # Expanded Hausa keywords to catch "create" (ƙirƙiri), "display" (nuna), "calculate" (ƙididdige)
        fr_keywords = [
            "zai", "iya", "shiga", "yi", "tarihi", "canza", "aika",
            "ƙirƙiri", "kirkiro", "nuna", "lissafa", "cire", "ajiye", "bincika", "rubuta"
        ]

        # Keywords that definitely signal NFR (override FR keywords if present)
        nfr_keywords = [
            "daƙiƙa", "dakika", "tsaro", "boye", "ɓoye", "kyau", "sauƙi",
            "lokaci", "%", "windows", "linux", "mysql"
        ]

        text_lower = hausa_text.lower()

        # Logic: It's FR if it has action verbs, UNLESS it has specific NFR constraint words
        is_fr = any(x in text_lower for x in fr_keywords)
        is_nfr = any(x in text_lower for x in nfr_keywords)

        if is_nfr:
            pred_type = "NFR"
        elif is_fr:
            pred_type = "FR"
        else:
            pred_type = "NFR" # Default fallback

        entities = {"ACTOR": ["Mai amfani (User)"]}

        # 2. Translation
        eng_text = self.translate(hausa_text)

        # 3. Quality Assurance (Semantic Check with LaBSE)
        score, passed = self.generator.verify_alignment(hausa_text, eng_text)

        # 4. Generate
        formatted = self.generator.apply_ieee_template(pred_type, eng_text, entities)

        return formatted, eng_text, score

# MAIN EXECUTION
if __name__ == "__main__":
    # 1. Initialize
    service = RealSRSPipeline()

    # 2. Define Demo Inputs
    test_inputs = [
        "Mai amfani zai iya shiga tsarin tare da kalmar sirri.",  # FR (Login)
        "Dole ne tsarin ya kasance yana da matukar tsaro wajen adana bayanai.", # NFR (Security)
        # Functional Requirements (FR)
        "Tsarin ya kamata ya ƙirƙiri sabuwar akauntin tare da sunan mai amfani da kalmar sirri.",
        "Tsarin ya kamata ya nuna tarihi na duk bukatawar kuɗi a cikin shafin mai amfani.",
        "Mai amfani ya kamata ya iya canza bayaninsa na akauntin a kowane lokaci.",
        "Tsarin ya kamata ya aika sakon tabbatarwa zuwa imel ɗin mai amfani.",
        "Tsarin ya kamata ya ƙididdige jimlar kuɗin a akauntin bayan kowane bukatar.",
        "Mai amfani ya kamata ya iya cire kuɗi daga akauntinsa ta ATM.",
        "Tsarin ya kamata ya nuna raunin kuɗin a cikin PDF.",
        "Tsarin ya kamata ya ajiye tarihi na duk abubuwan da aka yi.",
        "Mai amfani ya kamata ya iya canza kalmar sirrinsa ta shafin.",
        "Tsarin ya kamata ya bincika bayanan mai amfuni kafin ya bari.",
        # Non-Functional Requirements (NFR)
        "Tsarin ya kamata ya amsa a cikin ƙasa da 2 daƙiƙa a lokacin da aka bukaci.",
        "Tsarin ya kamata ya yi aiki 99.9% na lokaci ba tare da matsala ba.",
        "Duk bayanan kuɗi ya kamata a ɓoye da AES-256 encryption.",
        "Tsarin ya kamata ya ba da damar amfani akan waya da kwamfuta(na'ura mai ƙwaƙwalwa)",
        "Tsarin ya kamata ya tallafa har zuwa 1000 mai amfani a lokaci ɗaya.",
        "Interface na tsarin ya kamata ya zama mai sauƙi ga duk wanda ba ƙwararre ba.",
        "Tsarin ya kamata ya yi aiki a Windows, Linux, da macOS.",
        "Duk bukatawar shiga ya kamata a rubuta a cikin log file.",
        "Tsarin ya kamata ya ba da damar haɗuwa da MySQL da PostgreSQL.",
        "Tsarin ya kamata ya tallafa Hausa da Turanci a lokaci ɗaya."
    ]

    processed_reqs = []

    print("\n" + "="*40)
    print("      LIVE SYSTEM DEMO OUTPUT")
    print("="*40)

    for txt in test_inputs:
        formatted_out, raw_trans, score = service.process(txt)

        # Determine type for display
        rtype = "FR" if "FR:" in formatted_out else "NFR"

        print(f"\n Input (Hausa): {txt}")
        print(f" Trans (Eng):   {raw_trans}")
        # Color code the pass/fail for console readability
        pass_str = " PASS" if score > 0.6 else " LOW"
        print(f" Sem. Score:    {score:.4f} ({pass_str})")
        print(f" Final Spec:    {formatted_out}")

        req_data = {'type': rtype, 'formatted_text': formatted_out}
        processed_reqs.append(req_data)

    # 3. Generate Document
    service.generator.generate_docx(processed_reqs, "my_Thesis_Final_SRS.docx")

    print("\n DEMO COMPLETE. Please check 'my_Thesis_Final_SRS.docx' in your Drive.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h Loading Semantic Verification Model (LaBSE)...


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

 Loading NLLB Translation Model... This may take a minute.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

 LoRA Adapter Loaded Successfully.
 Service Ready.

      LIVE SYSTEM DEMO OUTPUT

 Input (Hausa): Mai amfani zai iya shiga tsarin tare da kalmar sirri.
 Trans (Eng):   User shall be able to log in to the system with a password.
 Sem. Score:    0.8661 ( PASS)
 Final Spec:    FR: User shall be able to log in to the system with a password.

 Input (Hausa): Dole ne tsarin ya kasance yana da matukar tsaro wajen adana bayanai.
 Trans (Eng):   The system must be extremely secure in storing data.
 Sem. Score:    0.9507 ( PASS)
 Final Spec:    NFR [Security]: The system shall be extremely secure in storing data.

 Input (Hausa): Tsarin ya kamata ya ƙirƙiri sabuwar akauntin tare da sunan mai amfani da kalmar sirri.
 Trans (Eng):   System should create a new account with username and password.
 Sem. Score:    0.8656 ( PASS)
 Final Spec:    FR: System shall create a new account with username and password.

 Input (Hausa): Tsarin ya kamata ya nuna tarihi na duk bukatawar kuɗi a cikin shafin mai am

Improved Script *9*

In [None]:
# SCRIPT 9: INTELLIGENT SRS GENERATION (WITH REDUNDANCY & AMBIGUITY CHECKS)
# ==================================================================================
# SCRIPT 9: INTELLIGENT SRS GENERATION (FIXED TOKENIZER)
# ==================================================================================
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
from peft import PeftModel, PeftConfig
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
from docx.shared import Pt, RGBColor
from google.colab import drive

# 1. SETUP ENV
drive.mount("/content/drive", force_remount=True)
BASE_DIR = "/content/drive/MyDrive/Thesis/final_datasets"
MODELS_DIR = "/content/drive/MyDrive/Thesis/models"
OUTPUT_DIR = "/content/drive/MyDrive/Thesis/generated_srs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Directory Setup Complete. Using {DEVICE}.")

# ==================================================================================
# PART A: LOAD MODELS
# ==================================================================================
def load_models():
    print("\n>>> LOADING AI MODELS...")

    # 1. Translation Model (NLLB-200)
    print("   1. Loading Translation Engine (NLLB-200)...")
    base_nllb = "facebook/nllb-200-distilled-600M"
    nllb_tokenizer = AutoTokenizer.from_pretrained(base_nllb)
    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(base_nllb)

    # Load Fine-Tuned Weights
    adapter_path = f"{MODELS_DIR}/nllb_lora_fine_tuned"
    if os.path.exists(adapter_path):
         nllb_model = PeftModel.from_pretrained(nllb_model, adapter_path)
         print("      - Custom LoRA Adapter Loaded!")
    nllb_model.to(DEVICE)

    # 2. Semantic Model (LaBSE)
    print("   2. Loading Semantic Guardrail (LaBSE)...")
    labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
    labse_model = AutoModel.from_pretrained("sentence-transformers/LaBSE").to(DEVICE)

    return nllb_tokenizer, nllb_model, labse_tokenizer, labse_model

# ==================================================================================
# PART B: QUALITY ASSURANCE
# ==================================================================================

def check_ambiguity(text):
    """Scans for vague words forbidden in strict engineering."""
    forbidden_words = [
        "fast", "quickly", "slowly", "easy", "simple", "user-friendly",
        "efficient", "effective", "robust", "approximately", "about",
        "minimal", "maximize", "hope", "maybe", "sufficient", "seamlessly"
    ]
    found = [w for w in forbidden_words if f" {w} " in f" {text.lower()} "]
    if found:
        return f" [ ISO WARNING: Vague terms found: {', '.join(found)}]"
    return ""

def get_embedding(text_list, tokenizer, model, batch_size=32):
    """
    Helper for Redundancy Check - BATCHED to avoid Out of Memory Errors.
    Processes data in chunks rather than all at once.
    """
    all_embeddings = []

    # Loop through data in small batches
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i : i + batch_size]

        # Tokenize batch
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(DEVICE)

        with torch.no_grad():
            outputs = model(**inputs)
            # Get embeddings
            batch_emb = outputs.pooler_output if hasattr(outputs, 'pooler_output') else outputs.last_hidden_state.mean(dim=1)
            # Move to CPU immediately to save GPU memory
            all_embeddings.append(batch_emb.cpu())

        # Clean up GPU memory
        del inputs, outputs, batch_emb
        torch.cuda.empty_cache()

    # Combine all batches
    if all_embeddings:
        return torch.cat(all_embeddings, dim=0).numpy()
    else:
        return np.array([])

def optimize_redundancy(requirements):
    """Removes semantic duplicates > 85% similarity"""
    if len(requirements) < 2:
        return requirements

    print(f"   ...Optimizing List ({len(requirements)} items)...")
    texts = [r['text'] for r in requirements]
    embeddings = get_embedding(texts, LABSE_TOK, LABSE_MODEL)
    sim_matrix = cosine_similarity(embeddings)

    to_remove = set()
    for i in range(len(sim_matrix)):
        for j in range(i + 1, len(sim_matrix)):
            if sim_matrix[i][j] > 0.85:
                # print(f"      [DROP] '{texts[j]}' is a duplicate of '{texts[i]}'")
                to_remove.add(j)

    optimized_list = [r for idx, r in enumerate(requirements) if idx not in to_remove]
    if to_remove:
        print(f"      - Removed {len(to_remove)} duplicates.")
    return optimized_list

# ==================================================================================
# PART C: MAIN GENERATION LOOP (FIXED)
# ==================================================================================

def generate_ieee_srs(input_file):
    print(f"\n>>> PROCESSING: {input_file}")

    # Load Data
    df = pd.read_csv(input_file)

    # Limit for demonstration if dataset is huge (optional, remove [:50] for full run)
    # df = df[:50]

    func_reqs = []
    non_func_reqs = []

    print(f"   Translating {len(df)} requirements...")

    # --- THE FIX IS IN THIS LOOP ---
    for i, row in df.iterrows():
        hausa_text = str(row['tgt_ha'])
        label = row['label']

        # Translate
        inputs = NLLB_TOK(hausa_text, return_tensors="pt").to(DEVICE)

        # FIX: Use convert_tokens_to_ids instead of lang_code_to_id
        forced_bos = NLLB_TOK.convert_tokens_to_ids("eng_Latn")

        trans_tokens = NLLB_MODEL.generate(
            **inputs,
            forced_bos_token_id=forced_bos,
            max_length=128
        )
        eng_text = NLLB_TOK.decode(trans_tokens[0], skip_special_tokens=True)

        req_obj = {'text': eng_text, 'source': hausa_text}

        if label == 'FR':
            func_reqs.append(req_obj)
        else:
            non_func_reqs.append(req_obj)

    # 2. OPTIMIZATION PHASE
    print("\n>>> RUNNING QUALITY ASSURANCE MODULE...")
    final_fr = optimize_redundancy(func_reqs)
    final_nfr = optimize_redundancy(non_func_reqs)

    # 3. DOCUMENT GENERATION
    print("\n>>> GENERATING DOCX...")
    doc = Document()

    doc.add_heading('Software Requirements Specification', 0)
    doc.add_paragraph('Generated by: NLP-Supported Low-Resource Framework')
    doc.add_paragraph('Standard: IEEE 830-1998')
    doc.add_page_break()

    doc.add_heading('1. Introduction', level=1)
    doc.add_paragraph("This document outlines the software requirements extracted from Hausa legacy data.")

    doc.add_heading('3. Specific Requirements', level=1)

    doc.add_heading('3.1 Functional Requirements', level=2)
    if not final_fr: doc.add_paragraph("No Functional Requirements found.")
    for i, req in enumerate(final_fr, 1):
        p = doc.add_paragraph()
        runner = p.add_run(f"FR-{i:03}: {req['text']}")
        runner.bold = True
        ambiguity_note = check_ambiguity(req['text'])
        if ambiguity_note:
            p.add_run(ambiguity_note).font.color.rgb = RGBColor(255, 0, 0)

    doc.add_heading('3.2 Non-Functional Requirements', level=2)
    if not final_nfr: doc.add_paragraph("No Non-Functional Requirements found.")
    for i, req in enumerate(final_nfr, 1):
        p = doc.add_paragraph()
        runner = p.add_run(f"NFR-{i:03}: {req['text']}")
        ambiguity_note = check_ambiguity(req['text'])
        if ambiguity_note:
            p.add_run(ambiguity_note).font.color.rgb = RGBColor(255, 0, 0)

    save_path = f"{OUTPUT_DIR}/Final_SRS_Report.docx"
    doc.save(save_path)
    print(f"SUCCESS! Document saved to: {save_path}")

# ==================================================================================
# EXECUTION
if __name__ == "__main__":
    NLLB_TOK, NLLB_MODEL, LABSE_TOK, LABSE_MODEL = load_models()
    INPUT_FILE = f"{BASE_DIR}/class_test.csv"
    generate_ieee_srs(INPUT_FILE)
    '''
    How to Verify This for Your Thesis
Run the script.

Open the generated .docx file.

Screenshot the Red Text: Find a line that says [⚠️ ISO WARNING: Vague terms found...]. Take a screenshot.

Caption: "Figure 4.6: Automated Ambiguity Detection enforcing ISO quality standards."

Screenshot the Console Output: Find the line that says [DROP] '...' is a duplicate. Take a screenshot.

Caption: "Figure 4.7: Redundancy Optimization removing semantic duplicates."
    '''

Mounted at /content/drive
Directory Setup Complete. Using cuda.

>>> LOADING AI MODELS...
   1. Loading Translation Engine (NLLB-200)...
   2. Loading Semantic Guardrail (LaBSE)...

>>> PROCESSING: /content/drive/MyDrive/Thesis/final_datasets/class_test.csv
   Translating 5018 requirements...


Token indices sequence length is longer than the specified maximum sequence length for this model (1107 > 1024). Running this sequence through the model will result in indexing errors



>>> RUNNING QUALITY ASSURANCE MODULE...
   ...Optimizing List (3289 items)...


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.82 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.05 GiB is free. Process 4949 has 12.69 GiB memory in use. Of the allocated memory 11.50 GiB is allocated by PyTorch, and 1.07 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Script 10: Real-World Document Processing & Batch Execution

This script is the Production Entry Point for your framework. While Script 9 handles the logic for a single sentence, Script 10 allows the system to scale. It acts as a bridge between the user's file system (Google Drive) and your AI pipeline, enabling the batch processing of entire legacy documents (PDFs, DOCX) into English SRS specifications.

Key Technical Functions:

Universal File Ingestion: It implements a robust read_document function that automatically detects file types.

PDFs: Uses pdfplumber to extract text from read-only scans or exports.

DOCX: Uses python-docx to parse editable Word documents.

TXT: Handles raw text dumps.

Batch Integration with AI Pipeline: It connects directly to the RealSRSPipeline class (from Script 9). It iterates through every extracted line of the source document, sending it to the NLLB+LoRA model for translation and the Logic Engine for formatting, handling the entire document flow in a loop.

Automated Classification Heuristics: It applies a keyword-based heuristic layer (is_nfr) specifically tuned for Hausa technical terms (e.g., "tsaro" for security, "gudu" for speed). This serves as a "Pre-Filter" to aid the final classification, ensuring that requirements are sorted correctly into Functional vs. Non-Functional sections in the output.

Error-Resilient Execution: The script includes try/except blocks inside the processing loop. If one malformed sentence crashes the model (e.g., due to special characters), the script logs the error, skips that line, and continues processing the rest of the document, ensuring the user still gets a usable result.

In [None]:
# FINAL REAL-WORLD PROCESSING SCRIPT (FIXED)
# 1. INSTALL DEPENDENCIES FIRST
!pip install -q pdfplumber python-docx

import os
import shutil
from docx import Document
import pdfplumber
from google.colab import drive

# Mount Drive if not already done
if not os.path.exists("/content/drive"):
    drive.mount("/content/drive", force_remount=True)

# --- 2. CONFIGURATION ---
BASE_DIR = "/content/drive/MyDrive/Thesis/inputs"

# CHANGE THIS to your exact filename in Google Drive
INPUT_FILE_NAME = "Online_shopping.docx"
# INPUT_FILE_NAME = "Project_Specs.pdf"

# Paths
INPUT_PATH = os.path.join(BASE_DIR, INPUT_FILE_NAME)
OUTPUT_FILENAME = f"Translated_{os.path.splitext(INPUT_FILE_NAME)[0]}.docx"
OUTPUT_PATH = os.path.join(BASE_DIR, "outputs", OUTPUT_FILENAME)

# Ensure output directory exists
os.makedirs(os.path.join(BASE_DIR, "outputs"), exist_ok=True)

# ==========================================
# 3. HELPER FUNCTIONS
# ==========================================
def read_document(file_path):
    """Reads text from DOCX, PDF, or TXT."""
    if not os.path.exists(file_path):
        print(f" ERROR: File not found at: {file_path}")
        print(f"   Please upload '{os.path.basename(file_path)}' to your Thesis folder in Drive.")
        return []

    ext = os.path.splitext(file_path)[1].lower()
    text_lines = []
    print(f" Reading file: {file_path}")

    try:
        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text: text_lines.extend(text.split('\n'))

        elif ext == ".docx":
            doc = Document(file_path)
            for p in doc.paragraphs:
                if p.text.strip(): text_lines.append(p.text)

        elif ext == ".txt":
            with open(file_path, 'r', encoding='utf-8') as f:
                text_lines = f.readlines()
        else:
            print(" Unsupported format. Use .docx, .pdf, or .txt")
            return []

    except Exception as e:
        print(f" Error reading file: {e}")
        return []

    clean_lines = [line.strip() for line in text_lines if len(line.strip()) > 5]
    print(f" Extracted {len(clean_lines)} lines.")
    return clean_lines

# 4. EXECUTION LOOP (FIXED)

if __name__ == "__main__":
    # Ensure Service is running
    if 'service' not in locals():
        print(" Initializing AI Service (This might take 30s)...")
        try:
            service = RealSRSPipeline()
        except NameError:
            print(" Error: 'RealSRSPipeline' is not defined. Please run the Stage 9 block first.")
            exit()

    # 1. Read Data
    raw_lines = read_document(INPUT_PATH)

    if raw_lines:
        processed_reqs = []
        print(f"\n Processing {len(raw_lines)} lines...")

        for i, line in enumerate(raw_lines):
            try:
                # --- AI PROCESSING (FIX IS HERE) ---
                # We now unpack the 3 values returned by the service
                formatted_text, raw_trans, score = service.process(line)

                # Heuristic for Classification
                # (You can tune these keywords for Hausa context)
                is_nfr = any(kw in line.lower() for kw in ["dole", "tsaro", "gudu", "ajiya", "samuwa", "performant"])
                req_type = "NFR" if is_nfr else "FR"

                processed_reqs.append({
                    "type": req_type,
                    "formatted_text": formatted_text # Now this is a pure string, not a tuple!
                })

                if i % 5 == 0:
                    print(f"   [{i}/{len(raw_lines)}] {formatted_text[:50]}...")

            except Exception as e:
                print(f"    Skipped line {i}: {e}")

        # 2. Save Output
        print(f"\n Saving to: {OUTPUT_PATH}")
        # Note: Ensure the filename variable is correct here
        service.generator.generate_docx(processed_reqs, OUTPUT_FILENAME)

        print("\n DONE! File saved successfully to your Drive.")
    else:
        print("\n No content found. Check your filename.")

 Reading file: /content/drive/MyDrive/Thesis/inputs/Online_shopping.docx
 Extracted 68 lines.

 Processing 68 lines...
   [0/68] FR: SRS for Online Shopping (Actor Context: Mai am...
   [5/68] FR: A document is one that describes the requireme...
   [10/68] FR: The designated system is the solution for the ...
   [15/68] NFR: The system shall demonstrate performance cons...
   [20/68] NFR: The system shall demonstrate performance cons...
   [25/68] NFR: The system shall demonstrate performance cons...
   [30/68] NFR: The system shall demonstrate performance cons...
   [35/68] NFR: The system shall demonstrate performance cons...
   [40/68] NFR: The system shall demonstrate performance cons...
   [45/68] FR: All transactions that are going on in the webs...
   [50/68] NFR: The system shall demonstrate performance cons...
   [55/68] NFR: The system shall demonstrate performance cons...
   [60/68] NFR: The system shall demonstrate performance cons...
   [65/68] FR: If a large area of the 