In [3]:
import numpy as np 
import pandas as pd 

import nltk
nltk.download('punkt_tab') 

from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\anagh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Load the labelled requirements dataset.
df = pd.read_csv("./SRS/software_requirements_extended.csv")
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Type         977 non-null    object
 1   Requirement  977 non-null    object
dtypes: object(2)
memory usage: 15.4+ KB
None


In [5]:
# Preview the first ten requirement texts, each followed by a separator line.
for requirement in df["Requirement"].iloc[:10]:
    print(requirement, "*****************************", sep="\n")

The system shall refresh the display every 60 seconds.
*****************************
The application shall match the color of the schema set forth by Department of Homeland Security
*****************************
 If projected  the data must be readable.  On a 10x10 projection screen  90% of viewers must be able to read Event / Activity data from a viewing distance of 30
*****************************
 The product shall be available during normal business hours. As long as the user has access to the client PC  the system will be available 99% of the time during the first six months of operation.
*****************************
 If projected  the data must be understandable. On a 10x10 projection screen  90% of viewers must be able to determine that Events or Activities are occuring in current time from a viewing distance of 100
*****************************
The product shall ensure that it can only be accessed by authorized users.  The product will be able to distinguish between authorized

In [36]:
# Scratch check that a row can be appended by label. It runs on a copy so the
# placeholder row never reaches `df`, which other cells tokenize and export.
df_with_probe = df.copy()
df_with_probe.loc[len(df_with_probe)] = ["new", "TESTING"]
print(df_with_probe.iloc[-1, :])

Type               new
Requirement    TESTING
Name: 977, dtype: object


In [6]:
# Split every requirement text into individual sentences with sent_tokenize.
corpus = df["Requirement"]
# A flat comprehension replaces the side-effect-only list comprehension, which
# built and discarded a throwaway list of None values on every iteration.
sentences = [
    sentence
    for requirement in corpus
    for sentence in sent_tokenize(requirement)
]

In [8]:
# Persist the tokenized sentences as a single-column CSV without the index.
newDf = pd.DataFrame(sentences, columns=["sentence"])
newDf.to_csv("./Sentences.csv", index=False)

In [61]:
import nltk

# Small hand-written context-free grammar for the example sentence below.
grammar = nltk.CFG.fromstring("""
  S -> NP VP
  NP -> DT JJ JJ NN | DT JJ NN | DT NN
  VP -> VBZ PP
  PP -> IN NP
  DT -> 'The' | 'the'
  JJ -> 'quick' | 'brown' | 'lazy'
  NN -> 'fox' | 'dog'
  VBZ -> 'jumps'
  IN -> 'over'
""")

# Whitespace-split tokens; every word must appear as a terminal in the grammar.
sentence = "The quick brown fox jumps over the lazy dog".split()
parser = nltk.ChartParser(grammar)

# Show each parse in bracketed form and as an ASCII tree.
for parse_tree in parser.parse(sentence):
    print(parse_tree)
    parse_tree.pretty_print()


(S
  (NP (DT The) (JJ quick) (JJ brown) (NN fox))
  (VP (VBZ jumps) (PP (IN over) (NP (DT the) (JJ lazy) (NN dog)))))
                      S                        
       _______________|_________                
      |                         VP             
      |                _________|___            
      |               |             PP         
      |               |     ________|___        
      NP              |    |            NP     
  ____|__________     |    |     _______|____   
 DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN
 |    |     |    |    |    |    |       |    |  
The quick brown fox jumps over the     lazy dog



In [65]:
import nltk

# Toy grammar in which two clauses can be coordinated by a conjunction
# (S -> S Conj S), used to count the clauses in a parse.
grammar = nltk.CFG.fromstring("""
S -> NP VP | S Conj S
NP -> DT NN | DT JJ NN
VP -> VBZ NP | VBZ PP
PP -> IN NP
DT -> 'The' | 'the'
JJ -> 'quick' | 'brown' | 'lazy'
NN -> 'fox' | 'dog'
VBZ -> 'jumps' | 'runs'
IN -> 'over' | 'under'
Conj -> 'and' | 'but'
""")

sentence = "The quick fox jumps over the dog and the lazy dog runs under the fox".split()
parser = nltk.ChartParser(grammar)

for parse_tree in parser.parse(sentence):
    print(parse_tree)
    parse_tree.pretty_print()
    # Count the subtrees whose label is S.
    clause_count = sum(1 for _ in parse_tree.subtrees(lambda t: t.label() == 'S'))
    print("Number of clauses (S) in tree:", clause_count)


(S
  (S
    (NP (DT The) (JJ quick) (NN fox))
    (VP (VBZ jumps) (PP (IN over) (NP (DT the) (NN dog)))))
  (Conj and)
  (S
    (NP (DT the) (JJ lazy) (NN dog))
    (VP (VBZ runs) (PP (IN under) (NP (DT the) (NN fox))))))
                                          S                                          
                 _________________________|_________________                          
                S                         |                 S                        
       _________|_________                |         ________|__________               
      |                   VP              |        |                   VP            
      |          _________|___            |        |         __________|___           
      |         |             PP          |        |        |              PP        
      |         |     ________|___        |        |        |      ________|___       
      NP        |    |            NP      |        NP       |     |            NP    


In [14]:
import spacy

# Example with a subordinate "because" clause.
sentence = "The quick brown fox jumps over the lazy dog because it was startled by a noise."

# Parse it with the small English pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

# Print each sentence span found in the parsed document, one per line.
for span in doc.sents:
    print(span.text)

The quick brown fox jumps over the lazy dog because it was startled by a noise.


In [16]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

sentence = "The quick brown fox jumps over the lazy dog because it was startled by a loud noise coming from the forest."

doc = nlp(sentence)

# A token closes the current fragment only when BOTH its dependency label and
# its surface text match these sets.
SPLIT_DEPS = {"mark", "cc", "punct"}
SPLIT_WORDS = {",", "and", "but", "because"}

# Split into smaller clauses based on conjunctions or subordinate clauses
simplified_sentences = []
current = []

for token in doc:
    print(token)  # token-by-token trace, kept so the cell's output is unchanged
    if token.dep_ in SPLIT_DEPS and token.text in SPLIT_WORDS:
        # The delimiter itself is dropped. Empty fragments are skipped; they
        # arise when two delimiters are adjacent (e.g. ", and") or when a
        # delimiter opens the text, and used to be emitted as blank entries.
        fragment = " ".join(current).strip()
        if fragment:
            simplified_sentences.append(fragment)
        current = []
    else:
        current.append(token.text)

# Flush whatever follows the last delimiter.
tail = " ".join(current).strip()
if tail:
    simplified_sentences.append(tail)

print("Simplified versions:")
for s in simplified_sentences:
    print("-", s)


The
quick
brown
fox
jumps
over
the
lazy
dog
because
it
was
startled
by
a
loud
noise
coming
from
the
forest
.
Simplified versions:
- The quick brown fox jumps over the lazy dog
- it was startled by a loud noise coming from the forest .


In [17]:
import spacy

# Example sentence containing a "because" clause and a "that" clause.
text = "The quick brown fox jumps over the lazy dog because it was startled by a noise that came from the forest."

# Parse it with the small English pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

def simplify_sentence(doc) -> list[str]:
    """
    Collect the text of the clause headed by each main verb.

    A main verb is a token whose ``pos_`` is ``"VERB"`` and whose ``dep_`` is
    ``"ROOT"`` or ``"conj"``. For every such verb, the ``text`` of each token
    in ``verb.subtree`` is joined with single spaces.

    Args:
        doc: Iterable of parsed tokens exposing ``pos_``, ``dep_``, ``text``
            and ``subtree`` (for example a spaCy ``Doc``).

    Returns:
        One string per main verb, in document order. The list is empty when
        no token qualifies.

    Note:
        The subtree of the ``ROOT`` verb presumably covers the whole sentence,
        so its entry repeats the full text rather than an isolated clause
        (the recorded output of this cell shows exactly that).
    """
    main_verbs = (
        token
        for token in doc
        if token.pos_ == "VERB" and token.dep_ in ("ROOT", "conj")
    )
    return [" ".join(t.text for t in verb.subtree) for verb in main_verbs]

# Run the splitter on the parsed example and list what it returned.
simplified_sentences = simplify_sentence(doc)

print("Simplified sentences:")
for clause in simplified_sentences:
    print("-", clause)

# NOTE(review): from here to the end of the cell the code repeats the cell's
# opening lines, which is why the recorded output appears twice.
import spacy

text = "The quick brown fox jumps over the lazy dog because it was startled by a noise that came from the forest."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# NOTE(review): this repeats the function defined earlier in the same cell;
# being later, it is the definition in effect for the call that follows.
def simplify_sentence(doc) -> list[str]:
    """
    Collect the text of the clause headed by each main verb.

    A main verb is a token whose ``pos_`` is ``"VERB"`` and whose ``dep_`` is
    ``"ROOT"`` or ``"conj"``. For every such verb, the ``text`` of each token
    in ``verb.subtree`` is joined with single spaces.

    Args:
        doc: Iterable of parsed tokens exposing ``pos_``, ``dep_``, ``text``
            and ``subtree`` (for example a spaCy ``Doc``).

    Returns:
        One string per main verb, in document order. The list is empty when
        no token qualifies.

    Note:
        The subtree of the ``ROOT`` verb presumably covers the whole sentence,
        so its entry repeats the full text rather than an isolated clause
        (the recorded output of this cell shows exactly that).
    """
    main_verbs = (
        token
        for token in doc
        if token.pos_ == "VERB" and token.dep_ in ("ROOT", "conj")
    )
    return [" ".join(t.text for t in verb.subtree) for verb in main_verbs]

# Run the splitter on the parsed example and list what it returned.
simplified_sentences = simplify_sentence(doc)

print("Simplified sentences:")
for clause in simplified_sentences:
    print("-", clause)


Simplified sentences:
- The quick brown fox jumps over the lazy dog because it was startled by a noise that came from the forest .
Simplified sentences:
- The quick brown fox jumps over the lazy dog because it was startled by a noise that came from the forest .


In [11]:
import sys

# Report the interpreter that runs this kernel. `!where python` exists only on
# Windows and lists whatever is on PATH, which need not be the kernel's Python.
print(sys.executable)


C:\Users\anagh\anaconda3\python.exe


In [12]:
!conda install -c conda-forge spacy -y


Jupyter detected...
3 channel Terms of Service accepted
Retrieving notices: done
Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: C:\Users\anagh\anaconda3

  added / updated specs:
    - spacy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.10.5  |       h4c7d964_0         153 KB  conda-forge
    catalogue-2.0.10           |  py313hfa70ccb_2          43 KB  conda-forge
    cloudpathlib-0.23.0        |     pyhd8ed1ab_0          50 KB  conda-forge
    conda-25.7.0               |  py313hfa70ccb_0         1.2 MB  conda-forge
    confection-0.1.5           |     pyhecae5ae_0          37 KB  conda-forge
    cymem-2.0.11               |  py313hfe59770_1          42 KB  conda-forge
    cython-blis-1.3.0          |  py313h8e081ca_0         3.2 MB  c



    current version: 25.5.1
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c defaults conda




In [13]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 2.9/12.8 MB 18.8 MB/s eta 0:00:01
     ------------------------ --------------- 7.9/12.8 MB 22.5 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 23.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 21.0 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
!pip install benepar
!python -m benepar.download benepar_en3

Collecting benepar
  Downloading benepar-0.2.0.tar.gz (33 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch>=1.6.0 (from benepar)
  Downloading torch-2.9.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting torch-struct>=0.5 (from benepar)
  Downloading torch_struct-0.5-py3-none-any.whl.metadata (4.3 kB)
Collecting tokenizers>=0.9.4 (from benepar)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting transformers>=4.2.2 (from transformers[tokenizers,torch]>=4.2.2->benepar)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting sentencepiece>=0.1.91 (from benepar)
  Downloading sentencepiece-0.2.1-cp313-cp313-win_amd64.whl.metadata (10 kB)
Collecting huggingface-hub<2.0,>=0.16.4 (from tokenizers>=0.9.4->benepar)
  Downloading huggingface_hub-1.0.1-py3-none-any.whl.metadata (13 kB)
Collecting typer-slim (from huggingface-hub<2.0,>=0.16.4->tokenizers>=0.9.4->b

  DEPRECATION: Building 'benepar' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'benepar'. Discussion can be found at https://github.com/pypa/pip/issues/6334
C:\Users\anagh\anaconda3\python.exe: No module named benepar.download


In [4]:
from transformers import pipeline

# Text-to-text generation pipeline backed by t5-base, with max_length=128.
simplifier = pipeline(
    "text2text-generation",
    model="t5-base",
    max_length=128,
)

text = "The system, which was designed to handle high loads, shall refresh the display every 60 seconds to ensure consistency."

# NOTE(review): the recorded output merely echoes the input; "simplify:" may
# not be a task prefix this checkpoint was trained on — confirm before relying on it.
output = simplifier(f"simplify: {text}")
print(output[0]["generated_text"])


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


: The system, which was designed to handle high loads, shall refresh the display every 60 seconds to ensure consistency.


In [8]:
# Second trial of the same pipeline on a longer requirement sentence.
text = "On a 10x10 projection screen  90% of viewers must be able to determine that Events or Activities are occuring in current time from a viewing distance of 100"
output = simplifier(f"simplify: {text}")
print(output[0]["generated_text"])

the following: 10x10 projection screen. 10x10 projection screen. 10x10 projection screen. distance of 100 meters. 10x10 projection screen. meters. 100 meters. 10x10 projection screen 90% of viewers must be able to determine that meters... 100 meters. 10x10 meters. 100 meters 100 meters. meters. 100 meters. 100 meters... meters.. meters. meters.


In [10]:
!pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.186.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.42.1-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core->google-generativeai)
  Downloading googleapis_common_protos-1.71.0-py3-none-any.whl.met

In [22]:
import os

from dotenv import load_dotenv
import google.generativeai as genai

# Read the API key from the environment after loading .env.
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
# NOTE(review): os.getenv returns None when the variable is missing, and the
# value is passed on unchecked — confirm how the client behaves in that case.
genai.configure(api_key=api_key)

model = genai.GenerativeModel("gemini-2.5-flash")

text = "On a 10x10 projection screen 90% of viewers must be able to determine that Events or Activities are occurring in current time from a viewing distance of 100 meters."

# NOTE(review): the prompt text is sent to the model verbatim, including the
# typos "multilple" and "lenght" and the unitless limit "40" (words? characters?).
prompt = f"""You are a text simplification agent. Your job is to simplify text provided and return the simplified sentence.
The provided text might contain multilple clauses. The simplified sentences shouldn't contain more than 3 clauses per sentence and
the sentence lenght shouldn't be more than 40.
provide no other text with the response.
Input_text:\"{text}\""""

# Single request; print only the text of the reply.
response = model.generate_content(prompt)
print(response.text)


On a 10x10 screen, 90% of viewers must see live events and activities from 100 meters.


In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# XML files to parse, relative to the notebook's working directory.
files = [
    "./XMLZIPFile/0000 - cctns.xml",
    "./XMLZIPFile/0000 - gamma j.xml",
    "./XMLZIPFile/1995 - gemini.xml",
    "./XMLZIPFile/1998 - themas.xml",
    "./XMLZIPFile/1999 - dii.xml",
    "./XMLZIPFile/1999 - tcs.xml",
    "./XMLZIPFile/2003 - qheadache.xml",
    "./XMLZIPFile/2005 - microcare.xml",
    "./XMLZIPFile/2005 - phin.xml",
    "./XMLZIPFile/2006 - eirene sys 15.xml",
    "./XMLZIPFile/2007 - get real 0.2.xml",
    "./XMLZIPFile/2007-eirene_fun_7-2.xml",
    "./XMLZIPFile/2007-ertms.xml",
    "./XMLZIPFile/2008 - keepass.xml",
    "./XMLZIPFile/2008 - peering.xml",
    "./XMLZIPFile/2009 - peppol approved.xml",
    "./XMLZIPFile/2009 - video search.xml",
    "./XMLZIPFile/2010-blitdraft.xml",
]

# Collect the stripped, non-empty text of every element in every file, in
# file order and then document order.
raw_data = []
for xml_path in files:
    root = ET.parse(xml_path).getroot()
    # Only `elem.text` is read; text stored in `elem.tail` is not collected.
    # extend() replaces a list comprehension that was run purely for its
    # append() side effect, and the global `text` is no longer overwritten.
    raw_data.extend(
        elem.text.strip()
        for elem in root.iter()
        if elem.text and elem.text.strip()
    )

# One extracted text per row, written without the index column.
corpusDf = pd.DataFrame({"sentence": raw_data})
corpusDf.to_csv("./srsCorpus.csv", index=False)


In [4]:
# Split every extracted text into sentences and export one sentence per row.
# A flat comprehension replaces the side-effect-only list comprehension, which
# built and discarded a throwaway list of None values on every iteration.
sentences = [
    sentence
    for passage in corpusDf["sentence"]
    for sentence in sent_tokenize(passage)
]
finalSrsDf = pd.DataFrame({"sentence": sentences})
finalSrsDf.to_csv("./srs.csv", index=False)