In [240]:
import pandas as pd
import spacy
import textacy
import numpy as np
from sklearn.cluster import Birch

In [231]:
pd.options.display.max_colwidth = 100

In [27]:
nlp = spacy.load('en_core_web_md')

In [2]:
download_manager_reqs = pd.read_csv('./requirement/download_manager_reqs.csv')
antivirus_reqs = pd.read_csv('./requirement/antivirus_reqs.csv')
compression_reqs = pd.read_csv('./requirement/compression_reqs.csv')
file_sharing_reqs = pd.read_csv('./requirement/file_sharing_reqs.csv')
news_reqs = pd.read_csv('./requirement/news_reqs.csv')
vpn_reqs = pd.read_csv('./requirement/vpn_reqs.csv')
web_browser_reqs = pd.read_csv('./requirement/web_browser_reqs.csv')

In [23]:
reqs_df = pd.concat([download_manager_reqs, antivirus_reqs, compression_reqs, file_sharing_reqs, news_reqs, vpn_reqs, web_browser_reqs])

In [24]:
reqs_df.drop(labels=['url', 'app'], axis=1, inplace=True)
reqs_df.reset_index(inplace=True, drop=True)

In [28]:
# add new column for requirement length
reqs_df.loc[:, 'req_length'] = reqs_df['requirement'].apply(lambda r: len(r.split()))
# add new column to store requirements as spacy documents
reqs_df.loc[:, 'spacy_req'] = reqs_df['requirement'].apply(lambda r: nlp(r))
# add new column for lemmatization doc
reqs_df.loc[:, 'spacy_req_lemma'] = reqs_df['spacy_req'].apply(lambda doc: nlp(doc[:].lemma_))

In [151]:
reqs_df.head()

Unnamed: 0,domain,requirement,req_length,spacy_req,spacy_req_lemma
0,Download Manager,Download accelerator,2,"(Download, accelerator)","(download, accelerator)"
1,Download Manager,"Browser integration: IE, Chrome, Firefox, Opera",6,"(Browser, integration, :, IE, ,, Chrome, ,, Fi...","(browser, integration, :, IE, ,, Chrome, ,, Fi..."
2,Download Manager,"Support for HTTP, HTTPS, FTP, MMS and RTSP",8,"(Support, for, HTTP, ,, HTTPS, ,, FTP, ,, MMS,...","(support, for, HTTP, ,, HTTPS, ,, FTP, ,, MMS,..."
3,Download Manager,Multi-thread support to speed up downloads,6,"(Multi, -, thread, support, to, speed, up, dow...","(multi, -, thread, support, to, speed, up, dow..."
4,Download Manager,Automatic or manual downloads,4,"(Automatic, or, manual, downloads)","(automatic, or, manual, download)"


In [305]:
reqs_df.groupby('domain').count()

Unnamed: 0_level_0,requirement,req_length,spacy_req,spacy_req_lemma
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Antivirus,1814,1814,1814,1814
Compression Tool,1327,1327,1327,1327
Download Manager,2775,2775,2775,2775
File Sharing,1848,1848,1848,1848
News,872,872,872,872
VPN,287,287,287,287
Web Browser,1756,1756,1756,1756


### Extract boilerplate attributes given a spacy doc

In [217]:
def extract_boilerplate_attributes(doc):
    verb_phrases = list(textacy.extract.pos_regex_matches(doc=doc, pattern=r'<VERB>?<ADV>*<VERB>+'))
#     noun_phrases = list(doc.noun_chunks)
    
    verbs = []
    objects = []
    details = []
    
    for ix in range(len(verb_phrases)):
        if ix < (len(verb_phrases) - 1):
            internal_noun_phrases = list(doc[verb_phrases[ix].end:verb_phrases[ix+1].start].noun_chunks)
        else:
            internal_noun_phrases = list(doc[verb_phrases[ix].end:].noun_chunks)
        
        if (len(internal_noun_phrases) > 1):
            objects.append(internal_noun_phrases[0].text)
            details.append(internal_noun_phrases[1].text)
            verbs.append(verb_phrases[ix].text)
#     verbs = [v.text for v in verb_phrases]
#     objects = [noun_phrases[i].text for i in range(len(noun_phrases)) if i % 2 == 0]
#     details = [noun_phrases[i].text for i in range(len(noun_phrases)) if i % 2 == 1]
    assert len(verbs) == len(objects) and len(objects) == len(details)
    return verbs, objects, details

### Create boilerplate attributes dataframe

In [None]:
boilerplate_df = pd.DataFrame(columns=['req_id', 'domain', 'verb', 'object', 'detail'])

req_ids = []
domains = []
verbs = []
objects = []
additional_details = []

for id, row in reqs_df.iterrows():
#     print(row['spacy_req_lemma'])
    vs, objs, dts = extract_boilerplate_attributes(row['spacy_req_lemma'])
    if not vs or not objs or not dts:
        continue
#     print(vs, objs, dts)
#     print(len(vs), len(objs), len(dts))
    assert len(vs) == len(objs) and len(objs) == len(dts)
    req_ids.extend([id] * len(vs))
    domains.extend([row['domain']] * len(vs))
    verbs.extend(vs)
    objects.extend(objs)
    additional_details.extend(dts)

print(len(domains), len(req_ids),  len(additional_details), len(verbs), len(objects))
    
boilerplate_df = pd.DataFrame({
    'req_id': req_ids,
    'domain': domains,
    'verb': verbs,
    'object': objects,
    'detail': additional_details
})



In [226]:
boilerplate_df.domain.unique()

array(['Download Manager', 'Antivirus', 'Compression Tool',
       'File Sharing', 'News', 'VPN', 'Web Browser'], dtype=object)

In [208]:
# list(textacy.extract.matches(doc=reqs_df['spacy_req_lemma'].loc[20], patterns=[{'POS': 'VERB'}, {'OP': '?'}, {'POS': 'ADV', 'OP': '*', 'POS': 'VERB', 'OP': '+'}]))

[will make]

In [233]:
# construct boilerplate requirements and spacy requirements
boilerplate_df['boilerplate_req'] = 'System shall provide the user with the ability to ' + boilerplate_df['verb'] + ' ' + boilerplate_df['object'] + ' ' + boilerplate_df['detail']
boilerplate_df.loc[:, 'spacy_boilerplate_req'] = boilerplate_df['boilerplate_req'].apply(lambda r: nlp(r))

In [236]:
boilerplate_df.head()

Unnamed: 0,req_id,domain,verb,object,detail,boilerplate_req,spacy_boilerplate_req
0,7,Download Manager,check,file integrity,md5,System shall provide the user with the ability to check file integrity md5,"(System, shall, provide, the, user, with, the, ability, to, check, file, integrity, md5)"
1,16,Download Manager,support,all version,popular browser,System shall provide the user with the ability to support all version popular browser,"(System, shall, provide, the, user, with, the, ability, to, support, all, version, popular, brow..."
2,17,Download Manager,click,a download link,a browser,System shall provide the user with the ability to click a download link a browser,"(System, shall, provide, the, user, with, the, ability, to, click, a, download, link, a, browser)"
3,17,Download Manager,accelerate,IDM support HTTP,ftp,System shall provide the user with the ability to accelerate IDM support HTTP ftp,"(System, shall, provide, the, user, with, the, ability, to, accelerate, IDM, support, HTTP, ftp)"
4,18,Download Manager,can accelerate,download,up to 5 time,System shall provide the user with the ability to can accelerate download up to 5 time,"(System, shall, provide, the, user, with, the, ability, to, can, accelerate, download, up, to, 5..."


#### Cluster and generate new requirements

In [272]:
def cluster_requirements(df):
    req_vectors = [req_vec.vector for req_vec in df['spacy_boilerplate_req']]
    NUM_CLUSTERS = 5
    brc = Birch(branching_factor=50, n_clusters=NUM_CLUSTERS, threshold=0.05, compute_labels=True)
    brc.fit(req_vectors)
    clusters = brc.predict(req_vectors)
    # indexes[i] contains indices of all requirements belong to cluster (i+1)
    indices = [np.where(clusters == i) for i in range(NUM_CLUSTERS)]
    return indices

In [291]:
BOILERPLATE = 'System shall provide the user with the ability to {} {} {}'
def make_requirements(elements):
    """
    make a requirement from boilerplate elements
    :param elements: DataFrame containing verb, object, and additional information of a requirement
    :return: a list of requirements created by combining elements value
    """

    verbs = []
    objs = []
    details = []

    for req_series in elements:
        verbs.append(req_series['verb'])
        objs.append(req_series['object'])
        details.append(req_series['detail'])

    requirements = []
    for i in range(len(verbs)):
        for j in range(len(objs)):
            for k in range(len(details)):
                if i != j and j != k:
                    requirements.append(make_requirement(verbs[i], objs[j], details[k]))

    return requirements


def make_requirement(verb, obj, detail):
    return BOILERPLATE.format(verb, obj, detail)

In [284]:
boilerplate_df['domain'].unique()

array(['Download Manager', 'Antivirus', 'Compression Tool',
       'File Sharing', 'News', 'VPN', 'Web Browser'], dtype=object)

In [294]:
download_manager_df = boilerplate_df[boilerplate_df['domain'] == 'Download Manager']
antivirus_df = boilerplate_df[boilerplate_df['domain'] == 'Antivirus']
compression_tool_df = boilerplate_df[boilerplate_df['domain'] == 'Compression Tool']
file_sharing_df = boilerplate_df[boilerplate_df['domain'] == 'File Sharing']
news_df = boilerplate_df[boilerplate_df['domain'] == 'News']
vpn_df = boilerplate_df[boilerplate_df['domain'] == 'VPN']
web_browser_df = boilerplate_df[boilerplate_df['domain'] == 'Web Browser']

In [295]:
download_manager_indices = cluster_requirements(download_manager_df)
antivirus_indices = cluster_requirements(antivirus_df)
compression_tool_indices = cluster_requirements(compression_tool_df)
file_sharing_indices = cluster_requirements(file_sharing_df)
news_indices = cluster_requirements(news_df)
vpn_indices = cluster_requirements(vpn_df)
web_browser_indices = cluster_requirements(web_browser_df)

In [304]:
def generate_requirement(indices):
    requirements = []
    for ind in indices:
        for _ in range(1):
            # print(ind)
            triple = np.random.choice(ind[0], 3)
            bl_1 = boilerplate_df.iloc[triple[0]]
            bl_2 = boilerplate_df.iloc[triple[1]]
            bl_3 = boilerplate_df.iloc[triple[2]]
            requirements.extend(make_requirements([bl_1, bl_2, bl_3]))
#     print('----------------END-----------------\n\n\n')
    return pd.DataFrame({'requirement': requirements})

In [301]:
download_manager_requirements = generate_requirement(download_manager_indices)
antivirus_requirements = generate_requirement(antivirus_indices)
compression_tool_requirements = generate_requirement(compression_tool_indices)
file_sharing_requirements = generate_requirement(file_sharing_indices)
news_requirements = generate_requirement(news_indices)
vpn_requirements = generate_requirement(vpn_indices)
web_browser_requirements = generate_requirement(web_browser_indices)

----------------END-----------------



----------------END-----------------



----------------END-----------------



----------------END-----------------



----------------END-----------------



----------------END-----------------



----------------END-----------------





In [303]:
antivirus_requirements

Unnamed: 0,requirement
0,System shall provide the user with the ability to support such weakness medium file
1,System shall provide the user with the ability to support such weakness the express interface
2,System shall provide the user with the ability to support online movie medium file
3,System shall provide the user with the ability to support online movie the feature
4,System shall provide the user with the ability to detect different format the feature
5,System shall provide the user with the ability to detect different format the express interface
6,System shall provide the user with the ability to detect online movie medium file
7,System shall provide the user with the ability to detect online movie the feature
8,System shall provide the user with the ability to can download different format the feature
9,System shall provide the user with the ability to can download different format the express inte...
