# Conceptual Modeling for User Story
A Capstone Project for Misk Data Science Immersive Course
 
Sarah A. AlQahtani

Dec 6, 2020

## Introduction

In software engineering, a user story is an informal, natural language description of one or more features of a software system. A user story is a tool used in Agile software development to capture a description of a software feature from an end-user perspective. A user story describes three componants:
- Who wants the functionality
- What functionality the end users or stakeholders want
the system to provide
- Why the end users and stakeholders need this functionality (optional).


## Dataset

Dataset: https://data.mendeley.com/datasets/7zbk8zsd8y/1

In [6]:
# load the requiered libraries
import math
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
from io import StringIO
import random
import pickle
import csv
# Reading PDFs
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# XML
import os
from xml.etree import ElementTree
# Preprocessing
from cleantext import clean
# Spacy
import spacy
from spacy.symbols import nsubj, VERB
from spacy.symbols import VERB, NOUN
from spacy.matcher import Matcher
import random
from spacy.util import minibatch, compounding

import warnings
#nlkt
import nltk

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [7]:
def get_text(full_file):
    '''
    func: get_text(user_story.txt)
    input:user_story.pdf
    output:plain text
    main packages: os
    user_story=get_text(full_file)
    '''
    file1=open(full_file,'r',encoding='windows-1252')
    user_story_lines = file1.readlines()
    user_story2=""
    for line in user_story_lines:
        user_story2=user_story2+str(line)
    user_story22=" ".join(user_story2.split('\n'))
    return user_story22

In [8]:
def preprocess_text(user_story):
  clean_user_story=clean(user_story,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="",
    replace_with_currency_symbol="",
    lang="en"                       # set to 'de' for German special handling
    )

  return clean_user_story

## Exploratory Data Analysis

In [9]:
def annotate_text(clean_user_story):
    rnlp=spacy.load('en_core_web_sm')
    rnlp.add_pipe(rme_custom_sents,before='parser')
    #rme_df=pd.DataFrame(columns=['id','sentence','role','mean','end','start_idx','end_idx','lable','token_text'])
    nlp2=spacy.load('en_core_web_sm')
    #nlp2.add_pipe(custom_sents,before='parser')
    docc=nlp2(clean_user_story)
    rme_df=pd.DataFrame()
    i=0
    id_ls=[]
    sen_ls=[]
    for sent_i,sc in enumerate(docc.sents):
        id_ls.append(sent_i+1)
        sen_ls.append(sc.text.strip('|').strip())
    rme_df['id']=id_ls
    rme_df['sentence']=sen_ls



    ent_df=pd.DataFrame()
    #entities_dic = {} 
    C2_start=[]
    C2_end=[]
    C2_lable=[]
    C2_text=[]
    C2_sent=[]
    c2_matcher=Matcher(nlp2.vocab)
    c2_pattern = [{'POS': 'NOUN'}]
    docc=nlp2(clean_user_story)
    c2_matcher.add('c2_pattern',None,c2_pattern)

    C3_start=[]
    C3_end=[]
    C3_lable=[]
    C3_text=[]
    C3_sent=[]
    c3_matcher=Matcher(nlp2.vocab)
    c3_pattern = [{'DEP': 'nsubj','POS': 'NOUN'}]
    c3_matcher.add('c3_pattern',None,c3_pattern)

    C4_start=[]
    C4_end=[]
    C4_lable=[]
    C4_text=[]
    C4_sent=[]
    c4_matcher=Matcher(nlp2.vocab)
    c4_pattern = [{'DEP': 'compound'}]
    c4_matcher.add('c4_pattern',None,c4_pattern)

    H2_start=[]
    H2_end=[]
    H2_lable=[]
    H2_text=[]
    H2_sent=[]

    R5_start=[]
    R5_end=[]
    R5_lable=[]
    R5_text=[]
    R5_sent=[]
    #for match_id,start,end in c2_matcher(docc):
        #span=docc[start:end]#C2
        #C2_start.append(span.start_char)
        #C2_end.append(span.end_char)
        #C2_lable.append("C2")
        #C2_text.append(span.text)
        #C2_sent.append(span.sent.text)

    for match_id,start,end in c3_matcher(docc):
        span=docc[start:end]#C3
        C3_start.append(span.start_char)
        C3_end.append(span.end_char)
        C3_lable.append("C3")
        C3_text.append(span.text)
        C3_sent.append(span.sent.text)

    ent_df['start_idx']=C3_start+C4_start
    ent_df['end_idx']=C3_end+C4_end
    ent_df['lable']=C3_lable+C4_lable
    #ent_df['text']=C2_text+C3_text+C4_text+H2_text+R5_text
    #ent_df['sentence']=C2_sent+C3_sent+C4_sent+H2_sent+R5_sent
    ent_df
    #temp_df=pd.merge(rme_df, ent_df, on='sentence')
    #temp_df

    rel_df=pd.DataFrame()
    V_start=[]
    V_end=[]
    V_lable=[]
    V_text=[]
    V_sent=[]
    V_matcher=Matcher(nlp2.vocab)
    v_pattern = [{'POS': 'VERB'}]
    #docc=nlp2(clean_user_story)
    V_matcher.add('v_pattern',None,v_pattern)

    Vp_start=[]
    Vp_end=[]
    Vp_lable=[]
    Vp_text=[]
    Vp_sent=[]
    Vp_matcher=Matcher(nlp2.vocab)
    vp_pattern = [{'POS': 'VERB'},
                    {'DEP':'prep'}]
    #docc=nlp2(clean_user_story)
    Vp_matcher.add('vp_pattern',None,vp_pattern)

    dobj_start=[]
    dobj_end=[]
    dobj_lable=[]
    dobj_text=[]
    dobj_sent=[]
    dobj_matcher=Matcher(nlp2.vocab)
    dobj_pattern = [{'DEP': 'dobj', 'POS': 'NOUN'}]
    dobj_matcher.add('dobj_pattern',None,dobj_pattern)

    iobj_start=[]
    iobj_end=[]
    iobj_lable=[]
    iobj_text=[]
    iobj_sent=[]
    iobj_matcher=Matcher(nlp2.vocab)
    iobj_pattern = [{'DEP': 'iobj', 'POS': 'NOUN'}]
    iobj_matcher.add('iobj_pattern',None,iobj_pattern)

    pobj_start=[]
    pobj_end=[]
    pobj_lable=[]
    pobj_text=[]
    pobj_sent=[]
    pobj_matcher=Matcher(nlp2.vocab)
    pobj_pattern = [{'DEP': 'pobj', 'POS': 'NOUN'}]
    pobj_matcher.add('pobj_pattern',None,pobj_pattern)

    for match_id,start,end in V_matcher(docc):
        span=docc[start:end]
        V_start.append(span.start_char)
        V_end.append(span.end_char)
        V_lable.append("R2")
        V_text.append(span.text)
        V_sent.append(span.sent.text)
        #for token in span:
            #if check_verb(token)=='TRANVERB':


            

    #for match_id,start,end in Vp_matcher(docc):
        #   span=docc[start:end]#R4
        #  Vp_start.append(span.start_char)
        # Vp_end.append(span.end_char)
        # Vp_lable.append("R4")
        # Vp_text.append(span.text)
        #Vp_sent.append(span.sent.text)


    for match_id,start,end in dobj_matcher(docc):
        span=docc[start:end]
        dobj_start.append(span.start_char)
        dobj_end.append(span.end_char)
        dobj_lable.append("C2")
        dobj_text.append(span.text)
        dobj_sent.append(span.sent.text)

    for match_id,start,end in pobj_matcher(docc):
        span=docc[start:end]
        pobj_start.append(span.start_char)
        pobj_end.append(span.end_char)
        pobj_lable.append("C2")
        pobj_text.append(span.text)
        pobj_sent.append(span.sent.text)    

    rel_df['start_idx']=V_start+dobj_start+pobj_start
    rel_df['end_idx']=V_end+dobj_end+pobj_start
    rel_df['lable']=V_lable+dobj_lable+pobj_lable
    #rel_df['text']=V_text+Vp_text+dobj_text+pobj_text
    #rel_df['sentence']=V_sent+Vp_sent+dobj_sent+pobj_sent
    #rel_temp_df=pd.merge(rme_df, rel_df, on='sentence')
    #full_df=pd.concat([temp_df,rel_temp_df])
    #full_df=full_df.drop_duplicates()

    full_df1=pd.concat([ent_df,rel_df])
    full_df1=full_df1.drop_duplicates()
    label_ls=[]
    for index,row in full_df1.iterrows():
        label_ls.append((row['start_idx'],row['end_idx'],row['lable']))
    label_dic={}
    label_dic['entities']=label_ls


    return label_dic


In [10]:
def generate_train_data(train_data_tp):
    #train_data_tp=(clean_user_story,labels_dic)
    text_file = open("train_data.txt", "w")
    text_file.write(str(train_data_tp))
    text_file.close()
    return text_file

In [11]:
rootdir='/Users/alqahtsa/py_projects/uml_generator/user_stories_txt'
#splitting data
train_set=math.floor(len([file for file in os.listdir(rootdir)])*.7)# 70% of tha data 

train_data_ls=[]
for subdir, dirs, files in os.walk(rootdir):
    for i,file in enumerate(files):
        if i<train_set:
            filepath = subdir + os.sep + file
            user_story=get_text(filepath)
            clean_user_story=preprocess_text(user_story)
            labels_dic=annotate_text(clean_user_story)
            train_data_tp=(clean_user_story,labels_dic)
            train_data_ls.append((train_data_tp))
generate_train_data(train_data_ls)

<_io.TextIOWrapper name='train_data.txt' mode='w' encoding='UTF-8'>

In [12]:
pkl_file=open('train_data.pkl','wb')
pickle.dump(train_data_ls,pkl_file)
pkl_file.close()


In [13]:
train_data=pickle.load(open('train_data.pkl','rb'))

In [14]:
nlp10=spacy.blank('en')
def train_model(train_data):
    if 'ner' not in nlp10.pipe_names:
        ner=nlp10.create_pipe('ner')
        nlp10.add_pipe(ner,last=True)
    for _,annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp10.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp10.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        
        nlp10.begin_training()
        for itn in range(30):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp10.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

In [15]:
train_model(train_data)


  proc.begin_training(
  proc.begin_training(
Losses {'ner': 26684.57846069336}
Losses {'ner': 19113.793411254883}
Losses {'ner': 11219.069913864136}
Losses {'ner': 10911.942171096802}
Losses {'ner': 10916.95958328247}
Losses {'ner': 10110.30926322937}
Losses {'ner': 9498.654777526855}
Losses {'ner': 8477.914203643799}
Losses {'ner': 8076.770166397095}
Losses {'ner': 7197.0363845825195}
Losses {'ner': 7090.1977071762085}
Losses {'ner': 6427.361822605133}
Losses {'ner': 5912.616696357727}
Losses {'ner': 5623.343173742294}
Losses {'ner': 5477.0849142074585}
Losses {'ner': 4835.527111530304}
Losses {'ner': 4811.207501173019}
Losses {'ner': 4377.280557155609}
Losses {'ner': 3970.2379789352417}
Losses {'ner': 3767.6652598381042}
Losses {'ner': 3509.750184059143}
Losses {'ner': 3200.132840871811}
Losses {'ner': 3229.917339324951}
Losses {'ner': 2894.1914558410645}
Losses {'ner': 2834.4989132881165}
Losses {'ner': 2636.183746576309}
Losses {'ner': 2667.0631017684937}
Losses {'ner': 2457.19084

In [17]:
ner_model=spacy.load('ner_model2')

In [18]:
p='/Users/alqahtsa/py_projects/uml_generator/user_stories_txt/g05-openspending.txt'
test_user_story=get_text(p)
test_clean_user_story=preprocess_text(user_story)
tdoc=ner_model(test_clean_user_story)
for ent in tdoc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

R2                            - want
R2                            - deposit
R2                            - maintain
C2                            - datasets
R2                            - need
R2                            - install
R2                            - learn
C2                            - software
R2                            - deposit
R2                            - want
C2                            - interface
R2                            - feel
R2                            - like
R2                            - joined
R2                            - want
R2                            - deposit
R2                            - maintain
C2                            - datasets
R2                            - managing
C2                            - outputs
R2                            - want
R2                            - deposit
R2                            - maintain
C2                            - datasets
R2                            - continue
R2           

In [3]:
def custom_sents(docx):
    for token in docx[:-1]:
        if token.text=='|':
            docx[token.i].is_sent_start=True
    return docx


In [4]:
def rme_custom_sents(docx):
    for token in docx[:-1]:
        if token.text==',' and docx[token.i+1].text=='i':
            docx[token.i].is_sent_start=True
        elif token.text==',' and docx[token.i+1].text=='so':
            docx[token.i].is_sent_start=True
    return docx

In [5]:
def check_verb(token):
    """Check verb type given spacy token"""
    if token.pos_ == 'VERB':
        indirect_object = False
        direct_object = False
        for item in token.children:
            if(item.dep_ == "iobj" or item.dep_ == "pobj"):
                indirect_object = True
            if (item.dep_ == "dobj" or item.dep_ == "dative"):
                direct_object = True
        if indirect_object and direct_object:
            return 'DITRANVERB'
        elif direct_object and not indirect_object:
            return 'TRANVERB'
        elif not direct_object and not indirect_object:
            return 'INTRANVERB'
        else:
            return 'VERB'
    else:
        return token.pos_