Section 1

In [2]:
# --- Input / output locations ---
# Update these paths whenever the notebook is run against a different set of files.
capability_folder = '/content/drive/MyDrive/Kidney_documents'
requirement_csv_file = '/content/drive/MyDrive/Kidney_req.csv'
output_folder = "/content/drive/MyDrive/Kidney_output/"
output_file = output_folder + "Comparison_Matrix.csv"  # the matrix is written inside output_folder


# --- Matching options ---
# How many capability paragraphs are reported for each requirement
capabilities_per_req = 5


# --- Domain vocabulary ---
# Kidney-specific words that are added to the stop-word list
kidney_custom = [
    'Kidney',
    'Renal',
    'Nephron',
    'Glomerulus',
    'Urea',
]

Section 2

In [None]:
pip install -q textract

In [4]:
pip install -q gensim

In [None]:
import pandas as pd
import numpy as np
import os
import gensim
import csv
import textract 
import spacy
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [6]:
print("The pandas version is: ",pd.__version__)
print("The numpy version is: ",np.__version__)
print("The os version is: ", os.uname().release)
print("The csv version is: ",csv.__version__)
print("The textract version is: ")
!pip show textract | grep Version
print("The spacy version is: ",spacy.__version__)
print("The gensim version is: ", gensim.__version__)
print("Doc2Vec and TaggedDocument is from gensim")
print("The nltk version is:", nltk.__version__)
print("stopwords and word_tokenize is from nltk")

The pandas version is:  1.5.3
The numpy version is:  1.22.4
The os version is:  5.10.147+
The csv version is:  1.0
The textract version is: 
Version: 1.6.5
The spacy version is:  3.5.2
The gensim version is:  4.3.1
Doc2Vec and TaggedDocument is from gensim
The nltk version is: 3.8.1
stopwords and word_tokenize is from nltk


In [7]:
# Make the Google Drive contents available under /content/drive
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


Section 3

In [8]:
# Load every supported capability document exactly once, split each document
# into paragraphs, and drop the paragraphs that are too short to be useful.


def _read_capability_text(path):
    """Return the cleaned text of one capability document.

    ``.docx`` files are read with textract's docx extractor and ``.pdf`` files
    with its pdfminer method. Any other extension returns ``None`` so the
    caller can skip the file; previously such files were handed to the PDF
    extractor by one loop but ignored by the other, which left the name and
    text lists with different lengths.
    """
    lowered = path.lower()
    if lowered.endswith('.docx'):
        raw = textract.process(path, extension='docx')
    elif lowered.endswith('.pdf'):
        raw = textract.process(path, extension='pdf', method='pdfminer')
    else:
        return None
    # Slashes and tabs are stripped from the extracted text
    return raw.decode("utf-8").replace("/", "").replace("\t", "")


folder_path = capability_folder
min_words = 12    # change this if you want a number other than 12
paragraph_separator = "\n\n\n\n"    # paragraphs are separated by four newlines in the extracted text

# Importing the capability file names and their text in a single pass so the
# two lists always stay aligned. The listing is sorted because os.listdir
# returns entries in arbitrary order.
capability_names = []
imported_capabilities = []
for filename in sorted(os.listdir(folder_path)):
    document_text = _read_capability_text(os.path.join(folder_path, filename))
    if document_text is None:
        continue    # not a .docx / .pdf file
    capability_names.append(filename)
    imported_capabilities.append(document_text)

# Capability array: column 0 = file name, column 1 = full document text
# (column 2 is left unset here)
capabilities = np.empty((len(capability_names), 3), dtype='object')
capabilities[:, 0] = capability_names
capabilities[:, 1] = imported_capabilities

# Splitting each document into individual paragraphs; the paragraph is stored
# twice (columns 1 and 2) so that one copy can be pre-processed later while
# the other keeps the original wording
capability_paragraph = [
    [name, paragraph, paragraph]
    for name, document_text in zip(capability_names, imported_capabilities)
    for paragraph in document_text.split(paragraph_separator)
]

# Keeping only the paragraphs with at least `min_words` words (this also
# removes the empty ones). dtype=object avoids padding every cell to the
# length of the longest paragraph, and reshape keeps the array two-dimensional
# even when nothing is left.
capability = np.array(
    [row for row in capability_paragraph if len(row[2].split()) >= min_words],
    dtype=object,
).reshape(-1, 3)

Section 4

In [9]:
# Loading the stop words
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words
# Tokens are lower-cased before the membership test below, so the custom words
# have to be lower-cased as well; otherwise entries such as 'Kidney' never match.
# NOTE: this name replaces the `stopwords` module imported from nltk.corpus.
stopwords = sw_spacy.union(word.lower() for word in kidney_custom)

# Removing the stop words from the third column (index 2) of the capability
# array; column 1 keeps the original paragraph text
for i, row in enumerate(capability):
    kept_tokens = [token.lower() for token in row[2].split() if token.lower() not in stopwords]
    capability[i, 2] = " ".join(kept_tokens)

Section 5

In [14]:
# Importing the csv file which contains the requirement document broken down into paragraphs
file_path = requirement_csv_file

# Fail early with a clear message; the previous bare os.path.exists() call
# discarded its result and therefore checked nothing
if not os.path.exists(file_path):
    raise FileNotFoundError("Requirement CSV not found: {}".format(file_path))

# The file has no header row: every line is one requirement paragraph
df = pd.read_csv(file_path, header=None, names=['Requirement_Paragraph'])
requirement_csv = df.values.tolist()

In [15]:
# Result table: starts with the requirement paragraphs only; the matched
# capabilities, scores and file names are added to it later
Final = pd.DataFrame(data=requirement_csv, columns=['Requirement_Paragraph'])

In [16]:
# Lower-case every requirement paragraph and strip its stop words so it can be fed to Doc2Vec
req_nostop = [
    ' '.join(
        word.lower()
        for word in str(requirement[0]).split()
        if word.lower() not in stopwords
    )
    for requirement in requirement_csv
]

Section 6

In [18]:
# Building the doc2vec model: one tagged document per capability paragraph,
# tagged with its row index in `capability`
tagged_data = [
    TaggedDocument(words=word_tokenize(_d), tags=[str(i)])
    for i, _d in enumerate(capability[:, 2])
]

# Total number of training passes. The earlier hand-written loop called
# train() 5 times with epochs=model.epochs (gensim's default of 10), i.e. 50
# passes, while overwriting alpha/min_alpha by hand; that left min_alpha equal
# to alpha, which infer_vector() then inherited. A single train() call lets
# gensim decay the learning rate from alpha to min_alpha itself.
max_epochs = 50
model = Doc2Vec(vector_size=50, alpha=0.025, min_alpha=0.00025, min_count=1, dm=1, epochs=max_epochs)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Saving and loading the doc2vec model
#model.save("d2v.model")
#print("Model Saved")
#model= Doc2Vec.load("d2v.model")

number = capabilities_per_req    # The number of capability examples you want, defined at the beginning of the code

# Running the model: for every requirement, look up the `number` most similar
# capability paragraphs (cosine similarity between document vectors)
match_rows = []
for req_text in req_nostop:
    test_data = word_tokenize(req_text)       # Tokenizes the requirement text
    vector = model.infer_vector(test_data)    # Vector representation of the requirement

    # `dv` is the gensim 4 name of the document vectors (formerly `docvecs`).
    # Fewer than `number` results come back when there are fewer capability
    # paragraphs than requested; the missing cells are then left empty.
    similar_doc = model.dv.most_similar([vector], topn=number)

    row = {}
    for j, (tag, score) in enumerate(similar_doc):
        matched = capability[int(tag)]
        row[(j*3)+1] = matched[1]    # original capability paragraph
        row[(j*3)+2] = score         # cosine similarity score
        row[(j*3)+3] = matched[0]    # file the paragraph came from
    match_rows.append(row)

# Adding all result columns (labelled 1 .. 3*number) in one step. Starting
# again from the requirement column keeps this cell safe to re-run.
match_columns = list(range(1, (number*3)+1))
matches = pd.DataFrame(match_rows, index=Final.index, columns=match_columns)
Final = pd.concat([Final[['Requirement_Paragraph']], matches], axis=1)


Section 7

In [19]:
# Give the result columns readable headers: for every match position there is
# a capability text, a score and a file name column (in that order)
new_cols = [
    template.format(position)
    for position in range(1, capabilities_per_req + 1)
    for template in ('capability_{}', 'score_{}', 'file_name_{}')
]
# The first column (the requirement paragraph) keeps its name
Final = Final.rename(columns=dict(zip(Final.columns[1:], new_cols)))

In [20]:
# Exporting the dataframe to the google drive.
# Both the destination folder and the file name are set at the beginning of the notebook.

# Create the destination folder if it is not there yet
os.makedirs(name=output_folder, exist_ok=True)
# Write the comparison matrix as a CSV file
Final.to_csv(path_or_buf=output_file)
