In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from gensim.models.doc2vec import Doc2Vec

import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import docx2txt
import docx
from docx import Document
import re

from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, paired_cosine_distances
from sentence_transformers import SentenceTransformer

# https://regex101.com/r/3UcY7D/1
# https://us-east-2.console.aws.amazon.com/kendra/home?region=us-east-2#indexes/b67b7f16-e1ea-45b9-beec-2acbca8492af/search

  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
######################
# PARAMS
######################
# Root working directory: the data/, models/ and outputs/ paths used below are all
# joined onto it. Set the DOCSIM_WD environment variable to run the notebook from
# another location without editing this cell; the default is the original local path.
wd = os.environ.get('DOCSIM_WD', r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\SOP\DocSim')

In [3]:
######################
# FUNCTIONS
######################

def preprocess(text):
    """Turn raw text into a list of lowercase, lemmatized content tokens.

    Steps:
      1. Lowercase the text.
      2. Tokenize with NLTK ``word_tokenize``.
      3. Remove English stop words.
      4. Remove tokens made up entirely of punctuation characters.
      5. Remove tokens of length 1.
      6. Lemmatize what is left (no stemming, to preserve word structure so
         tokens do not overlap with potential acronyms).

    Parameters
    ----------
    text : str
        Text to preprocess.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in their original order.
    """
    lowered = str.lower(text)

    stop_words = set(stopwords.words('english'))
    # Per-character lookup: `token in string.punctuation` is a substring test and
    # lets multi-character punctuation tokens such as "``", "''" or "..." through.
    punctuation_chars = set(string.punctuation)
    # Created here so the function does not rely on a global defined in a later cell.
    lemmatizer = WordNetLemmatizer()

    words = []
    for w in word_tokenize(lowered):
        if w in stop_words:
            continue
        if all(ch in punctuation_chars for ch in w):
            continue
        if len(w) <= 1:
            continue
        words.append(lemmatizer.lemmatize(w))

    return words

In [4]:
######################
# SEARCH QUERY
######################
# The free-text query is stored as a one-paragraph Word document inside the
# document folder, next to the files it will be compared against.
query = 'market researcher'

output_file = 'data/all/search.docx'
path = os.path.join(wd, output_file)

document = Document()
document.add_paragraph(query)
document.save(path)

In [5]:
######################
# DATA
######################
# os.listdir() returns entries in arbitrary order and includes everything in the
# folder. Keep only .docx files, skip Word's "~$" lock files, and sort
# case-insensitively so doc_names / comp_docs have a stable order between runs.
path = os.path.join(wd, 'data/all/')
doc_names = sorted(
    (entry for entry in os.listdir(path)
     if entry.lower().endswith('.docx') and not entry.startswith('~$')),
    key=str.lower,
)

# comp_docs[k] holds the plain text of doc_names[k]; later cells index both lists
# with the same position.
comp_docs = []
for name in doc_names:
    input_file = 'data/all/' + name
    path = os.path.join(wd, input_file)
    doc = docx2txt.process(path)
    comp_docs.append(doc)

In [7]:
######################
# EXTRACT TITLE
######################
# The title is the text between the "Title:" label and the next "<word>:" label
# of the flattened document. The lazy group stops at the first such label, and a
# document with no label after "Title:" simply produces no match instead of an
# IndexError.
title_pattern = re.compile(r"Title:(.*?)\w+:")

doc_titles_arr = []

for name in doc_names:
    input_file = 'data/all/' + name
    path = os.path.join(wd, input_file)
    doc = docx2txt.process(path)

    # Replace tabs and newlines so the pattern can span what were separate lines
    doc = doc.replace("\n", " ")
    doc = doc.replace("\t", " ")

    q = title_pattern.search(doc)
    doc_title = q.group(1).strip() if q is not None else ''

    # Fall back to the file name when no usable title was found
    if not doc_title:
        doc_title = name
    print(doc_title)

    doc_titles_arr.append(doc_title)

# One row per document: file name and extracted (or fallback) title
doc_data = pd.DataFrame({'doc_names': doc_names, 'doc_titles': doc_titles_arr})

output_file = 'outputs/doc_titles.csv'
path = os.path.join(wd, output_file)
doc_data.to_csv(path)

Communications Policy for Established Markets
Use of Externally Sourced Real World Data
Sampling Policy
Global Policy on Medical Affairs and Commercial Activities
Global Market Research Policy
Global Policy on Patient Support Programs
Global Policy on Interactions with Patients and Patient Organizations
Astellas Social Contribution Policy
search.docx
Analytic Environment Management
Analytic Project Management
Data Onboarding and Management
Astellas US Corporate Social Media Channel Monitoring and Adverse Event Reporting
EST SOP for Mobile App Lifecycle Management
EST SOP for Website Lifecycle Management
Patient Access Initiatives
Social Media Management for Established Markets
Medical Affairs Sponsorships for Research
Annual Justification of Commercial Advisory Boards
Disease State Awareness
Astellas Primary Market Research
Free Text Monitoring
Grants to HCOs to Support HCP Congress Attendance
Technical Writer JD.docx
User Registration
Incident Management and Breach Notification
Analyt

In [8]:
######################
# EXTRACT HEADINGS + CONTENT
######################
# Parallel lists with one entry per extracted section. They are only appended to
# together (through _add_section), so they always have the same length.
text_arr = []
headings_arr = []
doc_names_arr = []


def _add_section(doc_name, heading, text):
    """Append one (document name, heading, section text) row to the parallel lists."""
    doc_names_arr.append(doc_name)
    headings_arr.append(heading)
    text_arr.append(text)


for name in doc_names:
    print(name)
    toc = []
    headings = []

    input_file = 'data/all/' + name
    path = os.path.join(wd, input_file)
    doc = docx.Document(path)

    # Get the table-of-contents label and the 'Heading 1' paragraphs
    for para in doc.paragraphs:
        style_name = para.style.name

        if (style_name in ['TOC Title']) or (para.text == 'TABLE OF CONTENTS'):
            toc.append(para.text)

        # Nothing recorded for the TOC yet: use the first 'toc 1'/'toc 2' paragraph
        if (len(toc) == 0) and (style_name in ['toc 1', 'toc 2']):
            toc.append(para.text)

        if style_name == 'Heading 1':
            headings.append(para.text)

    # Flattened plain text of the same file, used to cut out the section bodies
    doc = docx2txt.process(path)
    doc = doc.replace("\n", " ")
    doc = doc.replace("\t", " ")

    doc2parse = doc

    # Extract the Table of Contents if there is one: the text between the first
    # two occurrences of the first heading. Needs at least one heading to anchor on.
    toc_text = ''
    if len(toc) > 0 and len(headings) > 0:
        first_heading = re.escape(headings[0].strip())
        q = re.search(first_heading + r"(.*?)" + first_heading, doc2parse)
        if q is not None:
            toc_text = q.group(1)
            # Remove Table of Contents from doc2parse
            doc2parse = doc2parse.replace(toc_text, '')

    # Empty TOC labels are dropped; the TOC text goes with the first remaining label
    toc = list(filter(None, toc))
    for k, toc_label in enumerate(toc):
        _add_section(name, toc_label, toc_text if k == 0 else '')

    # Text between each heading and the next one; '' when the pair cannot be located
    for j in range(0, len(headings) - 1):
        my_regex = re.escape(headings[j].strip()) + r" (.*?)" + re.escape(headings[j + 1].strip())
        q = re.search(my_regex, doc2parse)
        _add_section(name, headings[j], q.group(1) if q is not None else '')

    # Get text of last header
    if len(headings) > 0:
        my_regex = re.escape(headings[len(headings) - 1].strip()) + r".*"
        q = re.search(my_regex, doc2parse)
        # NOTE(review): group(0) keeps the heading itself in the stored text, unlike
        # the sections above, which store only the body - confirm this is intended.
        _add_section(name, headings[len(headings) - 1], q.group(0) if q is not None else '')

    print(len(doc_names_arr))
    print(len(headings_arr))
    print(len(text_arr))
doc_parsed = pd.DataFrame({'doc_names': doc_names_arr, 'headings': headings_arr, 'text': text_arr})

output_file = 'outputs/docs_parsed.csv'
path = os.path.join(wd, output_file)
doc_parsed.to_csv(path)

POL-162.docx
10
10
10
POL-169 v5_Clean.docx
19
19
19
POL-319.docx
28
28
28
POL-399.docx
37
37
37
POL-409.docx
45
45
45
POL-879.docx
56
56
56
POL-891.docx
65
65
65
POL-928.docx
67
67
67
search.docx
67
67
67
SOP-1464 v2.0.docx
76
76
76
SOP-1465 v4.0.docx
85
85
85
SOP-1466 v3.0.docx
94
94
94
SOP-1704.docx
104
104
104
SOP-1771.docx
113
113
113
SOP-1772.docx
122
122
122
SOP-1877.docx
132
132
132
SOP-1921.docx
143
143
143
SOP-1943.docx
152
152
152
SOP-2754.docx
160
160
160
SOP-3088.docx
171
171
171
SOP-3142.docx
180
180
180
SOP-3176.docx
189
189
189
SOP-3187.docx
199
199
199
Technical Writer JD.docx
199
199
199
WPD-622 v3.0.docx
208
208
208
WPD-623 v2.0.docx
217
217
217
WPD-624 v4.0.docx
226
226
226
WPD-625 v3.0.docx
235
235
235
WPD-626 v2.0.docx
244
244
244
WPD-628 v2.0.docx
253
253
253
WPD-630 v2.0.docx
262
262
262
WPD-631 v2.0.docx
272
272
272
WPD-632 v2.0.docx
281
281
281
WPD-634 v2.0.docx
290
290
290


# Doc2Vec

In [66]:
######################
# DOC2VEC MODEL
######################
# Load the saved Doc2Vec model from the models/ folder under the working directory.
input_file = 'models/enwiki_dbow/doc2vec.bin'
path = os.path.join(wd, input_file)
d2v_wiki= Doc2Vec.load(path)

# Module-level WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

In [67]:
######################
# DOC2VEC ALGORITHM
######################
# Set model to use
model = d2v_wiki

# Vocabulary lookup that works across gensim versions: gensim >= 4 exposes
# `wv.key_to_index`, older releases expose `wv.vocab`.
d2v_vocab = getattr(model.wv, 'key_to_index', None)
if d2v_vocab is None:
    d2v_vocab = model.wv.vocab


def _infer_doc_vector(d2v_model, vocab, text):
    """Infer a paragraph vector for ``text`` from its preprocessed, in-vocabulary tokens."""
    # Only handle words that appear in the doc2vec pretrained vectors
    # (the enwiki_dbow model contains a 669549-word vocabulary).
    tokens = [tok for tok in preprocess(text) if tok in vocab]
    return d2v_model.infer_vector(tokens)


# Paragraph vectors for the comparison documents do not depend on the base
# document, so they are inferred once here rather than once per base document.
vectors = [_infer_doc_vector(model, d2v_vocab, text) for text in comp_docs]

base_docs_arr = []
comp_docs_arr = []
cos_sim_arr = []
most_sim_arr = []
most_score_arr = []

for i in range(0, len(doc_names)):
    # Get base doc
    print(i)
    input_file = 'data/all/' + doc_names[i]
    path = os.path.join(wd, input_file)
    base_document = docx2txt.process(path)

    # The base vector is inferred separately from the comparison vectors; the
    # recorded self-similarities below 1.0 show that two inferences of the same
    # text do not give identical vectors.
    base_vector = _infer_doc_vector(model, d2v_vocab, base_document)

    # Get cosine similarity score for each document compared to the base document
    scores = cosine_similarity([base_vector], vectors).flatten()

    # Find document with highest similarity score (first one on ties). argmax also
    # handles the case where no score is positive.
    highest_score_index = int(np.argmax(scores))
    highest_score = scores[highest_score_index]
    most_similar_doc_name = doc_names[highest_score_index]

    print("Most similar document to:",  doc_names[i] ," = ", most_similar_doc_name, " with the score:", highest_score)

    # Append to results; comp_doc / cosine_similarity hold one list per base
    # document and are exploded into rows below
    base_docs_arr.append(doc_names[i])
    comp_docs_arr.append(doc_names)
    cos_sim_arr.append(scores)
    most_sim_arr.append(most_similar_doc_name)
    most_score_arr.append(highest_score)

output = pd.DataFrame({'base_doc': base_docs_arr, 'comp_doc': comp_docs_arr, 'cosine_similarity': cos_sim_arr, 'most_sim_doc': most_sim_arr, 'most_sim_cosine_similarity': most_score_arr})
# One row per (base document, comparison document) pair
output2 = output.set_index(['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity']).apply(pd.Series.explode).reset_index()

# Merge doc title to output
output2=output2.merge(doc_data, how='left', left_on='comp_doc', right_on='doc_names')
output2=output2[['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity', 'comp_doc', 'cosine_similarity', 'doc_titles']]

output_file = 'outputs/results_doc2vec_model.csv'
path = os.path.join(wd, output_file)
output2.to_csv(path)

0
Most similar document to: POL-162.docx  =  POL-162.docx  with the score: 0.987658
1
Most similar document to: POL-169 v5_Clean.docx  =  POL-169 v5_Clean.docx  with the score: 0.9821239
2
Most similar document to: POL-319.docx  =  POL-319.docx  with the score: 0.97926104
3
Most similar document to: POL-399.docx  =  POL-399.docx  with the score: 0.99086344
4
Most similar document to: POL-409.docx  =  POL-409.docx  with the score: 0.97629493
5
Most similar document to: POL-879.docx  =  POL-879.docx  with the score: 0.9845359
6
Most similar document to: POL-891.docx  =  POL-891.docx  with the score: 0.9853437
7
Most similar document to: POL-928.docx  =  POL-928.docx  with the score: 0.9805898
8
Most similar document to: search.docx  =  search.docx  with the score: 0.93443716
9
Most similar document to: SOP-1464 v2.0.docx  =  SOP-1464 v2.0.docx  with the score: 0.98184025
10
Most similar document to: SOP-1465 v4.0.docx  =  SOP-1465 v4.0.docx  with the score: 0.98644644
11
Most similar doc

In [68]:
# Rows where the search query is the base document, excluding its comparison
# with itself, ranked from most to least similar.
is_search_base = output2['base_doc'] == 'search.docx'
is_other_doc = output2['comp_doc'] != 'search.docx'
most_sim = output2[is_search_base & is_other_doc]
most_sim.sort_values(by=['cosine_similarity'], ascending=False)

Unnamed: 0,base_doc,most_sim_doc,most_sim_cosine_similarity,comp_doc,cosine_similarity,doc_titles
295,search.docx,search.docx,0.934437,Technical Writer JD.docx,0.367088,Technical Writer JD.docx
276,search.docx,search.docx,0.934437,POL-409.docx,0.340531,Global Market Research Policy
292,search.docx,search.docx,0.934437,SOP-3142.docx,0.31325,Astellas Primary Market Research
279,search.docx,search.docx,0.934437,POL-928.docx,0.313188,Astellas Social Contribution Policy
273,search.docx,search.docx,0.934437,POL-169 v5_Clean.docx,0.304908,Use of Externally Sourced Real World Data
303,search.docx,search.docx,0.934437,WPD-631 v2.0.docx,0.304184,Combining Data
298,search.docx,search.docx,0.934437,WPD-624 v4.0.docx,0.2977,Analytic Project Execution
299,search.docx,search.docx,0.934437,WPD-625 v3.0.docx,0.296819,Project Triage Team
278,search.docx,search.docx,0.934437,POL-891.docx,0.292455,Global Policy on Interactions with Patients an...
296,search.docx,search.docx,0.934437,WPD-622 v3.0.docx,0.283083,User Registration


# USE

In [10]:
######################
# USE MODEL
######################
# Load the Universal Sentence Encoder from the local models/ folder via TensorFlow Hub.
input_file = 'models/universal-sentence-encoder_4'
path = os.path.join(wd, input_file)
use_model = hub.load(path)

In [8]:
######################
# USE ALGORITHM
######################
# Set model to use
model = use_model

# Get embeddings for comp documents. They are the same for every base document,
# so they are computed once instead of inside the loop.
embeddings = model(comp_docs)

base_docs_arr = []
comp_docs_arr = []
cos_sim_arr = []
most_sim_arr = []
most_score_arr = []

for i in range(0, len(doc_names)):
    # Get base doc
    input_file = 'data/all/' + doc_names[i]
    path = os.path.join(wd, input_file)
    base_document = docx2txt.process(path)

    # Get embeddings for base document
    base_embeddings = model([base_document])

    # Similarity of the base document to every comparison document
    scores = cosine_similarity(base_embeddings, embeddings).flatten()

    # Best-scoring comparison document (first one on ties); argmax also handles
    # the case where no score is positive.
    highest_score_index = int(np.argmax(scores))
    highest_score = scores[highest_score_index]
    most_similar_doc_name = doc_names[highest_score_index]

    print("Most similar document to:",  doc_names[i] ," = ", most_similar_doc_name, " with the score:", highest_score)

    # Append to results; comp_doc / cosine_similarity hold one list per base
    # document and are exploded into rows below
    base_docs_arr.append(doc_names[i])
    comp_docs_arr.append(doc_names)
    cos_sim_arr.append(scores)
    most_sim_arr.append(most_similar_doc_name)
    most_score_arr.append(highest_score)

output = pd.DataFrame({'base_doc': base_docs_arr, 'comp_doc': comp_docs_arr, 'cosine_similarity': cos_sim_arr, 'most_sim_doc': most_sim_arr, 'most_sim_cosine_similarity': most_score_arr})
# One row per (base document, comparison document) pair
output2 = output.set_index(['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity']).apply(pd.Series.explode).reset_index()

# Merge doc title to output
output2=output2.merge(doc_data, how='left', left_on='comp_doc', right_on='doc_names')
output2=output2[['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity', 'comp_doc', 'cosine_similarity', 'doc_titles']]

output_file = 'outputs/results_use_model.csv'
path = os.path.join(wd, output_file)
output2.to_csv(path)

Most similar document to: POL-162.docx  =  POL-162.docx  with the score: 1.0000001
Most similar document to: POL-169 v5_Clean.docx  =  POL-169 v5_Clean.docx  with the score: 1.0
Most similar document to: POL-319.docx  =  POL-319.docx  with the score: 0.9999999
Most similar document to: POL-399.docx  =  POL-399.docx  with the score: 1.0000001
Most similar document to: POL-409.docx  =  POL-409.docx  with the score: 1.0
Most similar document to: POL-879.docx  =  POL-879.docx  with the score: 1.0
Most similar document to: POL-891.docx  =  POL-891.docx  with the score: 1.0000001
Most similar document to: POL-928.docx  =  POL-928.docx  with the score: 1.0
Most similar document to: search.docx  =  search.docx  with the score: 0.9999999
Most similar document to: SOP-1464 v2.0.docx  =  SOP-1464 v2.0.docx  with the score: 1.0000002
Most similar document to: SOP-1465 v4.0.docx  =  SOP-1465 v4.0.docx  with the score: 1.0
Most similar document to: SOP-1466 v3.0.docx  =  SOP-1466 v3.0.docx  with the

In [9]:
# Rows where the search query is the base document, excluding its comparison
# with itself, ranked from most to least similar.
is_search_base = output2['base_doc'] == 'search.docx'
is_other_doc = output2['comp_doc'] != 'search.docx'
most_sim = output2[is_search_base & is_other_doc]
most_sim.sort_values(by=['cosine_similarity'], ascending=False)

Unnamed: 0,base_doc,most_sim_doc,most_sim_cosine_similarity,comp_doc,cosine_similarity,doc_titles
276,search.docx,search.docx,1.0,POL-409.docx,0.293034,Global Market Research Policy
292,search.docx,search.docx,1.0,SOP-3142.docx,0.275459,Astellas Primary Market Research
287,search.docx,search.docx,1.0,SOP-1877.docx,0.126614,Patient Access Initiatives
289,search.docx,search.docx,1.0,SOP-1943.docx,0.113408,Medical Affairs Sponsorships for Research
303,search.docx,search.docx,1.0,WPD-631 v2.0.docx,0.105482,Combining Data
274,search.docx,search.docx,1.0,POL-319.docx,0.102796,Sampling Policy
293,search.docx,search.docx,1.0,SOP-3176.docx,0.099131,Free Text Monitoring
282,search.docx,search.docx,1.0,SOP-1465 v4.0.docx,0.089543,Analytic Project Management
299,search.docx,search.docx,1.0,WPD-625 v3.0.docx,0.089526,Project Triage Team
298,search.docx,search.docx,1.0,WPD-624 v4.0.docx,0.086216,Analytic Project Execution


# BERT

In [70]:
######################
# BERT MODEL
######################
# Load the sentence-transformers model stored in the local models/ folder.
input_file = 'models/all-MiniLM-L6-v2'
path = os.path.join(wd, input_file)
bert_model = SentenceTransformer(path)

In [71]:
######################
# BERT ALGORITHM
######################
# Set model to use
model = bert_model


def _mean_sentence_embedding(encoder, text):
    """Embed ``text`` as the mean of the embeddings of its sentences.

    Although it is not explicitly stated in the official sentence-transformers
    documentation, the original BERT is meant for shorter sentences, so the text
    is split with ``sent_tokenize`` and encoded sentence by sentence instead of
    as one whole document.
    """
    sentences = sent_tokenize(text)
    sentence_embeddings = encoder.encode(sentences)
    return np.mean(np.array(sentence_embeddings), axis=0)


base_docs_arr = []
comp_docs_arr = []
cos_sim_arr = []
most_sim_arr = []
most_score_arr = []

# Get embeddings for all documents and append to vector
vectors = []
for i, comp_text in enumerate(comp_docs):
    vectors.append(_mean_sentence_embedding(model, comp_text))

    print("making vector at index:", i)

for i in range(0, len(doc_names)):
    # Get base doc
    input_file = 'data/all/' + doc_names[i]
    path = os.path.join(wd, input_file)
    base_document = docx2txt.process(path)

    base_embeddings = _mean_sentence_embedding(model, base_document)
    scores = cosine_similarity([base_embeddings], vectors).flatten()

    # Best-scoring comparison document (first one on ties); argmax also handles
    # the case where no score is positive.
    highest_score_index = int(np.argmax(scores))
    highest_score = scores[highest_score_index]
    most_similar_doc_name = doc_names[highest_score_index]

    print("Most similar document to:",  doc_names[i] ," = ", most_similar_doc_name, " with the score:", highest_score)

    # Append to results; comp_doc / cosine_similarity hold one list per base
    # document and are exploded into rows below
    base_docs_arr.append(doc_names[i])
    comp_docs_arr.append(doc_names)
    cos_sim_arr.append(scores)
    most_sim_arr.append(most_similar_doc_name)
    most_score_arr.append(highest_score)

output = pd.DataFrame({'base_doc': base_docs_arr, 'comp_doc': comp_docs_arr, 'cosine_similarity': cos_sim_arr, 'most_sim_doc': most_sim_arr, 'most_sim_cosine_similarity': most_score_arr})
# One row per (base document, comparison document) pair
output2 = output.set_index(['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity']).apply(pd.Series.explode).reset_index()

# Merge doc title to output
output2=output2.merge(doc_data, how='left', left_on='comp_doc', right_on='doc_names')
output2=output2[['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity', 'comp_doc', 'cosine_similarity', 'doc_titles']]

output_file = 'outputs/results_bert_model.csv'
path = os.path.join(wd, output_file)
output2.to_csv(path)

making vector at index: 0
making vector at index: 1
making vector at index: 2
making vector at index: 3
making vector at index: 4
making vector at index: 5
making vector at index: 6
making vector at index: 7
making vector at index: 8
making vector at index: 9
making vector at index: 10
making vector at index: 11
making vector at index: 12
making vector at index: 13
making vector at index: 14
making vector at index: 15
making vector at index: 16
making vector at index: 17
making vector at index: 18
making vector at index: 19
making vector at index: 20
making vector at index: 21
making vector at index: 22
making vector at index: 23
making vector at index: 24
making vector at index: 25
making vector at index: 26
making vector at index: 27
making vector at index: 28
making vector at index: 29
making vector at index: 30
making vector at index: 31
making vector at index: 32
making vector at index: 33
Most similar document to: POL-162.docx  =  POL-162.docx  with the score: 1.0
Most similar do

In [72]:
# Rows where the search query is the base document, excluding its comparison
# with itself, ranked from most to least similar.
is_search_base = output2['base_doc'] == 'search.docx'
is_other_doc = output2['comp_doc'] != 'search.docx'
most_sim = output2[is_search_base & is_other_doc]
most_sim.sort_values(by=['cosine_similarity'], ascending=False)

Unnamed: 0,base_doc,most_sim_doc,most_sim_cosine_similarity,comp_doc,cosine_similarity,doc_titles
276,search.docx,search.docx,1.0,POL-409.docx,0.699519,Global Market Research Policy
292,search.docx,search.docx,1.0,SOP-3142.docx,0.60102,Astellas Primary Market Research
275,search.docx,search.docx,1.0,POL-399.docx,0.46775,Global Policy on Medical Affairs and Commercia...
272,search.docx,search.docx,1.0,POL-162.docx,0.45759,Communications Policy for Established Markets
295,search.docx,search.docx,1.0,Technical Writer JD.docx,0.441093,Technical Writer JD.docx
288,search.docx,search.docx,1.0,SOP-1921.docx,0.397793,Social Media Management for Established Markets
289,search.docx,search.docx,1.0,SOP-1943.docx,0.3923,Medical Affairs Sponsorships for Research
279,search.docx,search.docx,1.0,POL-928.docx,0.352073,Astellas Social Contribution Policy
287,search.docx,search.docx,1.0,SOP-1877.docx,0.33026,Patient Access Initiatives
293,search.docx,search.docx,1.0,SOP-3176.docx,0.321687,Free Text Monitoring


# How similar are pieces of documents?

In [None]:
# Section-by-section similarity between POL-409 and SOP-3142, scored with the
# Universal Sentence Encoder. `use_model` is referenced explicitly: the shared name
# `model` is rebound by the Doc2Vec, USE and BERT cells, so its value depends on
# which of those cells ran last.
doc1=doc_parsed[doc_parsed['doc_names'].isin(['POL-409.docx'])]
doc2=doc_parsed[doc_parsed['doc_names'].isin(['SOP-3142.docx'])]


def _section_text(doc_sections, heading):
    """Return the text of the first row of ``doc_sections`` whose heading equals ``heading``."""
    return doc_sections[doc_sections['headings']==heading].iloc[0]['text']


# (heading in doc1, heading in doc2). Headings are matched exactly, including
# trailing spaces, so the two documents' labels are listed separately.
heading_pairs = [
    ('PURPOSE ', 'PURPOSE  '),
    ('SCOPE', 'SCOPE'),
    ('TABLE OF CONTENTS ', 'TABLE OF CONTENTS '),
    ('DEFINITIONS', 'DEFINITIONS'),
    ('REFERENCES - None', 'REFERENCES'),
]

for base_heading, comp_heading in heading_pairs:
    base_document = _section_text(doc1, base_heading)
    comp_document = _section_text(doc2, comp_heading)

    # Get embeddings for base document
    base_embeddings = use_model([base_document])

    # Get embeddings for comp documents
    embeddings = use_model([comp_document])

    scores = cosine_similarity(base_embeddings, embeddings).flatten()
    print(scores)

[0.5443448]
[0.43510073]
[0.76419973]
[0.8513243]
[0.01937321]


# How similar are definitions?

In [12]:
# Use the Universal Sentence Encoder for this comparison
model = use_model

# Parsed sections of the two documents being compared
in_pol_409 = doc_parsed['doc_names'].isin(['POL-409.docx'])
in_sop_3142 = doc_parsed['doc_names'].isin(['SOP-3142.docx'])
doc1 = doc_parsed[in_pol_409]
doc2 = doc_parsed[in_sop_3142]

# Text of the first DEFINITIONS section found in each document
base_document = doc1.loc[doc1['headings'] == 'DEFINITIONS', 'text'].iloc[0]
comp_document = doc2.loc[doc2['headings'] == 'DEFINITIONS', 'text'].iloc[0]

# Embed each section on its own, then compare the two embeddings
base_embeddings = model([base_document])
embeddings = model([comp_document])

scores = cosine_similarity(base_embeddings, embeddings).flatten()
print(scores)

[0.8513243]


In [33]:
def _table_to_frame(table):
    """Convert a python-docx table into a DataFrame.

    The first row supplies the column names; each later row becomes one record
    by pairing its cell texts with those names. A table with no rows, or with
    only the header row, gives an empty DataFrame.
    """
    rows = [tuple(cell.text for cell in row.cells) for row in table.rows]
    if not rows:
        return pd.DataFrame([])
    keys = rows[0]
    return pd.DataFrame([dict(zip(keys, row)) for row in rows[1:]])


# Document 1: first table of POL-409
input_file='data/all/'+ 'POL-409.docx'
path1 = os.path.join(wd, input_file)
doc1 = docx.Document(path1)
df1 = _table_to_frame(doc1.tables[0])

# Document 2: first table of SOP-3142
input_file='data/all/'+ 'SOP-3142.docx'
path2 = os.path.join(wd, input_file)
doc2 = docx.Document(path2)
df2 = _table_to_frame(doc2.tables[0])

# MDL Standardization: cells equal to 'Market Research (MR)' become 'Market Research'
# (presumably so the term matches the spelling used in document 1 - TODO confirm)
df2=df2.replace('Market Research (MR)', 'Market Research')

# Merge common terms; shared non-key columns get the _x (document 1) / _y (document 2) suffixes
df3=df1.merge(df2, on='Term', how='inner')

In [75]:
# term='Market Research'
# term='Primary Market Research'
term='Secondary Market Research'

# First merged row for the selected term; Definition_x comes from the left frame
# of the merge and Definition_y from the right one
term_row = df3[df3['Term']==term].iloc[0]

# Get each definition and flatten its newlines and tabs to spaces
base_document = term_row['Definition_x'].replace("\n", " ").replace("\t", " ")
comp_document = term_row['Definition_y'].replace("\n", " ").replace("\t", " ")

# Embed both definitions with the currently selected model and compare them
base_embeddings = model([base_document])
embeddings = model([comp_document])

scores = cosine_similarity(base_embeddings, embeddings).flatten()
print(scores)

[0.99999994]


In [72]:
import difflib as dl

# Word-level diff of the two definitions. split() breaks on any run of
# whitespace, so differences that are only in spacing do not show up here.
base_words = base_document.split()
comp_words = comp_document.split()

d = dl.Differ()
diff = d.compare(base_words, comp_words)

# One token per line, prefixed by the Differ marker
print ('\n'.join(diff))

  Market
  Research
  involving
  data
  already
  collected
  and
  available
  in
  the
  public
  and/or
  private
  domain
  (i.e.,
  epidemiology,
  government
  statistics,
  social
  media).
  With
  secondary
  market
  research,
  there
  is
  no
  interaction
  with
  individuals
  or
  organizations
  to
  gather
  information.


In [73]:
# Show the raw string of the first definition so its exact spacing is visible
base_document

'Market Research involving data already collected and available in the public and/or private domain (i.e., epidemiology, government statistics, social media).  With secondary market research, there is no interaction with individuals or organizations to gather information.'

In [74]:
# Show the raw string of the second definition so its exact spacing is visible
comp_document

'Market Research involving data already collected and available in the public and/or private domain (i.e., epidemiology, government statistics, social media). With secondary market research, there is no interaction with individuals or organizations to gather information.'