In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from gensim.models.doc2vec import Doc2Vec

import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import docx2txt
from docx import Document

from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, paired_cosine_distances
from sentence_transformers import SentenceTransformer

  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
######################
# PARAMS
######################
# Project working directory; every data, model and output path below is joined
# onto it. Set the DOCSIM_WD environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
wd = os.environ.get(
    'DOCSIM_WD',
    r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\SOP\DocSim',
)

In [85]:
######################
# SEARCH QUERY
######################
# Free-text query to compare against the documents.
query = 'job descriptions'

# Target file for the query, relative to the working directory.
output_file = 'data/AIA/search.docx'
path = os.path.join(wd, output_file)

# Store the query as a one-paragraph Word document at that location.
document = Document()
document.add_paragraph(query)
document.save(path)

# Doc2Vec

In [46]:
######################
# DOC2VEC MODEL
######################
# Shared WordNet lemmatizer; preprocess() looks this name up when it is called.
lemmatizer = WordNetLemmatizer()

# Doc2Vec model file (enwiki_dbow) stored under the working directory.
input_file = 'models/enwiki_dbow/doc2vec.bin'
path = os.path.join(wd, input_file)
d2v_wiki = Doc2Vec.load(path)

In [47]:
######################
# FUNCTIONS
######################

def preprocess(text):
    """Turn raw text into a list of cleaned, lemmatized word tokens.

    Steps:
      1. Lowercase the text.
      2. Tokenize it with ``word_tokenize``.
      3. Remove English stop words.
      4. Remove punctuation tokens, including multi-character ones such as
         ``...`` or ``--`` (every character of the token is punctuation).
      5. Remove tokens of length 1.
      6. Lemmatize what is left. Stemming is deliberately not applied, to keep
         word structure intact and avoid collisions with potential acronyms.

    Uses the notebook-level ``lemmatizer`` object.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in their original order.
    """
    lowered = str.lower(text)

    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    words = []
    for token in word_tokenize(lowered):
        if len(token) <= 1 or token in stop_words:
            continue
        # A plain `token in string.punctuation` is a substring test and lets
        # tokens like '...' through; check character by character instead.
        if all(ch in punctuation_chars for ch in token):
            continue
        words.append(lemmatizer.lemmatize(token))

    return words

def process_doc2vec_similarity(model):
    """Print the entry of ``documents`` most similar to ``base_document`` (Doc2Vec).

    Reads the notebook-level names ``base_document`` (text of the reference
    document) and ``documents`` (list of texts to compare against).

    Parameters
    ----------
    model : gensim Doc2Vec
        An already loaded model. Pretrained apnews_dbow / enwiki_dbow models are
        publicly available at https://github.com/jhlau/doc2vec. The model passed
        in is used as given; it is no longer replaced by a reload from a
        hard-coded relative path on every call.

    Returns
    -------
    None
        The result is printed only.
    """
    # Only words that appear in the pretrained vocabulary are kept before
    # inference (the enwiki_dbow model has a vocabulary of 669549 words).
    # NOTE: `wv.vocab` is the gensim < 4 attribute; gensim 4 renamed it to
    # `wv.key_to_index`.
    vocabulary = model.wv.vocab

    def infer(text):
        # Preprocess, drop out-of-vocabulary tokens, infer the paragraph vector.
        tokens = [token for token in preprocess(text) if token in vocabulary]
        return model.infer_vector(tokens)

    base_vector = infer(base_document)

    vectors = []
    for i, document in enumerate(documents):
        vectors.append(infer(document))

        print("making vector at index:", i)

    scores = cosine_similarity([base_vector], vectors).flatten()

    # argmax instead of a running maximum that starts at 0: cosine similarity can
    # be negative, and the old loop then reported index 0 with a score of 0.
    highest_score_index = scores.argmax()
    highest_score = scores[highest_score_index]

    most_similar_document = documents[highest_score_index]
    print("Most similar document by Doc2vec with the score:", most_similar_document, highest_score)
    return

In [48]:
######################
# DATA
######################
# doc_names = ['POL-169 v5_Clean','SOP-1464 v2.0', 'SOP-1465 v4.0', 'SOP-1466 v3.0', 'Technical Writer JD']
path = os.path.join(wd, 'data/AIA/')

# os.listdir() returns every entry of the folder in arbitrary order. Keep only
# Word documents (skipping '~$' files, presumably Word's temporary lock files)
# and sort case-insensitively so doc_names / comp_docs are reproducible.
doc_names = sorted(
    (
        name for name in os.listdir(path)
        if name.lower().endswith('.docx') and not name.startswith('~$')
    ),
    key=str.lower,
)

# Extract the plain text of each document, in the same order as doc_names.
comp_docs = []
for name in doc_names:
    input_file = 'data/AIA/' + name
    path = os.path.join(wd, input_file)
    doc = docx2txt.process(path)
    comp_docs.append(doc)

In [86]:
######################
# DOC2VEC ALGORITHM
######################
# Set model to use
model = d2v_wiki


def _infer_doc2vec_vector(text, d2v_model, vocabulary):
    """Preprocess `text`, keep in-vocabulary tokens and infer its paragraph vector."""
    tokens = [token for token in preprocess(text) if token in vocabulary]
    return d2v_model.infer_vector(tokens)


# Only handle words that appear in the doc2vec pretrained vectors. enwiki_dbow model contains 669549 vocabulary size.
# NOTE: `wv.vocab` is the gensim < 4 attribute; gensim 4 renamed it to `wv.key_to_index`.
vocab = model.wv.vocab

# Develop paragraph vectors for the comparison documents. They do not depend on
# the base document, so they are inferred once here instead of once per base
# document inside the loop below.
# NOTE(review): infer_vector looks non-deterministic (earlier runs gave a
# self-similarity of ~0.98 rather than 1.0), so individual scores still vary
# slightly between runs.
vectors = [_infer_doc2vec_vector(text, model, vocab) for text in comp_docs]

base_docs_arr = []
comp_docs_arr = []
cos_sim_arr = []
most_sim_arr = []
most_score_arr = []

for i in range(len(doc_names)):
    # Get base doc (re-read from disk so the current file content is used)
    input_file = 'data/AIA/' + doc_names[i]
    path = os.path.join(wd, input_file)
    base_document = docx2txt.process(path)

    # Develop paragraph vector for base document
    base_vector = _infer_doc2vec_vector(base_document, model, vocab)

    # Get cosine similarity score for each document compared to the base document
    scores = cosine_similarity([base_vector], vectors).flatten()

    # Find document with highest similarity score. argmax replaces a running
    # maximum that started at 0 and therefore misreported all-negative scores.
    # NOTE(review): the base document is itself part of comp_docs, so the top
    # match is normally the document itself (as the earlier output shows).
    highest_score_index = scores.argmax()
    highest_score = scores[highest_score_index]

    most_similar_document = comp_docs[highest_score_index]
    most_similar_doc_name = doc_names[highest_score_index]

    print("Most similar document to:",  doc_names[i] ," = ", most_similar_doc_name, " with the score:", highest_score)

    # Append to results
    base_docs_arr.append(doc_names[i])
    comp_docs_arr.append(doc_names)
    cos_sim_arr.append(scores)
    most_sim_arr.append(most_similar_doc_name)
    most_score_arr.append(highest_score)

# One row per base document, then exploded to one row per (base, comparison) pair.
output = pd.DataFrame({'base_doc': base_docs_arr, 'comp_doc': comp_docs_arr, 'cosine_similarity': cos_sim_arr, 'most_sim_doc': most_sim_arr, 'most_sim_cosine_similarity': most_score_arr})
output2 = output.set_index(['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity']).apply(pd.Series.explode).reset_index()

output_file = 'outputs/results_doc2vec_model.csv'
path = os.path.join(wd, output_file)
output2.to_csv(path)

0
Most similar document to: POL-169 v5_Clean.docx  =  POL-169 v5_Clean.docx  with the score: 0.98000354
1
Most similar document to: search.docx  =  search.docx  with the score: 0.8031055
2
Most similar document to: SOP-1464 v2.0.docx  =  SOP-1464 v2.0.docx  with the score: 0.98572546
3
Most similar document to: SOP-1465 v4.0.docx  =  SOP-1465 v4.0.docx  with the score: 0.9839408
4
Most similar document to: SOP-1466 v3.0.docx  =  SOP-1466 v3.0.docx  with the score: 0.98164463
5
Most similar document to: Technical Writer JD.docx  =  Technical Writer JD.docx  with the score: 0.9764682
6
Most similar document to: WPD-622 v3.0.docx  =  WPD-622 v3.0.docx  with the score: 0.982273
7
Most similar document to: WPD-623 v2.0.docx  =  WPD-623 v2.0.docx  with the score: 0.9693717
8
Most similar document to: WPD-624 v4.0.docx  =  WPD-624 v4.0.docx  with the score: 0.9773649
9
Most similar document to: WPD-625 v3.0.docx  =  WPD-625 v3.0.docx  with the score: 0.98112965
10
Most similar document to: WP

In [83]:
# Rows where the search query is the base document, compared with every other document.
base_is_search = output2['base_doc'] == 'search.docx'
comp_is_other = output2['comp_doc'] != 'search.docx'
most_sim = output2.loc[base_is_search & comp_is_other]

# Best matches first; this expression is the cell's displayed output.
most_sim.sort_values(by='cosine_similarity', ascending=False)

Unnamed: 0,base_doc,most_sim_doc,most_sim_cosine_similarity,comp_doc,cosine_similarity
21,search.docx,search.docx,0.954555,Technical Writer JD.docx,0.506337
24,search.docx,search.docx,0.954555,WPD-624 v4.0.docx,0.40201
25,search.docx,search.docx,0.954555,WPD-625 v3.0.docx,0.389121
20,search.docx,search.docx,0.954555,SOP-1466 v3.0.docx,0.383464
31,search.docx,search.docx,0.954555,WPD-634 v2.0.docx,0.376299
30,search.docx,search.docx,0.954555,WPD-632 v2.0.docx,0.37566
27,search.docx,search.docx,0.954555,WPD-628 v2.0.docx,0.375575
22,search.docx,search.docx,0.954555,WPD-622 v3.0.docx,0.373572
29,search.docx,search.docx,0.954555,WPD-631 v2.0.docx,0.37206
18,search.docx,search.docx,0.954555,SOP-1464 v2.0.docx,0.366586


# USE

In [8]:
######################
# USE MODEL
######################
# Load the model stored in models/universal-sentence-encoder_4 under the working
# directory; hub.load is given that filesystem path. The USE ALGORITHM cell
# uses the result through `use_model`.
input_file = 'models/universal-sentence-encoder_4'
path = os.path.join(wd, input_file)
use_model = hub.load(path)

In [9]:
######################
# FUNCTIONS
######################
def process_use_similarity(model):
    """Print the entry of ``documents`` most similar to ``base_document`` (USE).

    Reads the notebook-level names ``base_document`` (text of the reference
    document) and ``documents`` (list of texts to compare against).

    Parameters
    ----------
    model : callable
        Maps a list of strings to one embedding per string, e.g. the object
        returned by ``hub.load`` for the Universal Sentence Encoder.

    Returns
    -------
    None
        The result is printed only.
    """
    base_embeddings = model([base_document])

    embeddings = model(documents)

    scores = cosine_similarity(base_embeddings, embeddings).flatten()

    # argmax instead of a running maximum that starts at 0: cosine similarity can
    # be negative, and the old loop then reported index 0 with a score of 0.
    highest_score_index = scores.argmax()
    highest_score = scores[highest_score_index]

    most_similar_document = documents[highest_score_index]
    print("Most similar document by USE with the score:", most_similar_document, highest_score)


In [10]:
######################
# DATA
######################
# doc_names = ['POL-169 v5_Clean','SOP-1464 v2.0', 'SOP-1465 v4.0', 'SOP-1466 v3.0', 'Technical Writer JD']
path = os.path.join(wd, 'data/AIA/')

# os.listdir() returns every entry of the folder in arbitrary order. Keep only
# Word documents (skipping '~$' files, presumably Word's temporary lock files)
# and sort case-insensitively so doc_names / comp_docs are reproducible.
doc_names = sorted(
    (
        name for name in os.listdir(path)
        if name.lower().endswith('.docx') and not name.startswith('~$')
    ),
    key=str.lower,
)

# Extract the plain text of each document, in the same order as doc_names.
comp_docs = []
for name in doc_names:
    input_file = 'data/AIA/' + name
    path = os.path.join(wd, input_file)
    doc = docx2txt.process(path)
    comp_docs.append(doc)

In [11]:
######################
# USE ALGORITHM
######################
# Set model to use
model = use_model

# Get embeddings for comp documents. They do not depend on the base document,
# so they are computed once here instead of on every loop iteration.
embeddings = model(comp_docs)

base_docs_arr = []
comp_docs_arr = []
cos_sim_arr = []
most_sim_arr = []
most_score_arr = []

for i in range(len(doc_names)):
    # Get base doc (re-read from disk so the current file content is used)
    input_file = 'data/AIA/' + doc_names[i]
    path = os.path.join(wd, input_file)
    base_document = docx2txt.process(path)

    # Get embeddings for base document
    base_embeddings = model([base_document])

    scores = cosine_similarity(base_embeddings, embeddings).flatten()

    # Find document with highest similarity score. argmax replaces a running
    # maximum that started at 0 and therefore misreported all-negative scores.
    # NOTE(review): the base document is itself part of comp_docs, so the top
    # match is normally the document itself (as the earlier output shows).
    highest_score_index = scores.argmax()
    highest_score = scores[highest_score_index]

    most_similar_document = comp_docs[highest_score_index]
    most_similar_doc_name = doc_names[highest_score_index]

    print("Most similar document to:",  doc_names[i] ," = ", most_similar_doc_name, " with the score:", highest_score)

    # Append to results
    base_docs_arr.append(doc_names[i])
    comp_docs_arr.append(doc_names)
    cos_sim_arr.append(scores)
    most_sim_arr.append(most_similar_doc_name)
    most_score_arr.append(highest_score)

# One row per base document, then exploded to one row per (base, comparison) pair.
output = pd.DataFrame({'base_doc': base_docs_arr, 'comp_doc': comp_docs_arr, 'cosine_similarity': cos_sim_arr, 'most_sim_doc': most_sim_arr, 'most_sim_cosine_similarity': most_score_arr})
output2 = output.set_index(['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity']).apply(pd.Series.explode).reset_index()

output_file = 'outputs/results_use_model.csv'
path = os.path.join(wd, output_file)
output2.to_csv(path)

Most similar document to: POL-169 v5_Clean.docx  =  POL-169 v5_Clean.docx  with the score: 1.0
Most similar document to: search.docx  =  search.docx  with the score: 1.0
Most similar document to: SOP-1464 v2.0.docx  =  SOP-1464 v2.0.docx  with the score: 1.0000002
Most similar document to: SOP-1465 v4.0.docx  =  SOP-1465 v4.0.docx  with the score: 1.0
Most similar document to: SOP-1466 v3.0.docx  =  SOP-1466 v3.0.docx  with the score: 0.99999994
Most similar document to: Technical Writer JD.docx  =  Technical Writer JD.docx  with the score: 1.0
Most similar document to: WPD-622 v3.0.docx  =  WPD-622 v3.0.docx  with the score: 1.0
Most similar document to: WPD-623 v2.0.docx  =  WPD-623 v2.0.docx  with the score: 1.0000002
Most similar document to: WPD-624 v4.0.docx  =  WPD-624 v4.0.docx  with the score: 0.9999999
Most similar document to: WPD-625 v3.0.docx  =  WPD-625 v3.0.docx  with the score: 1.0000001
Most similar document to: WPD-626 v2.0.docx  =  WPD-626 v2.0.docx  with the score: 

# BERT

In [43]:
######################
# BERT MODEL
######################
# Load the sentence-transformers model stored in models/all-MiniLM-L6-v2 under
# the working directory. The BERT ALGORITHM cell uses it through `bert_model`.
input_file = 'models/all-MiniLM-L6-v2'
path = os.path.join(wd, input_file)
bert_model = SentenceTransformer(path)

In [4]:
######################
# FUNCTIONS
######################
def process_bert_similarity(model=None):
    """Print the entry of ``documents`` most similar to ``base_document`` (BERT).

    Reads the notebook-level names ``base_document`` (text of the reference
    document) and ``documents`` (list of texts to compare against).

    Parameters
    ----------
    model : SentenceTransformer, optional
        An already loaded sentence-transformers model to reuse. When omitted,
        'bert-base-nli-mean-tokens' is loaded, exactly as before.

    Returns
    -------
    None
        The result is printed only.

    Raises
    ------
    ValueError
        If a document yields no sentences, so no embedding can be computed.
    """
    if model is None:
        # This will download and load the pretrained model offered by UKPLab.
        model = SentenceTransformer('bert-base-nli-mean-tokens')

    def embed(text):
        # Although it is not explicitly stated in the official document of
        # sentence transformer, the original BERT is meant for a shorter
        # sentence. The model is fed sentence by sentence and the sentence
        # embeddings are averaged into one document vector.
        sentences = sent_tokenize(text)
        if not sentences:
            raise ValueError("document contains no sentences to embed")
        return np.mean(np.array(model.encode(sentences)), axis=0)

    base_embeddings = embed(base_document)

    vectors = []
    for i, document in enumerate(documents):
        vectors.append(embed(document))

        print("making vector at index:", i)

    scores = cosine_similarity([base_embeddings], vectors).flatten()

    # argmax instead of a running maximum that starts at 0: cosine similarity can
    # be negative, and the old loop then reported index 0 with a score of 0.
    highest_score_index = scores.argmax()
    highest_score = scores[highest_score_index]

    most_similar_document = documents[highest_score_index]
    print("Most similar document by BERT with the score:", most_similar_document, highest_score)

In [44]:
######################
# DATA
######################
# doc_names = ['POL-169 v5_Clean','SOP-1464 v2.0', 'SOP-1465 v4.0', 'SOP-1466 v3.0', 'Technical Writer JD']
path = os.path.join(wd, 'data/AIA/')

# os.listdir() returns every entry of the folder in arbitrary order. Keep only
# Word documents (skipping '~$' files, presumably Word's temporary lock files)
# and sort case-insensitively so the positional slice below always selects the
# same files.
doc_names = sorted(
    (
        name for name in os.listdir(path)
        if name.lower().endswith('.docx') and not name.startswith('~$')
    ),
    key=str.lower,
)
# Restrict the BERT run to entries 1..5 of the listing.
doc_names = doc_names[1:6]

# Extract the plain text of each selected document, in the same order as doc_names.
comp_docs = []
for name in doc_names:
    input_file = 'data/AIA/' + name
    path = os.path.join(wd, input_file)
    doc = docx2txt.process(path)
    comp_docs.append(doc)

In [45]:
######################
# BERT ALGORITHM
######################
# Set model to use
model = bert_model


def _mean_sentence_embedding(text, encoder):
    """Return one document vector: the mean of the sentence embeddings of `text`.

    Although it is not explicitly stated in the official document of sentence
    transformer, the original BERT is meant for a shorter sentence, so the
    encoder is fed sentence by sentence instead of the whole document.

    Raises ValueError when `text` yields no sentences, since there is then
    nothing to average.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        raise ValueError("document contains no sentences to embed")
    return np.mean(np.array(encoder.encode(sentences)), axis=0)


base_docs_arr = []
comp_docs_arr = []
cos_sim_arr = []
most_sim_arr = []
most_score_arr = []

# Get embeddings for all documents and append to vector
vectors = []
for i, doc in enumerate(comp_docs):
    vectors.append(_mean_sentence_embedding(doc, model))

    print("making vector at index:", i)

for i in range(len(doc_names)):
    # Get base doc (re-read from disk so the current file content is used)
    input_file = 'data/AIA/' + doc_names[i]
    path = os.path.join(wd, input_file)
    base_document = docx2txt.process(path)

    base_embeddings = _mean_sentence_embedding(base_document, model)
    scores = cosine_similarity([base_embeddings], vectors).flatten()

    # Find document with highest similarity score. argmax replaces a running
    # maximum that started at 0 and therefore misreported all-negative scores.
    # NOTE(review): the base document is itself part of comp_docs, so the top
    # match is normally the document itself (as the earlier output shows).
    highest_score_index = scores.argmax()
    highest_score = scores[highest_score_index]

    most_similar_document = comp_docs[highest_score_index]
    most_similar_doc_name = doc_names[highest_score_index]

    print("Most similar document to:",  doc_names[i] ," = ", most_similar_doc_name, " with the score:", highest_score)

    # Append to results
    base_docs_arr.append(doc_names[i])
    comp_docs_arr.append(doc_names)
    cos_sim_arr.append(scores)
    most_sim_arr.append(most_similar_doc_name)
    most_score_arr.append(highest_score)

# One row per base document, then exploded to one row per (base, comparison) pair.
output = pd.DataFrame({'base_doc': base_docs_arr, 'comp_doc': comp_docs_arr, 'cosine_similarity': cos_sim_arr, 'most_sim_doc': most_sim_arr, 'most_sim_cosine_similarity': most_score_arr})
output2 = output.set_index(['base_doc', 'most_sim_doc', 'most_sim_cosine_similarity']).apply(pd.Series.explode).reset_index()

output_file = 'outputs/results_bert_model.csv'
path = os.path.join(wd, output_file)
output2.to_csv(path)

making vector at index: 0
making vector at index: 1
making vector at index: 2
making vector at index: 3
making vector at index: 4
Most similar document to: search.docx  =  search.docx  with the score: 1.0000001
Most similar document to: SOP-1464 v2.0.docx  =  SOP-1464 v2.0.docx  with the score: 0.9999999
Most similar document to: SOP-1465 v4.0.docx  =  SOP-1465 v4.0.docx  with the score: 1.0000001
Most similar document to: SOP-1466 v3.0.docx  =  SOP-1466 v3.0.docx  with the score: 1.0
Most similar document to: Technical Writer JD.docx  =  Technical Writer JD.docx  with the score: 0.9999999
