In [2]:
import spacy
import os
import pandas as pd
import numpy as np
import warnings
from collections import defaultdict

def Find_Semantically_Similar_Words(words, similarity_threshold,
                                    dataset_path='Contracts_Dataset.xlsx',
                                    model_name="en_core_web_sm"):
    """Find, per contract, the tokens that are semantically similar to each target word.

    Every row of the contracts workbook is turned into one text (Contract
    Title, Description, UNSPSC Title and Supplier Name joined by spaces),
    parsed with spaCy, and each token is compared against every target word.

    Parameters
    ----------
    words : iterable of str
        Target words to search for. Duplicates collapse into a single entry.
    similarity_threshold : float
        A token is kept only when its similarity score is strictly greater
        than this value.
    dataset_path : str, optional
        Excel workbook to read. It must contain the columns
        'Reference Number', 'Contract Title', 'Description', 'UNSPSC Title'
        and 'Supplier Name'.
    model_name : str, optional
        Name of the spaCy pipeline to load.
        NOTE(review): per the spaCy documentation the small ("sm") packages
        ship without static word vectors, so similarity scores from them are
        of limited use; a "md"/"lg" package is probably what is wanted here -
        confirm before relying on the results.

    Returns
    -------
    dict
        ``{word: {reference_number: set_of_matching_token_texts}}``. Every
        reference number in the workbook appears under every word, with an
        empty set when nothing matched. Rows sharing a reference number
        overwrite each other (last row wins).
    """
    # One result bucket per target word, created up front so that *every*
    # word is reported (previously only the last word was stored).
    semantically_similar_list = {word: {} for word in words}
    if not semantically_similar_list:
        # Nothing to look for: skip the workbook read and the model load.
        return semantically_similar_list

    dataset = pd.read_excel(dataset_path, dtype=str)
    nlp = spacy.load(model_name)

    # Parse each target word once instead of once per token.
    target_docs = {word: nlp(word) for word in semantically_similar_list}

    # dtype=str leaves empty cells as NaN; replace them so a blank field does
    # not break the string join below.
    text_columns = ['Contract Title', 'Description', 'UNSPSC Title', 'Supplier Name']
    text_parts = dataset[text_columns].fillna('')

    rows = zip(dataset['Reference Number'],
               text_parts.itertuples(index=False, name=None))
    for reference_number, parts in rows:
        # Skip blank fields so they do not introduce doubled spaces.
        one_row_doc = ' '.join(part for part in parts if part)

        # Parse the row once and reuse the parse for every target word.
        document = nlp(one_row_doc)

        for word, target_doc in target_docs.items():
            semantically_similar_list[word][reference_number] = {
                token.text
                for token in document
                if token.similarity(target_doc) > similarity_threshold
            }

    return semantically_similar_list


In [4]:
# Raise the threshold to keep only closer matches, lower it to admit looser ones.
semantically_similar_list = Find_Semantically_Similar_Words(
    words=['CCTV'],
    similarity_threshold=0.8,
)
print(semantically_similar_list)

  similarity_score = token.similarity(nlp(word))


{'CCTV': {'ARA201901891': set(), '20221AGWA': set(), 'FIN873DLGSCAG': set(), '2020153WAM': set(), 'CUAHRS202117042023AC': set(), 'RFQ13042023ACT': set(), 'E03052023ACT': set(), 'RFQ06122022ACT2': set(), 'CUAAFA201807032023AC': set(), 'CUAPCS201829112022AC': set(), 'WR05042022ACT': set(), 'E19012023ACT': set(), 'E10022023ACT': set(), 'E14122022ACT': set(), 'E29112022ACT': set(), 'RFQ08042022ACT': set(), 'E11102022ACT': set(), 'RFQ06042022PTT': set(), 'RFQ06072022ACT': set(), 'E12072022ACT': set(), 'E05072022ACT': set(), 'RFQ05042022PTT': set(), 'CUATPS201907072022AC': set(), 'E10042022PTT': set(), 'CUATPS201914062022PT': set(), 'CUATPS201908042022PT': set(), '07012022PTT': set(), '02022021DLGSCPTT': set(), '04022022CUATP2019': set(), '18122020DLGSCPTT': set(), '02122020DLGSCPTT': set(), '204340DLGSC': set(), 'PTT05062019': set(), '3092020DLGSCPTT': set(), 'E20402DLGSCPTT': set(), 'E20400DLGSCPTT': set(), 'E2019192449DLGSCPTT': set(), 'E2019CA1958DLGSCPTT': set(), 'INF4CA20199': set(), '