In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import json

In [2]:
# Load the Sentence Transformer model once at notebook scope.
# calculate_document_similarity reads this global `model` to embed both the
# query and each row's text; 'all-mpnet-base-v2' is the checkpoint name passed
# to SentenceTransformer.
model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# Columns whose non-empty values are joined to form the text that gets embedded.
_TEXT_COLUMNS = ('Contract Title', 'Description', 'Tenders Content')


def _document_text(row, columns=_TEXT_COLUMNS):
    """Join the non-missing, non-blank values of ``columns`` in ``row`` with spaces."""
    parts = []
    for column in columns:
        value = row[column]
        if pd.isna(value):
            continue
        # read_csv may parse an all-numeric column as int/float; stringify so
        # .strip() and " ".join() cannot fail on non-string cells.
        text = value if isinstance(value, str) else str(value)
        if text.strip():
            parts.append(text)
    return " ".join(parts)


def _optional(value, cast):
    """Return ``cast(value)``, or None when ``value`` is missing (NaN/None)."""
    if pd.isna(value):
        return None
    return cast(value)


def _to_builtin(value):
    """Unwrap a numpy scalar to its Python equivalent so ``json`` can serialize it."""
    return value.item() if hasattr(value, 'item') else value


def calculate_document_similarity(query_word, dataset_path):
    """Score every row of a CSV against a query and return the results as JSON.

    For each row, the non-empty values of 'Contract Title', 'Description' and
    'Tenders Content' are joined into one text. Rows with no text are skipped.
    The query and the texts are embedded with the notebook-level ``model`` and
    compared by cosine similarity.

    Args:
        query_word: Query string to compare each row's text against.
        dataset_path: Path of a UTF-8 CSV file containing the text columns
            above plus the metadata columns copied into the output.

    Returns:
        A JSON string (``indent=2``) holding a list with one object per scored
        row, in file order. Each object carries the row's own metadata and a
        "Similarity Score". Missing metadata values are emitted as ``null`` so
        the output is always valid JSON.

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    df = pd.read_csv(dataset_path, encoding='utf-8')

    # Keep each scored row next to its text so the metadata always comes from
    # that exact row (no second lookup by reference number, which is O(n^2)
    # and returns the wrong row when reference numbers repeat).
    scored_rows = []
    texts = []
    for _, row in df.iterrows():
        text = _document_text(row)
        if text:
            scored_rows.append(row)
            texts.append(text)

    if not texts:
        # Nothing to score: skip the model entirely.
        return json.dumps([], indent=2)

    # The query embedding is loop-invariant, so compute it once; the documents
    # are encoded in a single batch call. Batched scores may differ from
    # one-at-a-time encoding by floating-point noise only.
    query_embedding = model.encode(query_word, convert_to_tensor=True)
    text_embeddings = model.encode(texts, convert_to_tensor=True)
    # Result has one row (the single query) and one column per document.
    similarity_scores = util.pytorch_cos_sim(query_embedding, text_embeddings)[0].tolist()

    output_list = [
        {
            "Client Agency": _optional(row['Client Agency'], str),
            "Contract Title": _optional(row['Contract Title'], str),
            "Procurement Method": _optional(row['Procurement Method'], str),
            "Reference Number": _optional(row['Reference Number'], _to_builtin),
            "Revised Contract Value": _optional(row['Revised Contract Value'], float),
            "Supplier Name": _optional(row['Supplier Name'], str),
            "Tender Closing Date": _optional(row['Tender Closing Date'], str),
            "Type of Work": _optional(row['Type of Work'], str),
            "UNSPSC Code": _optional(row['UNSPSC Code'], int),
            "UNSPSC Title": _optional(row['UNSPSC Title'], str),
            "Similarity Score": float(score),  # plain float for JSON serialization
        }
        for row, score in zip(scored_rows, similarity_scores)
    ]

    return json.dumps(output_list, indent=2)

In [4]:
# Example usage: score every row of the CSV against a single query term and
# print the JSON string returned by calculate_document_similarity.
query_word = "CCTV"
dataset_path = "extract_try.csv"  # Replace with your dataset path
semantic_scores_json = calculate_document_similarity(query_word, dataset_path)
print(semantic_scores_json)

[
  {
    "Client Agency": "Programmed Facility Management",
    "Contract Title": "PB Maintenance Security Screen Upgrade",
    "Procurement Method": "Open Advertisement",
    "Reference Number": "HOU136",
    "Revised Contract Value": 1.0,
    "Supplier Name": "Profix Australia",
    "Tender Closing Date": "4/12/2023 0:00",
    "Type of Work": "Works",
    "UNSPSC Code": 72100000,
    "UNSPSC Title": "Building and facility maintenance and repair services",
    "Similarity Score": 0.19007892906665802
  },
  {
    "Client Agency": "Department of Education",
    "Contract Title": "Department of Education - Geraldton Residential College - CCTV Upgrade",
    "Procurement Method": "Open Advertisement",
    "Reference Number": "FINW0737022",
    "Revised Contract Value": 115583.0,
    "Supplier Name": "Incite Security Pty Ltd",
    "Tender Closing Date": "3/10/2023 0:00",
    "Type of Work": "Works",
    "UNSPSC Code": 92121700,
    "UNSPSC Title": "Security systems services",
    "Similari