Install libraries 

In [1]:
import pandas as pd
import json
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

Import libraries 

In [2]:
def calculate_document_scores(query_word, dataset_path, num_topics=10):
    # Load your dataset
    df = pd.read_csv(dataset_path, encoding='utf-8')

    # Initialize result lists for reference numbers and similarity scores
    reference_numbers = []
    similarity_scores = []

    for index, row in df.iterrows():
        columns_to_concat = ['Contract Title', 'Description', 'Tenders Content']  # Add your column names here
        non_empty_values = []

        for column in columns_to_concat:
            column_values = row[column]
            if not pd.isna(column_values) and column_values.strip():
                non_empty_values.append(column_values)

        if non_empty_values:
            text_data = " ".join(non_empty_values)

            # Tokenize the concatenated text data
            tokenized_words = word_tokenize(text_data)

            # Tokenize and preprocess the text data
            document = " ".join(tokenized_words)

            # Create a CountVectorizer
            count_vectorizer = CountVectorizer()
            count_data = count_vectorizer.fit_transform([document])

            # Create an LDA model
            lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)  # Adjust num_topics as needed

            # Fit the LDA model to the data
            lda.fit(count_data)

            # Get the topics and their word distributions
            topics = lda.components_

            # Calculate semantic scores based on the topics
            topic_scores = lda.transform(count_data)

            # Select the topic with the highest score for the query word
            word_topic = topic_scores[0].argmax()

            # Calculate semantic scores for the query word based on its assigned topic
            semantic_score = topic_scores[0][word_topic]

            reference_numbers.append(row['Reference Number'])
            similarity_scores.append(semantic_score)

    # Initialize an output list to hold dictionaries for each reference number
    output_list = []

    # Iterate through reference numbers and associated values
    for ref_num, sim_score in zip(reference_numbers, similarity_scores):
        ref_values = {
            "Client Agency": str(df.loc[df['Reference Number'] == ref_num]['Client Agency'].values[0]),
            "Contract Title": str(df.loc[df['Reference Number'] == ref_num]['Contract Title'].values[0]),
            "Procurement Method": str(df.loc[df['Reference Number'] == ref_num]['Procurement Method'].values[0]),
            "Reference Number": ref_num,
            "Revised Contract Value": float(df.loc[df['Reference Number'] == ref_num]['Revised Contract Value'].values[0]),
            "Supplier Name": str(df.loc[df['Reference Number'] == ref_num]['Supplier Name'].values[0]),
            "Tender Closing Date": str(df.loc[df['Reference Number'] == ref_num]['Tender Closing Date'].values[0]),
            "Type of Work": str(df.loc[df['Reference Number'] == ref_num]['Type of Work'].values[0]),
            "UNSPSC Code": int(df.loc[df['Reference Number'] == ref_num]['UNSPSC Code'].values[0]),
            "UNSPSC Title": str(df.loc[df['Reference Number'] == ref_num]['UNSPSC Title'].values[0]),
            "Similarity Score": float(sim_score),  # Convert to float for JSON serialization
        }
        output_list.append(ref_values)

    return json.dumps(output_list, indent=2)

In [3]:
# Example usage
query_word = "CCTV"
dataset_path = "extract_try.csv"  # Replace with your dataset path
semantic_scores_json = calculate_document_scores(query_word, dataset_path)
print(semantic_scores_json)

[
  {
    "Client Agency": "Programmed Facility Management",
    "Contract Title": "PB Maintenance Security Screen Upgrade",
    "Procurement Method": "Open Advertisement",
    "Reference Number": "HOU136",
    "Revised Contract Value": 1.0,
    "Supplier Name": "Profix Australia",
    "Tender Closing Date": "4/12/2023 0:00",
    "Type of Work": "Works",
    "UNSPSC Code": 72100000,
    "UNSPSC Title": "Building and facility maintenance and repair services",
    "Similarity Score": 0.9997711088177327
  },
  {
    "Client Agency": "Department of Education",
    "Contract Title": "Department of Education - Geraldton Residential College - CCTV Upgrade",
    "Procurement Method": "Open Advertisement",
    "Reference Number": "FINW0737022",
    "Revised Contract Value": 115583.0,
    "Supplier Name": "Incite Security Pty Ltd",
    "Tender Closing Date": "3/10/2023 0:00",
    "Type of Work": "Works",
    "UNSPSC Code": 92121700,
    "UNSPSC Title": "Security systems services",
    "Similarit