Install libraries 

In [1]:
!pip install transformers
!pip install sentence-transformers
!pip install numpy
!pip install torch
!pip install nltk
!pip install pandas





Import libraries 

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
import json
import pandas as pd
import nltk
nltk.download('punkt')  # Download the necessary tokenizer data if not already downloaded
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rashi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Download and load model 

In [3]:
# Load BERT model and tokenizer
model_name = "bert-large-uncased"  # You can use other BERT variants
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Semantic word search function 

In [4]:
def semantic_word_search(query_word, word_list, threshold=0.80):
    # Encode the query word
    query_input = tokenizer(query_word, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        query_output = model(**query_input)

    # Extract the embeddings for the query word
    query_embedding = query_output.last_hidden_state.mean(dim=1).squeeze()

    # Encode the list of words
    word_inputs = tokenizer(word_list, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        word_outputs = model(**word_inputs)

    # Extract the embeddings for the list of words
    word_embeddings = word_outputs.last_hidden_state.mean(dim=1)

    # Calculate cosine similarity between query and list of words
    cosine_scores = torch.nn.functional.cosine_similarity(query_embedding, word_embeddings, dim=1)

    # Convert cosine similarity scores to semantic scores and to normal (Python float) values
    semantic_scores = [(score.item() + 1) / 2 for score in cosine_scores]  # Normalize to [0, 1] range and convert to float

    # Create a list to store unique words and their corresponding semantic scores as pairs
    unique_words_and_scores = []

    # Iterate through the words and semantic scores
    for word, score in zip(word_list, semantic_scores):
        if word not in (pair[0] for pair in unique_words_and_scores):
            if score > threshold:
                unique_words_and_scores.append((word, score))
    
    # Separate the unique words and their scores into separate lists
    unique_words, unique_scores = zip(*unique_words_and_scores)

    return list(unique_words), list(unique_scores)


# Change the json_input, file_name and file_path accordingly 

Extract data from JSON Input

In [9]:
# Example JSON input
json_input = '{"reference_number": "CUAAFA201807032023AC", "query": "CCTV"}'
query_data = json.loads(json_input)  # Parse the JSON input
query_word = query_data["query"].lower()  # Extract the query word from the JSON
reference_number = query_data["reference_number"]  # Extract the reference number from the JSON

Extract required  data from CSV File

In [10]:
file_path = "Contracts_Dataset_With_Extract.csv"  # Replace with the path to your CSV file

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, encoding='utf-8')

# Filter the DataFrame based on the reference_number
filtered_data = df[df['Reference Number'] == reference_number]

# Check if any data with the given reference_number exists
if not filtered_data.empty:
    # Define the columns to concatenate
    columns_to_concat = ['Contract Title', 'Description', 'Tenders Content']  # Add your column names here

    # Initialize an empty list to store non-empty column values
    non_empty_values = []

    # Iterate through the columns and append non-empty values to the list
    for column in columns_to_concat:
        column_values = filtered_data[column].values[0]
        if not pd.isna(column_values) and column_values.strip():  # Check for non-null and non-empty values
            non_empty_values.append(column_values)

    if non_empty_values:
        # Concatenate non-empty column values into a single string
        text_data = " ".join(non_empty_values)
        text_data = text_data.lower()

        # Tokenize the concatenated text data
        tokenized_words = word_tokenize(text_data)

JSON Output

In [11]:
words_list, semantic_scores  = semantic_word_search(query_word, tokenized_words)

# Print the semantic scores for each word along with the word itself
for word, score in zip(words_list, semantic_scores):
    print(f"Word: {word}, Semantic Score: {score:.4f}")

# Create a JSON object with two arrays: words and semantic scores
output_json = {
    "words": words_list,
    "semantic_scores": semantic_scores  # No need to convert tolist()
}

# Serialize the JSON object to a string
output_json_str = json.dumps(output_json, indent=2)

# Print the JSON string
print(output_json_str)

Word: arts, Semantic Score: 0.8242
Word: culture, Semantic Score: 0.8428
Word: current, Semantic Score: 0.8064
Word: model, Semantic Score: 0.8067
Word: stakeholder, Semantic Score: 0.8173
{
  "words": [
    "arts",
    "culture",
    "current",
    "model",
    "stakeholder"
  ],
  "semantic_scores": [
    0.8242495059967041,
    0.8428006768226624,
    0.8063924014568329,
    0.8067045211791992,
    0.8172733187675476
  ]
}
