In [1]:
from rich.console import Console
from rich.traceback import install

# Install rich's traceback handler for uncaught exceptions.
# Variant that passes show_locals=True, kept for reference:
# install(show_locals=True)
install()

# Shared rich console; later cells call console.log(...) on it.
console = Console()

In [2]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

from dataframe_loader import DataFrameLoader

from csv_loader import CSVLoader

In [3]:
# Location of the contracts CSV that is read with pd.read_csv below.
# NOTE(review): despite the name, FILE_DIR holds a file path, not a directory.
FILE_DIR = "../../Dataset/Contracts_Dataset_With_Extract.csv"
# Alternative dataset, kept for reference:
# FILE_DIR = "../../Dataset/Tenders WA UTF8.csv"

In [4]:
# Load the data.
# index_col=False tells pandas not to use the first CSV column as the index.
df = pd.read_csv(FILE_DIR, index_col=False)

In [5]:
# Replace missing values with empty strings before the frame is handed to the loader.
df = df.fillna("")

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Reference Number,Client Agency,Type of Work,Contract Title,Description,Tender Closing Date,UNSPSC Code,UNSPSC Title,Procurement Method,Revised Contract Value,Supplier Name,Tenders Content
0,ARA201901891,Animal Resources Centre,Goods and Services,Supply and Delivery of Aspen Bedding Material,The Customer requires a Contractor to provide ...,2019-07-09 00:00:00,41102614,Research animal bedding material,Open,500000.0,Specialty Feeds Pty Ltd,
1,20221AGWA,Art Gallery of Western Australia,Goods and Services,Bar Consultancy and Staffing AGWA Rooftop Bar ...,The Art Gallery of Western Australia (AGWA) re...,2022-08-12 00:00:00,80101500,Business and corporate management consultation...,Open Advertisement,4303279.0,Ten Foot Tall Management Pty Ltd,Request Staffing AGWA PM Issued behalf Custome...
2,FIN873DLGSCAG,Art Gallery of Western Australia,Goods and Services,Investment Services for Art Gallery WA,"Through this Request, the Art Gallery of Weste...",2021-05-04 00:00:00,84121706,Financial asset management service,Open,830300.0,JBWere,INSERT NAME Table No Description DEFINITIONS D...
3,2020153WAM,Art Gallery of Western Australia,Goods and Services,Provision of Audience Research,Provision of Audience Research. This research ...,2022-02-04 00:00:00,80141500,Market research,Open Advertisement,300795.0,Morris Hargreaves McIntyre,
4,CUAHRS202117042023AC,Arts and Culture Trust,Goods and Services,Consultancy Services for HR Support,Provision of specialist services to undertake ...,2023-06-02 00:00:00,80110000,Human resources services,CUA,71000.0,Price Consulting Group Pty Ltd,


In [7]:
# Wrap the DataFrame in the project-local DataFrameLoader and load it into `data`.
# NOTE(review): page_content_columns=[] passes an empty column list; how the loader
# builds page_content and metadata from that is defined in dataframe_loader — confirm there.
loader = DataFrameLoader(df, page_content_columns=[])

data = loader.load()

In [8]:
# Inspect the metadata attached to the first loaded document.
console.log(data[0].metadata)

In [9]:
# loader = CSVLoader(file_path=FILE_DIR, metadata_columns=headers)

# data = loader.load()

In [10]:
# Number of documents returned by the loader.
console.log(len(data))

In [11]:
# def split_docs(documents, chunk_size=2048, chunk_overlap=20):
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size, chunk_overlap=chunk_overlap
#     )
#     docs = text_splitter.split_documents(documents)
#     return docs


# docs = split_docs(data)
# print(len(docs))

In [12]:
from langchain.schema import Document


def chunk_docs(
    docs: list[Document], max_chunk_size: int, overlap: int = -1
) -> list[Document]:
    """
    Split documents into smaller, overlapping chunks.

    :param docs: Documents to split
    :param max_chunk_size: Upper bound passed to the splitter as ``chunk_size``
    :param overlap: Overlap passed to the splitter as ``chunk_overlap``; the
        default of -1 means "use 10% of max_chunk_size" (truncated to an int)
    :return: The chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``
    """
    # -1 is the sentinel for "derive the overlap from the chunk size".
    chunk_overlap = int(max_chunk_size * 0.1) if overlap == -1 else overlap
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return splitter.split_documents(docs)


# Chunk the loaded documents with max_chunk_size=512 and the default overlap
# (10% of the chunk size, i.e. int(512 * 0.1) = 51).
docs = chunk_docs(data, 512)

In [13]:
# Show the text of the first three chunks.
console.log([doc.page_content for doc in docs[0:3]])

In [14]:
# Show the metadata of the first five chunks.
console.log([doc.metadata for doc in docs[0:5]])

In [15]:
# Embedding model shared by the indexing cell and the query cell below.
# Alternative model, kept for reference:
# embeddings = SentenceTransformerEmbeddings(model_name="thenlper/gte-base")
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [16]:
def split_list(input_list, chunk_size):
    """
    Yield consecutive slices of ``input_list`` holding at most ``chunk_size`` items.

    :param input_list: Sliceable sequence to split
    :param chunk_size: Maximum number of items per slice; must be >= 1
    :return: Generator over the slices, in order; the last one may be shorter
    :raises ValueError: If ``chunk_size`` is smaller than 1
    """
    # Validate at call time: a negative step would make range() empty and silently
    # drop every item, and a zero step only fails once iteration starts.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return (
        input_list[start : start + chunk_size]
        for start in range(0, len(input_list), chunk_size)
    )


# Hand the chunks to Chroma in batches of at most 41000 documents per call.
# NOTE(review): 41000 is presumably chosen to stay under Chroma's per-call batch limit — confirm.
split_docs_chunked = split_list(docs, 41000)

# Every batch is written with the same persist directory and collection metadata.
# NOTE(review): re-running this cell presumably adds the same chunks to ./chroma_db
# again — verify, and clear the directory before re-indexing if so.
for split_docs_chunk in split_docs_chunked:
    vectordb = Chroma.from_documents(
        documents=split_docs_chunk,
        embedding=embeddings,
        persist_directory="./chroma_db",
        collection_metadata={"hnsw:space": "cosine"},  # cosine space for the collection's HNSW index
    )
    vectordb.persist()

In [17]:
# Total number of chunks produced by chunk_docs.
console.log(len(docs))

In [35]:
# Open the store in ./chroma_db with the same embedding function and collection
# metadata that the indexing cell used when writing it.
db_disk = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)

In [36]:
# Embed the query text and fetch (document, score) pairs for it from the store.
# NOTE(review): k=245000 looks like "return everything"; confirm it is at least the
# number of stored chunks and that the installed Chroma version accepts a k larger
# than the collection size.
query = "surveillance"
query_embedding = embeddings.embed_query(query)
matching_docs = db_disk.similarity_search_by_vector_with_relevance_scores(
    query_embedding, k=245000
)

In [37]:
# Map each source row id to the lowest score seen among that row's chunks.
result_dict = {}
for doc, score in matching_docs:
    row_id = doc.metadata["row"]
    # A first sighting falls back to `score` itself, so min() simply stores it.
    result_dict[row_id] = min(result_dict.get(row_id, score), score)

In [38]:
# Collect one metadata record per source row, tagged with that row's lowest score.
# Each record is built from a copy of the chunk metadata: popping "row" and
# "start_index" from doc.metadata itself would alter the documents held in
# matching_docs, so re-running this cell (or any cell that reads
# doc.metadata["row"]) would raise KeyError.
# NOTE(review): the plotting cell computes 1 - score, so the scores appear to be
# distances (lower is closer); "similarity_score" stores that raw value.
_result_dict = {}
for doc, score in matching_docs:
    row_id = doc.metadata["row"]
    if row_id not in _result_dict:
        row_metadata = dict(doc.metadata)
        # Chunk-level bookkeeping keys are not part of the per-row record.
        row_metadata.pop("row", None)
        row_metadata.pop("start_index", None)
        row_metadata["similarity_score"] = score
        _result_dict[row_id] = row_metadata
    else:
        _result_dict[row_id]["similarity_score"] = min(
            _result_dict[row_id]["similarity_score"], score
        )

In [49]:
# Show the first per-row record stored in _result_dict.
values_list = list(_result_dict.values())

value0 = values_list[0]
# value0.pop("Tenders Content")

# Log the dict as a single object. Unpacking it with ** would pass every metadata
# field as a keyword argument, which Console.log rejects with TypeError.
console.log(value0)


In [21]:
# Show the first five (document, score) pairs returned by the search.
console.log(matching_docs[0:5])

In [22]:
# Number of distinct source rows among the matches.
console.log(len(result_dict))

In [23]:
import numpy as np
import plotly.express as px

# Convert keys and values to numpy arrays.
# result_dict holds one score per row id; 1 - score is what gets plotted as similarity.
keys = np.array(list(map(str, result_dict.keys())))
values = 1 - np.array(list(result_dict.values()))
# 1-based position of each row in result_dict order, shown in the hover text.
total = np.array(list(range(1, len(keys) + 1)))

# Create a DataFrame under its own name: rebinding `df` here would overwrite the
# contracts table loaded at the top of the notebook, so re-running the earlier
# cells that read `df` would operate on the score table instead.
similarity_df = pd.DataFrame(
    {"Row ID": keys, "Similarity Score": values, "Total": total}
)

# Create the plot
fig = px.line(
    similarity_df, x="Row ID", y="Similarity Score", custom_data=["Total"]
)

fig.update_traces(
    hovertemplate="<br>".join(
        [
            "Row ID: %{x}",
            "Similarity Score: %{y}",
            "Total: %{customdata[0]}",
        ]
    )
)

# Display the plot
fig.show()