In [1]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document
from dotenv import load_dotenv
import os
import numpy as np

In [10]:
# Fixed set of category names that each LDA topic is mapped onto further down.
static_topics = [
    "Supplier Environmental Assessment",
    "Materials",
    "Energy",
    "Waste",
    "Water & Effluents",
    "Emissions",
    "Environmental Compliance",
    "Biodiversity",
]

def generate_static_topic_desc(static_topics: list[str], llm):
    """Generate one descriptive paragraph per topic with the given chat model.

    Args:
        static_topics: Topic names to describe. Each name is interpolated
            into the prompt as-is.
        llm: Object exposing ``invoke(prompt)`` whose result has a
            ``.content`` attribute holding the generated text.

    Returns:
        Dict mapping each topic name to the generated description, in the
        order the topics were given. The model is called once per topic.
    """
    return {
        topic: llm.invoke(f"Generate a detailed paragraph explaining the topic {topic}").content
        for topic in static_topics
    }

def generate_topic_embeddings(topic_desc: dict[str, str], embeddings_model):
    """Embed every topic description and key the vectors by topic name.

    Args:
        topic_desc: Mapping of topic name to its description text.
        embeddings_model: Object exposing ``embed_documents(texts)`` that
            returns one embedding per input text, in input order.

    Returns:
        Dict mapping each topic name to the embedding of its description.
    """
    # All descriptions are embedded in a single call.
    vectors = embeddings_model.embed_documents(list(topic_desc.values()))

    # Dict iteration order matches the order of .values() above, so position
    # `pos` in `vectors` belongs to the topic at position `pos`.
    return {topic: vectors[pos] for pos, topic in enumerate(topic_desc.keys())}

def get_topic_embedding_db(topic_desc: dict[str, str], embeddings_model):
    """Build a Chroma vector store holding one document per topic description.

    Args:
        topic_desc: Mapping of topic name to its description text.
        embeddings_model: Embedding object handed to ``Chroma.from_documents``
            as its ``embedding`` argument.

    Returns:
        The vector store returned by ``Chroma.from_documents``. No collection
        name or persist directory is passed, so Chroma's defaults apply.
    """
    documents = []
    for topic, desc in topic_desc.items():
        # The description is the searchable text; the topic name rides along
        # in metadata so a hit can be mapped back to its topic.
        documents.append(Document(page_content=desc, metadata={"topic": topic}))

    return Chroma.from_documents(documents=documents, embedding=embeddings_model)

def map_lda2topic(lda_topics, static_topic_db):
    """Map each LDA topic string to the most similar static topic.

    Args:
        lda_topics: Iterable of query strings (one per LDA topic).
        static_topic_db: Vector store exposing ``as_retriever``; its documents
            must carry the static topic name under ``metadata["topic"]``.

    Returns:
        List of static topic names, one per entry of ``lda_topics``, in the
        same order.

    Raises:
        IndexError: If the retriever returns no document for a query
            (for example, when the store is empty).
    """
    # `search_kwargs` is a retriever-construction option. Passing it to
    # `invoke()` does not set `k`, so the retriever is configured here to
    # fetch exactly the single best match.
    retriever = static_topic_db.as_retriever(
        search_type='similarity',
        search_kwargs={"k": 1},
    )

    mapped_topics = []
    for topic in lda_topics:
        retrieved = retriever.invoke(topic)
        mapped_topics.append(retrieved[0].metadata["topic"])

    return mapped_topics

In [3]:
# Look for a .env file in the current working directory.
dot_env_path = os.path.join(os.getcwd(), '.env')

# Load it only when it exists; otherwise whatever is already in the process environment is used.
if os.path.exists(dot_env_path):
    load_dotenv(dot_env_path)

# Both clients are created with library defaults (no model name or key passed explicitly).
# NOTE(review): presumably they pick up the OpenAI API key from the environment loaded above — confirm.
llm = ChatOpenAI()
embeddings_model = OpenAIEmbeddings()

In [4]:
# Generate one descriptive paragraph per static topic (one llm.invoke call per topic).
topic_desc = generate_static_topic_desc(static_topics, llm)

In [5]:
# topic_desc_emb = generate_topic_embeddings(topic_desc, embeddings_model)

In [6]:
# Index the generated descriptions in a Chroma vector store, each tagged with its topic name.
vector_topic_db = get_topic_embedding_db(topic_desc, embeddings_model)

In [7]:
# Each line is parsed as "<label>: <topic keywords>"; only the text after the label is kept.
with open("../topics_IN.txt") as f:
    file = [
        # Split on the first ":" only, so keywords that contain ":" are not truncated.
        # strip() removes the leading space and the trailing newline that iterating
        # a file keeps, so the newline does not leak into queries or the exported CSV.
        line.split(":", 1)[1].strip()
        for line in f
        # Tolerate blank lines (e.g. a trailing newline at end of file). A non-blank
        # line without ":" still raises IndexError so malformed input is noticed.
        if line.strip()
    ]

In [11]:
# For every LDA keyword string, take the static topic of the first document the similarity retriever returns.
ld2map = map_lda2topic(file, vector_topic_db)

In [15]:
import pandas as pd

# One record per LDA topic: a 1-based "Topic_N" label, the keyword string, and the
# static category it was mapped to. zip() pairs the two lists positionally.
topic_records = [
    {"index": f"Topic_{i+1}", "topic": topic, "category": category}
    for i, (topic, category) in enumerate(zip(file, ld2map))
]

# Build the frame in one call rather than pd.concat inside a loop, which re-copies the
# whole frame on every iteration. Explicit columns keep the header even with zero rows.
df = pd.DataFrame(topic_records, columns=["index", "topic", "category"])

In [16]:
# Display the topic-to-category mapping built in the previous cell.
df

Unnamed: 0,index,topic,category
0,Topic_1,climate change himalayan national capacity tec...,Biodiversity
1,Topic_2,mission national eco system himalayan sustaini...,Biodiversity
2,Topic_3,climate change national action plan developmen...,Emissions
3,Topic_4,forest ha climate ecosystem adaptation change ...,Biodiversity
4,Topic_5,soil health card nutrient provide farmer impro...,Biodiversity
5,Topic_6,water resource per management area ensure prom...,Water & Effluents
6,Topic_7,fodder carbon around another annual well timbe...,Emissions
7,Topic_8,national mission sustainable green water india...,Water & Effluents
8,Topic_9,energy efficiency mission solar gw power nmeee...,Energy
9,Topic_10,sustainable energy efficiency aim management s...,Environmental Compliance


In [19]:
# Write the mapping next to the input text file; index=False leaves the pandas row index out of the CSV.
df.to_csv('../topics_IN.csv', index=False)

## Experiments

In [None]:
import pandas as pd