In [10]:
from dotenv import load_dotenv
load_dotenv(".env")

import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_KEY")
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
os.environ['LANGCHAIN_PROJECT'] = "eval_dataset_Creation"

## Importing the required packages

In [11]:
import json

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings # To create embeddings
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore # To connect with the Vectorstore
import pandas as pd

### Defining globle variables

In [12]:
df = pd.read_csv("earning_calls_ground_truth.csv")

# df['ticker'].unique()

In [13]:
company_ticker_dict = {
 "Adani Enterprises Ltd": "adanient",
 "Adani Ports and Special Economic Zone Ltd": "adaniports",
 "Apollo Hospitals": "apollohosp",
 "Asian Paints": "asianpaint",
 "Axis Bank": "axisbank",
 "Bajaj Auto": "bajaj-auto",
 "Bajaj Finance": "bajfinance",
 "Bharat Petroleum": "bpcl",
 "Bharti Airtel": "bhartiartl",
 "Britannia Industries": "britannia",
 "Cipla": "cipla",
 "Divi's Laboratories": "divislab",
 "Dr. Reddy's Laboratories": "drreddy",
 "Eicher Motors": "eichermot",
 "Grasim Industries": "grasim",
 "HCLTech": "hcltech",
 "HDFC Bank": "hdfcbank",
 "HDFC Life": "hdfclife",
 "Hero MotoCorp": "heromotoco",
 "Hindalco Industries": "hindalco",
 "Hindustan Unilever": "hindunilvrv",
 "ICICI Bank": "icicibank",
 "IndusInd Bank": "indusindbk",
 "Infosys": "infy",
 "JSW Steel": "jswsteel",
 "Kotak Mahindra Bank": "kotakbank",
 "Larsen & Toubro": "lt",
 "LTIMindtree": "ltim",
 "Mahindra & Mahindra": "m&m",
 "Maruti Suzuki": "maruti",
 "Reliance Industries": "reliance",
 "SBI Life Insurance Company": "sbilife",
 "State Bank of India": "sbin",
 "Sun Pharma": "sunpharma",
 "Tata Consultancy Services": "tcs",
 "Tata Consumer Products": "tataconsum",
 "Tata Steel": "tatasteel",
 "Tech Mahindra": "techm",
 "Titan Company": "titan",
 "UltraTech Cement": "ultracemco",
 "UPL": "upl",
 "Wipro": "wipro"
}

In [14]:
# for key, value in company_ticker_dict.items():
#     temp = df[df['ticker']==value]
#     print(f"company name: {key}")
#     print(f"data shape: {temp.shape}")

In [15]:
INDEX_NAME = 'earning-calls-euclidean'
TOP_K = 4
QUARTER = "Q1"
YEAR = "FY24"

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
index = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings) # loading the index

In [16]:
chat_template = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert Q&A system that is trusted around the world.\nAlways answer the query using the provided context information, and not prior knowledge.\nSome rules to follow:\n1. Never directly reference the given context in your answer.\n2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines."),
        ("human", "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query}\nAnswer: "),
    ]
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def get_rag_chain(retriver=None):
    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | chat_template
        | llm
        | StrOutputParser()
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriver, "query": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)
    return rag_chain_with_source

In [17]:
for key, value in company_ticker_dict.items():
    questions = []
    contexts = []
    answers = []
    ground_truths = []

    FILENAME = f"{key}.pdf"
    retriver = index.as_retriever(search_kwargs={"filter": {"quarter": QUARTER, "filename": FILENAME, "year": YEAR}, "k": TOP_K})
    rag_chain_with_source = get_rag_chain(retriver=retriver)

    temp = df[df['ticker']==value]
    for idx, row in temp.iterrows():
        response = rag_chain_with_source.invoke(row["questions_asked"])
        # answers
        questions.append(row['questions_asked'])
        contexts.append([context.page_content for context in response['context']])
        answers.append(response['answer'])
        ground_truths.append(row['answers'])
        
    res_df = pd.DataFrame({"question": questions, "answer": answers, "contexts": contexts, "ground_truth": ground_truths, "file_name": [FILENAME for _ in range(len(questions))]})
    print(res_df.shape, FILENAME)
    # res_df.to_csv(key+".csv", index=False)
    res_df.to_csv("eval_data/" + key + ".csv", index=False)    
    res_df.to_json("eval_data_json/"+ key + ".json", orient="records", indent=4)

(10, 5) Adani Enterprises Ltd.pdf
(10, 5) Adani Ports and Special Economic Zone Ltd.pdf
(10, 5) Apollo Hospitals.pdf
(10, 5) Asian Paints.pdf
(10, 5) Axis Bank.pdf
(10, 5) Bajaj Auto.pdf
(10, 5) Bajaj Finance.pdf
(8, 5) Bharat Petroleum.pdf
(10, 5) Bharti Airtel.pdf
(10, 5) Britannia Industries.pdf
(10, 5) Cipla.pdf
(10, 5) Divi's Laboratories.pdf
(10, 5) Dr. Reddy's Laboratories.pdf
(10, 5) Eicher Motors.pdf
(10, 5) Grasim Industries.pdf
(10, 5) HCLTech.pdf
(10, 5) HDFC Bank.pdf
(10, 5) HDFC Life.pdf
(10, 5) Hero MotoCorp.pdf
(10, 5) Hindalco Industries.pdf
(0, 5) Hindustan Unilever.pdf
(10, 5) ICICI Bank.pdf
(10, 5) IndusInd Bank.pdf
(10, 5) Infosys.pdf
(11, 5) JSW Steel.pdf
(10, 5) Kotak Mahindra Bank.pdf
(10, 5) Larsen & Toubro.pdf
(10, 5) LTIMindtree.pdf
(10, 5) Mahindra & Mahindra.pdf
(10, 5) Maruti Suzuki.pdf
(10, 5) Reliance Industries.pdf
(10, 5) SBI Life Insurance Company.pdf
(10, 5) State Bank of India.pdf
(10, 5) Sun Pharma.pdf
(10, 5) Tata Consultancy Services.pdf
(10, 5) 