In [1]:
!pip install langchain_community
!pip install langchain
!pip install langchain_openai
!pip install python-dotenv
!pip install pypdf
!pip install jq
!pip install pathlib
!pip install flask
!pip install faiss-cpu



In [2]:
import pandas as pd
import numpy as np
import json
import csv
import pypdf

from flask import Flask, request, jsonify

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
from langchain.output_parsers import RegexParser
from dotenv import load_dotenv

from typing import List

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


In [3]:
# Read the API key from the local dotenv file into the process environment.
load_dotenv(dotenv_path='keys.env')

In [4]:
# Directory containing the XBRL filings as JSON files. Set the XBRL_JSON_DIR
# environment variable to point somewhere else; the original location is kept
# as the default so existing runs behave the same.
XBRL_JSON_DIR = os.environ.get(
    'XBRL_JSON_DIR',
    '/Users/arthurpoon/Documents/_MPCS/GenAI/Final_Project/XBRL_JSON_files',
)

# Load every *.json file found directly in that directory as plain text.
loader = DirectoryLoader(XBRL_JSON_DIR, glob="./*.json", loader_cls=TextLoader)
documents = loader.load()

In [5]:
# Chunking parameters, measured in characters because length_function=len.
chunk_size_value = 1000
chunk_overlap = 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size_value,
    chunk_overlap=chunk_overlap,
    length_function=len,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks once, build the FAISS index, and persist it to disk.
embeddings = OpenAIEmbeddings()
docembeddings = FAISS.from_documents(texts, embeddings)
docembeddings.save_local("llm_faiss_index")
# The freshly built index is already in memory, so it is not reloaded from
# disk here. The previous immediate FAISS.load_local(...) round-trip produced
# the same index, built a second embeddings client, and raises on recent
# langchain_community releases unless pickle deserialization is explicitly
# allowed.

In [17]:
# Prompt for the map_rerank QA chain: each retrieved chunk is answered
# independently and the model is asked to append a "Score:" line, which the
# chain uses to pick the best answer.
prompt_template = """Use the following pieces of context to answer the question at the end.

This should be in the following format:

Question: [question here]
Helpful Answer: [answer here]
Score: [score between 0 and 100]

Begin!

Context:
---------
{context}
---------
Question: {question}
Helpful Answer:"""

# Split the completion into the answer text and its score.
# The score group only accepts digits: map_rerank converts it with int() to
# rank the candidates, so a trailer such as "90/100" or "85." would otherwise
# raise and discard the whole query.
output_parser = RegexParser(
    regex=r"(.*?)\nScore: (\d+)",
    output_keys=["answer", "score"],
)
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
    output_parser=output_parser,
)

# gpt-4-0125-preview is a chat model, so it must be driven through the chat
# completions wrapper (ChatOpenAI); the completions-style `OpenAI` wrapper
# targets the legacy completions endpoint, which does not serve this model.
llm = ChatOpenAI(model_name='gpt-4-0125-preview', temperature=0)
chain = load_qa_chain(
    llm,
    chain_type="map_rerank",
    return_intermediate_steps=True,
    prompt=PROMPT,
)

def getanswer(query, k=2):
    """Answer ``query`` from the indexed documents.

    Retrieves the ``k`` chunks most similar to the query from the module-level
    ``docembeddings`` index, runs the module-level ``chain`` over them, and
    returns the chain's answer together with the text of the chunks used.

    Args:
        query: The question to answer.
        k: Number of chunks to retrieve. Defaults to 2, the value that was
            previously hard-coded.

    Returns:
        A dict with two keys: ``"Answer"`` (the chain's ``output_text``) and
        ``"Reference"`` (the concatenated ``page_content`` of the input
        documents echoed back by the chain).
    """
    # The search yields (document, score) pairs; only the documents are used.
    chunk_docs = [
        doc for doc, _score in docembeddings.similarity_search_with_score(query, k=k)
    ]
    results = chain({"input_documents": chunk_docs, "question": query})
    text_reference = "".join(
        doc.page_content for doc in results["input_documents"]
    )
    return {"Answer": results["output_text"], "Reference": text_reference}


def processclaim(input_query):
    """Run ``getanswer`` on ``input_query`` without letting a failure propagate.

    Args:
        input_query: The question to answer.

    Returns:
        The dict produced by ``getanswer`` on success, or the string
        ``"Error"`` if ``getanswer`` raised an ordinary exception.
    """
    import logging  # local import keeps this definition self-contained

    try:
        return getanswer(input_query)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long batch run, and log
        # the traceback so the cause of an "Error" row can be diagnosed.
        logging.getLogger(__name__).exception(
            "getanswer failed for query: %r", input_query
        )
        return "Error"

#st.write(processclaim(text_query))

In [18]:
# Running queries from FinanceBench.
# The CSV location can be overridden with the FINANCEBENCH_XBRL_CSV
# environment variable; the original path is kept as the default.
FINANCEBENCH_XBRL_CSV = os.environ.get(
    'FINANCEBENCH_XBRL_CSV',
    '/Users/arthurpoon/Documents/_MPCS/GenAI/Final_Project/FinanceBench_XBRL_subset.csv',
)
XBRL_finance_bench_df = pd.read_csv(FINANCEBENCH_XBRL_CSV)
# Name kept as originally spelled so any other reference to it still resolves.
metrics_queriies = XBRL_finance_bench_df['question']

In [19]:
# Work on a copy: a plain assignment would only alias the benchmark frame, so
# adding the answers column below would silently modify XBRL_finance_bench_df
# as well.
XBRL_finance_bench_with_RAG_answers_df = XBRL_finance_bench_df.copy()

# Ask the QA pipeline each benchmark question, preserving row order.
pdf_query_answers = [
    processclaim(question)
    for question in XBRL_finance_bench_with_RAG_answers_df['question']
]

XBRL_finance_bench_with_RAG_answers_df['pdf_query_answers'] = pdf_query_answers
#XBRL_finance_bench_with_RAG_answers



In [20]:
# Preview the benchmark rows next to the generated answers. The display is
# bounded with head() so a larger benchmark file does not dump the whole
# frame into the notebook.
XBRL_finance_bench_with_RAG_answers_df.head(10)

Unnamed: 0,financebench_id,doc_name,doc_link,XBRL_doc_link,doc_period,question_type,question,answer,evidence_text,page_number,pdf_query_answers
0,financebench_id_02987,ACTIVISIONBLIZZARD_2019_10K,https://investor.activision.com/static-files/3...,https://www.sec.gov/Archives/edgar/data/718877...,2019,metrics-generated,What is the FY2019 fixed asset turnover ratio ...,24.26,"Table of Contents\nACTIVISION BLIZZARD, INC. A...",6970,{'Answer': ' It is not possible to calculate t...
1,financebench_id_07966,ACTIVISIONBLIZZARD_2019_10K,https://investor.activision.com/static-files/3...,https://www.sec.gov/Archives/edgar/data/718877...,2019,metrics-generated,What is the FY2017 - FY2019 3 year average of ...,1.90%,"Table of Contents\nACTIVISION BLIZZARD, INC. A...",7073,{'Answer': ' The average capex as a % of reven...
2,financebench_id_10420,AES_2022_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...,https://www.sec.gov/Archives/edgar/data/874761...,2022,metrics-generated,Based on the information provided primarily in...,-0.02,128 \nConsolidated Balance Sheets\nDecember...,130132,{'Answer': ' It is not possible to calculate A...
3,financebench_id_08286,AMAZON_2019_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0001...,https://www.sec.gov/Archives/edgar/data/101872...,2019,metrics-generated,By drawing conclusions from the information st...,"$11,588.00","Table of Contents\nAMAZON.COM, INC.\nCONSOLIDA...",38,{'Answer': ' It is not possible to determine A...
4,financebench_id_03882,AMCOR_2020_10K,https://portalvhds1fxb0jchzgjph.blob.core.wind...,https://www.sec.gov/Archives/edgar/data/174879...,2020,metrics-generated,What is Amcor's year end FY2020 net AR (in USD...,"$1,616.00",Amcor plc and Subsidiaries\nConsolidated Balan...,50,{'Answer': ' Based on the information provided...
5,financebench_id_05718,AMERICANWATERWORKS_2020_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0001...,https://www.sec.gov/Archives/edgar/data/141063...,2020,metrics-generated,How much (in USD billions) did American Water ...,$0.40,Table of Contents\nAmerican Water Works Compan...,86,{'Answer': ' It is not possible to determine t...
6,financebench_id_04254,AMERICANWATERWORKS_2021_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0001...,https://www.sec.gov/Archives/edgar/data/141063...,2021,metrics-generated,Basing your judgments off of the cash flow sta...,"$1,832.00",Table of Contents\nAmerican Water Works Compan...,8688,{'Answer': ' It is not possible to accurately ...
7,financebench_id_04417,BESTBUY_2019_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...,https://www.sec.gov/Archives/edgar/data/764478...,2019,metrics-generated,What is the year end FY2019 total amount of in...,"$5,409.00",Table of Contents\nConsolidated Balance Sheets...,52,"{'Answer': ' $629 million', 'Reference': 'styl..."
8,financebench_id_03838,BLOCK_2020_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0001...,https://www.sec.gov/Archives/edgar/data/151267...,2020,metrics-generated,What is the FY2019 - FY2020 total revenue grow...,101.50%,"SQUARE, INC.\nCONSOLIDATED STATEMENTS OF OPERA...",86,{'Answer': ' The FY2019 - FY2020 total revenue...
9,financebench_id_07661,BLOCK_2020_10K,https://d18rn0p25nwr6d.cloudfront.net/CIK-0001...,https://www.sec.gov/Archives/edgar/data/151267...,2020,metrics-generated,"Using the cash flow statement, answer the foll...",$382.00,"SQUARE, INC.\nCONSOLIDATED STATEMENTS OF CASH ...",90,{'Answer': ' It is not possible to answer this...


In [11]:
# Persist the benchmark rows and their generated answers as an Excel workbook.
# NOTE(review): the frame's index is written as an extra leading column;
# confirm whether that is wanted.
XBRL_finance_bench_with_RAG_answers_df.to_excel(
    excel_writer='XBRL_json_query_answers.xlsx'
)