In [4]:
import os
import PyPDF2
import markdown2
import json
import pickle
import pandas as pd

from docx import Document as DocxDocument
from pptx import Presentation

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings.base import OpenAIEmbeddings

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Rinats well documented sample project
# https://github.com/trustbit/private-poc-fmw-content-generator/tree/main/backend-and-kb/src/fmw

## Get Companies

In [6]:
df = pd.read_csv("data/dataset.csv") 
df

Unnamed: 0,sha1,date,name,size
0,ce9e5024041b2ece2bafa2a9d9516bb174ee8949,2022-10-31,"Anixa Biosciences, Inc.",3996701
1,f71415f9ca0cff70e5fa193616b6197f361130ed,2023-02-21,"Maravai LifeSciences Holdings, Inc.",4033642
2,4a9d2b853e05970776121a810460f0962a18c5a1,2022-XX,KLA Corporation,1181894
3,f973dd219c534accb0d4e72d8e12f51284d48d10,2023-01-01,"Ameresco, Inc.",10648267
4,4e27f4c3402c657d548760cb3a164b036cefaabb,2022-12-31,Battery Minerals Limited,3650701
...,...,...,...,...
7490,a2afcd8165a6dbd0058682680b65d3638a5800eb,2023-02-02,"Arrow Electronics, Inc",922750
7491,215df84494756bd4feebc973657835ef7f14ee16,2022-12-31,Synertec,10474497
7492,20fb970d8705289e835b408c575351295ac16f5f,2022-09-30,TE Connectivity,5753097
7493,c7f3a8c0a38c756438950ce3085076adb4241a32,2022-01-01,OTC_ADDDF,12603054


In [32]:
companies = ["Ethernity Networks Ltd", "Limbach Holdings, Inc.", "Accuray Incorporated"]

if companies:
    df_sample = df[df.name.isin(companies)]
else:
    df_sample = df.copy()

In [36]:
df_sample

Unnamed: 0,sha1,date,name,size
1183,99be213e4e689294ebae809bfa6a1b5024076286,2022-01-01,"Limbach Holdings, Inc.",2023552
5801,e51b7204b91cbe7709bd3218e7d2d0c2b8dbb438,2023-01-01,Ethernity Networks Ltd,1180978
6859,dd78f748262b8ffa62de6484143ff55b38af24c7,2022-06-30,Accuray Incorporated,3317389


In [53]:
df_sample["size"].sum()

6521919

## Get company name with fuzzy matching

In [77]:
import re
from fuzzywuzzy import fuzz

def normalize_name(name):
    # Normalize by converting to lowercase and removing special characters
    return re.sub(r'[^a-z0-9\s]', '', name.lower()).strip()

def fuzzy_match(companies, input_string, threshold=80):
    matched_companies = []
    norm_input = normalize_name(input_string)
    for company in companies:
        norm_company = normalize_name(company)
        
        # Fuzzy match each token
        for token in norm_input.split():
            match_score = fuzz.ratio(norm_company, token)
            if match_score >= threshold:
                matched_companies.append(company)
                break  # Stop once a match is found for this company

    return matched_companies

# Example Usage
companies = [
    "Inozyme Pharma, Inc.", 
    "XRF Scientific Limited", 
    "ASEANA PROPERTIES LIMITED", 
    "abrdn European Logistics Income plc"
]

input_string = "inozyme pharma; abrdn eur logistic; xrf sci ltd"

matches = fuzzy_match(df.name, input_string)
print(matches)

['Pharmaxis', 'TD', 'ID Logistics', 'PharmaMar']


## Load LLM data

In [33]:
folder_path = r"C:\Users\felix.krause\code\trustbit\enterprise-rag-challenge\samples"

# embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2" # Hugging Face model

# load key from json
with open("secrets.json") as f:
    keys = json.load(f)
    openai_key = keys["OPENAI_API_KEY"]

In [43]:
# Function to load text from different file types
def load_text_from_file(file_path):
    _, file_extension = os.path.splitext(file_path)
    text = ""
    
    if file_extension == ".pdf":
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f, strict=False)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    else:
        pass
    
    return text


# Function to load documents from a folder
def load_documents_from_folder(folder_path, companies=[]):
    documents = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file[:-4] in companies: #or len(companies) == 0:
                print("Loading", file)
                file_path = os.path.join(root, file)

                text = load_text_from_file(file_path)
                if text:
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
    return documents

# Load documents from the specified folder
documents = load_documents_from_folder(folder_path, df_sample.sha1.to_list())

FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead


Loading 99be213e4e689294ebae809bfa6a1b5024076286.pdf


FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead
FloatObject (b'0.00-40') invalid; use 0.0 instead


Loading dd78f748262b8ffa62de6484143ff55b38af24c7.pdf
Loading e51b7204b91cbe7709bd3218e7d2d0c2b8dbb438.pdf


In [44]:
# Create vector store
# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

if len(texts) == 0:
    raise ValueError("No text found in the specified folder")

In [45]:
embeddings = OpenAIEmbeddings(api_key=openai_key) # deprecated?
db = FAISS.from_documents(texts, embeddings)
retriever = db.as_retriever()

In [54]:
# Store vector database
# with open("data/first_sample_db_3", "wb") as f:
#     pickle.dump(db.serialize_to_bytes(), f)

In [8]:
# Load vector database
# with open("data/vector_db_uni", "rb") as f:
#     db_bytes = pickle.load(f)
#     db = FAISS.deserialize_from_bytes(db_bytes, OpenAIEmbeddings(api_key=openai_key),
#                                       allow_dangerous_deserialization=True)
#     retriever = db.as_retriever()

## Define LLM

In [55]:
llm = ChatOpenAI(api_key=openai_key, temperature=0)
# llm = llm.bind(logprobs=True, top_logprobs=3)
# could try local models as well

## Inference with chain

In [70]:
# load system prompt from .md
with open("data/system_prompt.md") as f:
    system_prompt_txt = f.read()

system_prompt = (
    system_prompt_txt + "\n"
    
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [71]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

# Function to ask a question
def ask_question(query):
    response = chain.invoke({"input": query})
    answer = response["answer"]
    print(f"Question: {query}\nAnswer: {answer}")

```python
    ('What was the {fin_metric} of "{company}" in {time_frame}?', "number"),
    ('How much did "{company}" spend on {focus_area} in {time_frame}?', "number"),
    ('What was the {ratio_or_metric} of "{company}" in {time_frame}?', "number"),
    ('How many {count_metric} did "{company}" have in {time_frame}?', "number"),
    ('Which company had a higher {fin_metric}: "{company1}", "{company2}" or "{company3}", in {time_frame}?', "name"),
    ('Did "{company1}" have a greater {ratio_or_metric} than "{company2}" in {time_frame}?', "boolean"),
    ('How much more did "{company1}" spend on {focus_area} compared to "{company2}" in {time_frame}?', "number"),
    ('Who is the {role} in the company "{company}"?', "name"),
```

In [72]:
# Ask a question
ask_question("How many assets does 'Accuray Incorporated' have in 2022 in dollars?") # 472.849 

Question: How many assets does 'Accuray Incorporated' have in 2022 in dollars?
Answer: 350890


In [73]:
ask_question("How many assets does 'Accuray Incorporated' have in 2021 in dollars?") # 480.098

Question: How many assets does 'Accuray Incorporated' have in 2021 in dollars?
Answer: 352773


In [74]:
ask_question("How many liabilities does 'Accuray Incorporated' have in 2021 in dollars?") # 411.258

Question: How many liabilities does 'Accuray Incorporated' have in 2021 in dollars?
Answer: n/a


In [78]:
ask_question("How many stores did 'Strike Energy Limited' have in the end of fiscal year 2021?") # n/a

Question: How many stores did 'Strike Energy Limited' have in the end of fiscal year 2021?
Answer: n/a


In [83]:
# Total R&D Expenses of Ethernity Networks
ask_question("What are the total R&D expenses of 'Ethernity Networks' in 2021?") # 5 550 912

ask_question("What are the total R&D expenses of 'Ethernity Networks' in 2022?") # 6 618 795

Question: What are the total R&D expenses of 'Ethernity Networks' in 2021?
Answer: 5550912
Question: What are the total R&D expenses of 'Ethernity Networks' in 2022?
Answer: 6618795
