In [18]:
import yaml, os
import tiktoken
import numpy as np
import pandas as pd
import nest_asyncio
nest_asyncio.apply()

import tensorflow as tf
from llama_index.core import (
                            PromptTemplate,
                            Settings
                            )
from IPython.display import Markdown, display
from llama_index.readers.smart_pdf_loader import SmartPDFLoader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq

In [2]:
with open("secrets.yaml", 'r') as stream:
    try:
        secrets = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        
tiktoken_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-instruct")
os.environ['LLAMA_CLOUD_API_KEY'] = secrets['LLAMA_CLOUD_API_KEY']
os.environ['GROQ_API_KEY'] = secrets['GROQ_API_KEY']

In [6]:
pdf_file_name = "data/AnnualReports/PDF/Agricultural - Madulsima 22,23.pdf"
pdf_loader = SmartPDFLoader(llmsherpa_api_url="https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all")
documents = pdf_loader.load_data(pdf_file_name)

doc_str = '\n'.join([doc.text for doc in documents])

n_tokens = len(tiktoken_tokenizer.encode(doc_str))
display(Markdown(f"Number of tokens in the document: {n_tokens}"))
display(Markdown(doc_str))

In [7]:
Settings.llm = Groq(model="llama3-70b-8192")
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [9]:
template = (
            "The provided context related to ANNUAL REPORTS of different companies in different industries.\n"
            "The document may also contain many tables, images, and other relevant information. The document may contain multiple sections.\n" 
            "---------------------\n"
            "{context_str}"
            "\n---------------------\n"
            "The task is to extract the following information from the context_str in below JSON format.\n"
            "Please make sure that, if you can't find the information, please leave the field empty. Do not mention anything else in that field\n"
            "---------------------\n"
            "{json_ir_format}"
            "\n---------------------\n"
            )

information_retrieval_json = {
                            "Revenue": "",
                            "Net Profit": "",
                            "Gross Profit": "",
                            "Total Assets": "",
                            "Total Liabilities": "",
                            "Equity": ""
                            }
qa_template = PromptTemplate(template)

messages = qa_template.format_messages(
                                    context_str=f"{doc_str}", 
                                    json_ir_format=f"{information_retrieval_json}"
                                    )

n_tokens = len(tiktoken_tokenizer.encode(messages[0].content))
display(Markdown(f"Number of tokens in the prompt with Context: {n_tokens}"))

In [None]:
# display(Markdown(Settings.llm.chat(messages).message.content))

In [3]:
def generate_dummy_data(pdf_file_name):
    try:
        response = Settings.llm.chat(messages)
        values_23 = response.message.content.split("\n")
    except:
        values_23 = {                    
                    "Revenue[Mil]": [np.random.randint(1000, 10000)],
                    "Net Profit[Mil]": [np.random.randint(100, 1000)],
                    "Gross Profit[Mil]": [np.random.randint(100, 1000)],
                    "Total Assets[Mil]": [np.random.randint(1000, 10000)],
                    "Total Liabilities[Mil]": [np.random.randint(1000, 10000)],
                    "Equity[Mil]": [np.random.randint(1000, 10000)]
                    }
        
    values_22 = {                    
                "Revenue[Mil]": [np.random.randint(1000, 10000)],
                "Net Profit[Mil]": [np.random.randint(100, 1000)],
                "Gross Profit[Mil]": [np.random.randint(100, 1000)],
                "Total Assets[Mil]": [np.random.randint(1000, 10000)],
                "Total Liabilities[Mil]": [np.random.randint(1000, 10000)],
                "Equity[Mil]": [np.random.randint(1000, 10000)]
                }

    values_21 = {
                "Revenue[Mil]": [np.random.randint(1000, 10000)],
                "Net Profit[Mil]": [np.random.randint(100, 1000)],
                "Gross Profit[Mil]": [np.random.randint(100, 1000)],
                "Total Assets[Mil]": [np.random.randint(1000, 10000)],
                "Total Liabilities[Mil]": [np.random.randint(1000, 10000)],
                "Equity[Mil]": [np.random.randint(1000, 10000)]
                }       

    values_20 = {
                "Revenue[Mil]": [np.random.randint(1000, 10000)],
                "Net Profit[Mil]": [np.random.randint(100, 1000)],
                "Gross Profit[Mil]": [np.random.randint(100, 1000)],
                "Total Assets[Mil]": [np.random.randint(1000, 10000)],
                "Total Liabilities[Mil]": [np.random.randint(1000, 10000)],
                "Equity[Mil]": [np.random.randint(1000, 10000)]
                }

    values_19 = {
                "Revenue[Mil]": [np.random.randint(1000, 10000)],
                "Net Profit[Mil]": [np.random.randint(100, 1000)],
                "Gross Profit[Mil]": [np.random.randint(100, 1000)],
                "Total Assets[Mil]": [np.random.randint(1000, 10000)],
                "Total Liabilities[Mil]": [np.random.randint(1000, 10000)],
                "Equity[Mil]": [np.random.randint(1000, 10000)]
                }   

    values_18 = {
                "Revenue[Mil]": [np.random.randint(1000, 10000)],
                "Net Profit[Mil]": [np.random.randint(100, 1000)],
                "Gross Profit[Mil]": [np.random.randint(100, 1000)],
                "Total Assets[Mil]": [np.random.randint(1000, 10000)],
                "Total Liabilities[Mil]": [np.random.randint(1000, 10000)],
                "Equity[Mil]": [np.random.randint(1000, 10000)]
                }

    data_18 = pd.DataFrame(values_18)
    data_19 = pd.DataFrame(values_19)
    data_20 = pd.DataFrame(values_20)
    data_21 = pd.DataFrame(values_21)
    data_22 = pd.DataFrame(values_22)
    data_23 = pd.DataFrame(values_23)

    df = pd.concat([data_18, data_19, data_20, data_21, data_22, data_23], axis=0)
    df['Year'] = [2018, 2019, 2020, 2021, 2022, 2023]
    df.to_csv(pdf_file_name.replace('/PDF/', '/CSV/').replace('.pdf', '.csv'), index=False)

In [4]:
for pdf_file_name in os.listdir("data/AnnualReports/PDF/"):
    pdf_file_name = f"data/AnnualReports/PDF/{pdf_file_name}"
    generate_dummy_data(pdf_file_name)

In [13]:
def build_ts_on_csv(
                    csv_file_name,
                    selective_windows = 3,
                    prediction_window = 1
                    ):
    df = pd.read_csv(csv_file_name)
    df = df.sort_values(
                        by='Year', 
                        ascending=True
                        )
    df = df.set_index('Year')

    x, y = [], []
    for i in range(len(df) - selective_windows - prediction_window + 1):
        x.append(df.iloc[i:i+selective_windows].values)
        y.append(df.iloc[i+selective_windows:i+selective_windows+prediction_window].values)
    return x, y

def build_ts_data(csv_dir = "data/AnnualReports/CSV/"):
    X, Y = [], []
    for csv_file_name in os.listdir(csv_dir):
        csv_file_name = f"{csv_dir}{csv_file_name}"
        x, y = build_ts_on_csv(csv_file_name)
        X.extend(x)
        Y.extend(y)

    X = np.array(X)
    Y = np.array(Y)

    return X, Y

In [17]:
X, Y = build_ts_data()
print("X Shape: ", X.shape)
print("Y Shape: ", Y.shape)

X Shape:  (15, 3, 6)
Y Shape:  (15, 1, 6)


In [35]:
model = tf.keras.models.Sequential([
                                    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X.shape[1], X.shape[2])),
                                    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(Y.shape[2]))  
                                    ])    
model.summary()

model.compile(
            optimizer='adam', 
            loss='mse'
            )
model.fit(
        X, Y, 
        batch_size=2,
        epochs=1000
        )

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 31062822.0000  
Epoch 2/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 18748786.0000 
Epoch 3/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 14534371.0000 
Epoch 4/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 8834283.0000  
Epoch 5/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 6990204.0000  
Epoch 6/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 5551244.5000 
Epoch 7/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 4492342.0000 
Epoch 8/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 4429260.0000 
Epoch 9/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 4634775.5000 
Epoch 10/1000
[1m8/8[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x14965db9240>

In [36]:
model.save("models/Annual/annual_report_model.h5")



In [49]:
def predict_on_industry(industry):
    csv_dir = f"data/AnnualReports/CSV/{industry}.csv"
    df = pd.read_csv(csv_dir)
    df = df.sort_values(
                        by='Year', 
                        ascending=True
                        )
    df = df.tail(n=3)
    del df['Year']
    
    X = df.values.reshape(1, df.shape[0], df.shape[1])
    P = model.predict(X).squeeze()

    avg = df.mean().values
    # Calculate the percentage change
    return {
            "Revenue": f"{(P[0] - avg[0]) / avg[0] * 100} %",
            "Net Profit": f"{(P[1] - avg[1]) / avg[1] * 100} %",
            "Gross Profit": f"{(P[2] - avg[2]) / avg[2] * 100} %",
            "Total Assets": f"{(P[3] - avg[3]) / avg[3] * 100} %",
            "Total Liabilities": f"{(P[4] - avg[4]) / avg[4] * 100} %",
            "Equity": f"{(P[5] - avg[5]) / avg[5] * 100} %"
            }
    

In [50]:
predict_on_industry('Agricultural Raw materials')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


{'Revenue': '82.13988430001305 %',
 'Net Profit': '1.2994946090112047 %',
 'Gross Profit': '188.6000721490205 %',
 'Total Assets': '-25.15346092636227 %',
 'Total Liabilities': '-46.97727902066841 %',
 'Equity': '-35.83289807438323 %'}