## Convert JSON to PDF

In [1]:
import json
from fpdf import FPDF
import os

In [2]:
# Folder (relative to the notebook's working directory) that holds the
# per-user statement files; the cells below read the JSON from it and write
# the generated PDF back into it.
path = "bankstatement"

In [3]:
# Identifier of the statement to convert; it is also the base name of the
# input JSON file and of the PDF written later.
userId = "0001"
jsonPath = os.path.join(path, f"{userId}.json")

# Parse the statement inside a context manager so the file handle is closed
# as soon as loading finishes. JSON text is read as UTF-8 rather than with
# the platform's locale encoding.
with open(jsonPath, encoding="utf-8") as json_file:
    data = json.load(json_file)

In [4]:
def draw_table_headers(pdf, widths=None):
    """Draw the shaded header row of the statement table.

    Emits five bordered, centred, filled cells ("Date", "Description",
    "Debit", "Credit", "Balance") at the current cursor position. The last
    cell is drawn with ln=1, so the cursor ends up at the start of the next
    line.

    Args:
        pdf: Object exposing ``set_fill_color`` and ``cell`` (an ``FPDF``
            instance in this notebook).
        widths: Optional sequence with one width per column, in the order
            listed above. When omitted, the notebook-level ``col_widths``
            is used, so existing ``draw_table_headers(pdf)`` calls keep
            working.

    Raises:
        NameError: If ``widths`` is omitted and ``col_widths`` has not been
            defined yet.
        IndexError: If fewer than five widths are supplied.
    """
    if widths is None:
        # Fall back to the global looked up at call time, as before.
        widths = col_widths

    titles = ("Date", "Description", "Debit", "Credit", "Balance")
    last_index = len(titles) - 1

    pdf.set_fill_color(200, 220, 255)
    for index, title in enumerate(titles):
        # Only the final cell breaks the line (ln=1); the others stay on the row.
        line_break = 1 if index == last_index else 0
        pdf.cell(widths[index], 10, title, 1, line_break, "C", True)

In [5]:
# Table layout. col_widths is also read by draw_table_headers().
col_widths = [30, 60, 30, 30, 30]  # date, description, debit, credit, balance
ROW_HEIGHT = 10
PAGE_BREAK_Y = 200  # once the cursor is past this y position, start a new page

pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

# Centred title, a blank gap, then the header row of the first page.
pdf.cell(200, 10, txt="Bank Statement", ln=True, align="C")
pdf.ln(10)
draw_table_headers(pdf)

for transaction in data:
    if pdf.get_y() > PAGE_BREAK_Y:
        # Continue the table on a fresh page and repeat the header row there.
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        draw_table_headers(pdf)

    # (text, alignment) per column, in col_widths order. The amount fields
    # are converted with str(); date and description are passed through
    # unchanged.
    row_cells = (
        (transaction["date"], "C"),
        (transaction["description"], "L"),
        (str(transaction["debit"]), "C"),
        (str(transaction["credit"]), "C"),
        (str(transaction["balance"]), "C"),
    )
    last_column = len(row_cells) - 1
    for column, (text, align) in enumerate(row_cells):
        # Only the final cell of the row moves the cursor to the next line.
        line_break = 1 if column == last_column else 0
        pdf.cell(col_widths[column], ROW_HEIGHT, text, 1, line_break, align)

# Write the PDF next to the source JSON, named after the user id.
outputPath = path
pdf.output(os.path.join(outputPath, f"{userId}.pdf"))

''

## PDF Loader

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Locate the statement PDF for the current user inside the data folder.
pdfName = f"{userId}.pdf"
pdfPath = os.path.join(path, pdfName)

# Read the PDF back and split it into Document chunks for later steps.
loader = PyPDFLoader(pdfPath)
pages = loader.load_and_split()

In [90]:
# Show the loaded Document objects (extracted page text plus source/page metadata).
pages

[Document(page_content='Bank Statement\nDate Description Debit Credit Balance\n2020-01-01 groceries 450.0 0.0 1000.0\n2020-01-02 salary 0.0 2000.0 3000.0\n2020-01-03 dinner 1000.0 0.0 2000.0\n2020-01-04 friend debt 0.0 6000.0 8000.0\n2020-01-01 groceries 450.0 0.0 1000.0\n2020-01-02 salary 0.0 2000.0 3000.0\n2020-01-03 dinner 1000.0 0.0 2000.0\n2020-01-04 friend debt 0.0 6000.0 8000.0\n2020-01-05 utilities 300.0 0.0 7700.0\n2020-01-06 rent 1200.0 0.0 6500.0\n2020-01-07 movie tickets 50.0 0.0 6450.0\n2020-01-08 online shopping 500.0 0.0 5950.0\n2020-01-09 car service 200.0 0.0 5750.0\n2020-01-10 medical expenses 800.0 0.0 4950.0\n2020-01-11 groceries 450.0 0.0 4500.0\n2020-01-12 salary 0.0 2000.0 6500.0', metadata={'source': 'bankstatement\\0001.pdf', 'page': 0}),
 Document(page_content='Date Description Debit Credit Balance\n2020-01-13 dinner 1000.0 0.0 5500.0\n2020-01-14 friend debt 0.0 6000.0 11500.0\n2020-01-15 utilities 300.0 0.0 11200.0\n2020-01-16 rent 1200.0 0.0 10000.0\n2020-01

In [92]:
# Keep only the extracted text of each loaded chunk, leaving the metadata behind.
page_content = [
    chunk.page_content
    for chunk in pages
]
print(page_content)

['Bank Statement\nDate Description Debit Credit Balance\n2020-01-01 groceries 450.0 0.0 1000.0\n2020-01-02 salary 0.0 2000.0 3000.0\n2020-01-03 dinner 1000.0 0.0 2000.0\n2020-01-04 friend debt 0.0 6000.0 8000.0\n2020-01-01 groceries 450.0 0.0 1000.0\n2020-01-02 salary 0.0 2000.0 3000.0\n2020-01-03 dinner 1000.0 0.0 2000.0\n2020-01-04 friend debt 0.0 6000.0 8000.0\n2020-01-05 utilities 300.0 0.0 7700.0\n2020-01-06 rent 1200.0 0.0 6500.0\n2020-01-07 movie tickets 50.0 0.0 6450.0\n2020-01-08 online shopping 500.0 0.0 5950.0\n2020-01-09 car service 200.0 0.0 5750.0\n2020-01-10 medical expenses 800.0 0.0 4950.0\n2020-01-11 groceries 450.0 0.0 4500.0\n2020-01-12 salary 0.0 2000.0 6500.0', 'Date Description Debit Credit Balance\n2020-01-13 dinner 1000.0 0.0 5500.0\n2020-01-14 friend debt 0.0 6000.0 11500.0\n2020-01-15 utilities 300.0 0.0 11200.0\n2020-01-16 rent 1200.0 0.0 10000.0\n2020-01-17 movie tickets 50.0 0.0 9950.0\n2020-01-18 online shopping 500.0 0.0 9450.0\n2020-01-19 car service 20

## CohereEmbedding + ChromaDB

In [79]:
# from langchain_cohere import CohereEmbeddings
# from langchain_chroma import Chroma

In [80]:
# from dotenv import load_dotenv
# load_dotenv()

True

In [81]:
# dbPath = os.path.join(path)
# dbPath

'bankstatement'

In [82]:
# cohere_api_key = os.getenv("COHERE_API_KEY")
# db = Chroma.from_documents(pages, CohereEmbeddings(cohere_api_key=cohere_api_key), persist_directory=os.path.join(dbPath, "chroma"))

In [78]:
# db.delete_collection()

## Trial Only

In [83]:
# query = CohereEmbeddings().embed_query("how much i spend on movie tickets")
# print(query)

[0.5605469, 1.0410156, 4.6289062, -0.88623047, 2.1386719, 1.2988281, 2.265625, 1.2324219, 0.3762207, -0.6538086, -0.55908203, 0.5966797, 0.34033203, 1.9042969, 2.0371094, 0.23071289, -1.0205078, -0.20043945, 0.13684082, -0.82910156, 0.64404297, 0.9580078, -1.9287109, 0.38793945, -2.6269531, 1.7255859, -0.5571289, -0.76708984, 1.5722656, 1.9824219, 0.7680664, -4.2460938, 0.63916016, -0.82666016, 0.82470703, -0.4555664, 0.36889648, 0.8305664, -3.1875, 0.07598877, -3.6503906, -0.82177734, 1.5957031, 0.6923828, -1.4130859, 0.22485352, 1.7353516, -0.35107422, 0.105285645, 0.44482422, -1.2490234, -2.5292969, 4.390625, -1.1982422, -0.30249023, -0.32495117, 3.2441406, 1.3916016, 0.27416992, -0.28686523, -0.18054199, -2.7539062, 2.4707031, 0.87841797, 0.23132324, -0.8432617, -0.08483887, 2.2402344, 1.8154297, -1.0722656, -0.80029297, 0.24377441, 2.7714844, 1.6416016, -0.31445312, -0.08117676, 1.5371094, 3.2304688, 0.24584961, 1.1308594, -1.0244141, -1.2001953, 0.42456055, 1.9863281, -1.2666016,

In [84]:
# result = db.similarity_search_by_vector(query)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


In [85]:
# print(result[0].page_content)

Date Description Debit Credit Balance
2020-01-31 groceries 450.0 0.0 11500.0
2020-01-01 groceries 450.0 0.0 1000.0
2020-01-02 salary 0.0 2000.0 3000.0
2020-01-03 dinner 1000.0 0.0 2000.0
2020-01-04 friend debt 0.0 6000.0 8000.0
2020-01-05 utilities 300.0 0.0 7700.0
2020-01-06 rent 1200.0 0.0 6500.0
2020-01-07 movie tickets 50.0 0.0 6450.0
2020-01-08 online shopping 500.0 0.0 5950.0
2020-01-09 car service 200.0 0.0 5750.0
2020-01-10 medical expenses 800.0 0.0 4950.0
