In [1]:
import boto3
import time
import os
import re
from decimal import Decimal, InvalidOperation

from dotenv import load_dotenv

from SeekerTruther import Seeker, Truther
from KnowledgeGraph import KnowledgeGraph
from SeekerTruther import Seeker, Truther
from RewardFunction import RewardFunction

In [2]:
# Load variables from a local .env file into the process environment so the
# AWS credentials read below are available.
load_dotenv()

# S3 bucket and object key; these are only referenced by the (currently
# commented-out) start_document_analysis call in the next cell.
bucket_name = 'knowledge-graph-test-examples'
document_name = 'KPMG-TL-FDD-Report-Aug-2020.pdf'

# Textract client with explicit credentials; a missing environment variable
# raises KeyError here rather than failing later on the first API call.
textract = boto3.client(
    'textract',
    region_name='ap-southeast-2',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)


In [3]:
#response = textract.start_document_analysis(
#    DocumentLocation={'S3Object': {'Bucket': bucket_name, 'Name': document_name}},
#    FeatureTypes=['TABLES', 'LAYOUT']
#)
#
#job_id = response['JobId']
#print(f"Started Textract job with JobId: {job_id}")
#
#while True:
#    status = textract.get_document_analysis(JobId=job_id)
#    status_value = status['JobStatus']
#    if status_value in ['SUCCEEDED', 'FAILED']:
#        print(f"Job status: {status_value}")
#        break
#    print("Waiting for job to complete...")
#    time.sleep(5)
#
#all_blocks = []
#if status_value == 'SUCCEEDED':
#    next_token = None
#    while True:
#        response = textract.get_document_analysis(JobId=job_id, NextToken=next_token) if next_token else textract.get_document_analysis(JobId=job_id)
#        all_blocks.extend(response['Blocks'])
#        if 'NextToken' not in response:
#            break
#        next_token = response['NextToken']

In [4]:
#job_id = 'ff1bbae7d96c6078c61694a7b0df46bd4a37abc7595d3797c2f4d444e8e96046'
#job_id = "d15fce4acd2dba24af37f03e7d82f97cf2b0b73d2778309415f6a140b1dad0e0"

# Ids of already-completed Textract analysis jobs whose results are fetched below.
job_ids = [
    "b4a68e56c2764095d6e9fd5c0621ff9e4e849f03101c7b575fc500b75e5bb88d",
    "acb3d623f9aba1aa7024f941be24b2dd707c4beb1edafd685eb7709f4af2fd42",
]

# Blocks from every page of every job, in job order.
all_blocks = []
for job_id in job_ids:
    # A pagination token belongs to the job that issued it, so every job must
    # start without one; reusing the previous job's last token would send a
    # stale NextToken with the new JobId.
    next_token = None
    while True:
        request = {'JobId': job_id}
        if next_token:
            request['NextToken'] = next_token
        response = textract.get_document_analysis(**request)
        all_blocks.extend(response['Blocks'])

        # No (or empty) NextToken means this was the last page for the job.
        next_token = response.get('NextToken')
        if not next_token:
            break



In [5]:
def is_number_like(value: str) -> bool:
    """Return True if ``value`` reads as a finite number.

    After trimming surrounding whitespace the following forms are accepted,
    checked in this order:

    * percentages, e.g. ``7.3%`` or ``+6.4%`` (the part before ``%`` must parse);
    * amounts carrying ``£``, ``$``, ``€`` or ``₹`` symbols and/or commas,
      which are removed before parsing;
    * amounts with a trailing scale word or letter (``million``, ``billion``,
      ``thousand``, ``k``, ``m``, ``b``; case-insensitive), e.g. ``1.5 million``;
    * multiples written with a trailing ``x``/``X``, e.g. ``4.6x``;
    * anything else ``Decimal`` can parse.

    Empty strings, unparseable text and the non-finite spellings ``Decimal``
    understands (``NaN``, ``Infinity``, ...) all yield False.

    Args:
        value: Raw text to classify.

    Returns:
        True when the text is number-like, otherwise False.
    """

    def parses_as_finite(text: str) -> bool:
        # Decimal happily parses "NaN"/"Infinity"; those are not numbers here.
        try:
            return Decimal(text).is_finite()
        except InvalidOperation:
            return False

    value = value.strip()
    if not value:
        return False

    # Percentages are decided before currency symbols and commas are removed.
    if value.endswith('%'):
        return parses_as_finite(value[:-1])

    # Drop currency symbols and thousands separators.
    for symbol in '£$€₹,':
        value = value.replace(symbol, '')

    # Scale suffixes: only the trailing suffix is cut off. Removing every
    # occurrence would turn e.g. "5mm" or "2bb" into a plain number.
    lowered = value.lower()
    for suffix in ('million', 'billion', 'thousand', 'k', 'm', 'b'):
        if lowered.endswith(suffix):
            return parses_as_finite(lowered[:-len(suffix)].strip())

    # "4.6x" style multiples.
    if value.endswith(('x', 'X')):
        return parses_as_finite(value[:-1])

    return parses_as_finite(value)
    
def normalize_number(value: str) -> Decimal:
    """Convert number-like text into a ``Decimal`` so differently written
    amounts can be compared.

    Handling, in order, after trimming surrounding whitespace:

    * ``N%`` -> ``N / 100``;
    * leading ``+`` signs, the symbols ``£ $ € ₹`` and commas are removed;
    * a trailing ``million``/``billion``/``thousand``/``k``/``m``/``b``
      (case-insensitive) multiplies the remaining number by that scale;
    * a trailing ``x``/``X`` is dropped (``4.6x`` -> ``4.6``);
    * otherwise the text is parsed as-is.

    Args:
        value: Raw text to normalise.

    Returns:
        The parsed value, or ``Decimal('NaN')`` when the text is empty, cannot
        be parsed, or parses to a non-finite value. NaN never compares equal
        to anything, so failed parses cannot produce a match.
    """
    not_a_number = Decimal('NaN')

    def to_decimal(text: str) -> Decimal:
        try:
            parsed = Decimal(text)
        except InvalidOperation:
            return not_a_number
        # Collapse Infinity and NaN spellings onto the single failure sentinel.
        return parsed if parsed.is_finite() else not_a_number

    value = value.strip()
    if not value:
        return not_a_number

    # Quiet NaN propagates through the arithmetic below, so a failed parse
    # still comes back as NaN after scaling.
    if value.endswith('%'):
        return to_decimal(value[:-1]) / 100

    value = value.lstrip('+')
    for symbol in '£$€₹,':
        value = value.replace(symbol, '')

    multipliers = {
        'million': 1_000_000,
        'billion': 1_000_000_000,
        'thousand': 1_000,
        'k': 1_000,
        'm': 1_000_000,
        'b': 1_000_000_000,
    }
    lowered = value.lower()
    for suffix, multiplier in multipliers.items():
        if lowered.endswith(suffix):
            # Cut only the trailing suffix; replacing every occurrence would
            # read "5mm" as 5 million.
            return to_decimal(lowered[:-len(suffix)].strip()) * multiplier

    if value.endswith(('x', 'X')):
        return to_decimal(value[:-1])

    return to_decimal(value)

In [None]:
# Seekers and truths are selected by the same predicate: the item's 'Text'
# must look numeric according to is_number_like.
kg = KnowledgeGraph(
    is_seeker=lambda x: is_number_like(x['Text']),
    is_truth=lambda x: is_number_like(x['Text']),
    textract_obj=all_blocks,
)

def seeker_and_truther_have_identical_value(seeker: Seeker, truther: Truther, knowledge_graph):
    """Attribute: the seeker's ``value`` equals the truther's ``value`` exactly.

    ``knowledge_graph`` is accepted but not used by this attribute.
    """
    seeker_value, truther_value = seeker.value, truther.value
    return seeker_value == truther_value

def seeker_and_truther_have_similar_values(seeker: Seeker, truther: Truther, knowledge_graph):
    """Attribute: both values are equal once passed through ``normalize_number``.

    This lets differently written amounts (e.g. ``+7.3%`` and ``7.3%``) match.
    ``knowledge_graph`` is accepted but not used by this attribute.
    """
    return normalize_number(seeker.value) == normalize_number(truther.value)

def seeker_and_truther_same_page(seeker: Seeker, truther: Truther, knowledge_graph):
    """Attribute: the seeker and the truther report the same ``page_num``.

    ``knowledge_graph`` is accepted but not used by this attribute.
    """
    seeker_page, truther_page = seeker.page_num, truther.page_num
    return seeker_page == truther_page

def seekers_in_same_para_truthers_in_same_table_count(count):
    """Build an attribute parameterised by ``count`` (work in progress).

    The returned callable has the same ``(seeker, truther, knowledge_graph)``
    signature as the other attribute functions in this notebook. Its logic has
    not been written yet, so calling it raises ``NotImplementedError`` instead
    of silently producing a result.

    Args:
        count: Number of similar seeker/truther pairs that should trigger a
            match once the attribute is implemented.

    Returns:
        The (not yet implemented) attribute callable.
    """
    def matching(seeker: "Seeker", truther: "Truther", knowledge_graph):
        # Planned algorithm:
        #   1. Get the paragraph that the seeker is in.
        #   2. Get all the seekers that are in that paragraph.
        #   3. Get how many in that paragraph are also in the table.
        #   4. If the number of similar ones (those with an exact copy) is
        #      equal to count, then the match occurs.
        raise NotImplementedError(
            "seekers_in_same_para_truthers_in_same_table_count is not implemented yet"
        )

    return matching


# Attribute functions handed to the knowledge graph; each one takes
# (seeker, truther, knowledge_graph).
attribute_list = [
    seeker_and_truther_have_identical_value,
    seeker_and_truther_have_similar_values,
    seeker_and_truther_same_page,
]

# Hand-made labels (one id mapped to a list of ids) passed alongside the
# attributes; the result feeds the reward function.
calculated_probabilities = kg.calculate_probabilities(
    attribute_list=attribute_list,
    labels={
        "c6e2331e-6b14-4b65-bce7-100df7879e27": ["3b2e21f7-d15a-4395-aeb4-0a5d39cef1f1"],
        "6a900196-08d9-4bed-9372-6e4ec48ff557": ["ecb8e6ff-b9a1-4ba0-be42-747ea8fb0448"],
        "39eaf6ce-908b-4357-b011-33495abc2312": ["befefef0-dcb2-4d0c-879e-98b8ea17521c"],
        "a7d5e164-bbd5-4ac1-a4e0-e3c7f1003032": ["1af18482-1d2c-4a8e-8558-01d8f5eded51"],
        "97a6837d-46a2-4a07-a2f6-41a24fdc64d5": ["eda780ae-0a32-4acb-8db9-5358970cd1e9"],
        "791f5dfd-0803-49e5-a042-f05bd13cfcea": ["448d44e9-906a-408c-bb96-9612505b987d"],
        "c7612bc9-423c-4ac6-b097-e1c5865151f8": ["3b2e21f7-d15a-4395-aeb4-0a5d39cef1f1"],
        "7bd524c3-ff16-48d2-a10a-a965bf2a124f": ["ecb8e6ff-b9a1-4ba0-be42-747ea8fb0448"],
        "fe5bc0ab-5021-4f7f-8f3f-885468216175": ["befefef0-dcb2-4d0c-879e-98b8ea17521c"],
    },
)
reward_function = RewardFunction(calculated_probabilities)

In [7]:
labels = {
    "c6e2331e-6b14-4b65-bce7-100df7879e27": ["3b2e21f7-d15a-4395-aeb4-0a5d39cef1f1"],
    "6a900196-08d9-4bed-9372-6e4ec48ff557": ["ecb8e6ff-b9a1-4ba0-be42-747ea8fb0448"],
    "39eaf6ce-908b-4357-b011-33495abc2312": ["befefef0-dcb2-4d0c-879e-98b8ea17521c"],
    "a7d5e164-bbd5-4ac1-a4e0-e3c7f1003032": ["1af18482-1d2c-4a8e-8558-01d8f5eded51"],
    "97a6837d-46a2-4a07-a2f6-41a24fdc64d5": ["eda780ae-0a32-4acb-8db9-5358970cd1e9"],
    "791f5dfd-0803-49e5-a042-f05bd13cfcea": ["448d44e9-906a-408c-bb96-9612505b987d"],
    "c7612bc9-423c-4ac6-b097-e1c5865151f8": ["3b2e21f7-d15a-4395-aeb4-0a5d39cef1f1"],
    "7bd524c3-ff16-48d2-a10a-a965bf2a124f": ["ecb8e6ff-b9a1-4ba0-be42-747ea8fb0448"],
    "fe5bc0ab-5021-4f7f-8f3f-885468216175": ["befefef0-dcb2-4d0c-879e-98b8ea17521c"],
}

# Human-readable view of the labels: the text of each labelled node mapped to
# the text reached from each of its labelled ids (first id of that node's
# first relationship).
h = {}
for x, truth_ids in labels.items():
    h[kg.nodes[x]['Text']] = [
        kg.nodes[kg.nodes[y]['Relationships'][0]['Ids'][0]]['Text']
        for y in truth_ids
    ]

for i, j in h.items():
    print(i, j)

+7.3% ['7.3%']
+6.4% ['6.4%']
+4.3%, ['4.3%']
+9.5%, ['9.5%']
+12.5%, ['12.5%']
+14.4%, ['14.4%']
7.3% ['7.3%']
6.4% ['6.4%']
4.3% ['4.3%']


In [8]:
# Display the value returned by kg.calculate_probabilities(...).
calculated_probabilities

{8: 0.0, 10: 1.0, 11: 0.75}

In [9]:
# Text of the graph node behind every seeker, in kg.seekers order.
[kg.nodes[seeker.Id]['Text'] for seeker in kg.seekers]

['2024',
 '2024.',
 '2024',
 '+7.3%',
 '+6.4%',
 '2024',
 '£500',
 '£1',
 '$1.25',
 '31',
 '2024',
 '1',
 '2024',
 '7.3%',
 '6.4%',
 '4.3%',
 '2']

In [10]:
# Print every seeker/truth pair whose values agree after normalisation.
for s in kg.seekers:
    for t in kg.truths:
        if not seeker_and_truther_have_similar_values(s, t, kg):
            continue
        print(s.value, t.value)

+7.3% 7.3%
+6.4% 6.4%
7.3% 7.3%
6.4% 6.4%
4.3% 4.3%
4.3% 4.3%


In [13]:
# Print, per seeker, the ids of its parent, grandparent and closest layout block.
for s in kg.seekers:
    ancestry = (s.parent_id, s.grandparent_id, s.closest_layout_block_id)
    print(ancestry)

('c599654d-5636-4c27-8f4c-a4cacc4eaa80', '4903b275-b4a7-48f6-8088-dd4760d3c2e4', '4903b275-b4a7-48f6-8088-dd4760d3c2e4')
('031ee6bf-6696-49a1-8ca8-11ab05257fba', '4903b275-b4a7-48f6-8088-dd4760d3c2e4', '4903b275-b4a7-48f6-8088-dd4760d3c2e4')
('3e53e644-b2ff-467b-906a-5695a2bde8b9', 'd6f0f6cf-0d82-4196-949c-cf9dd7457baa', '4903b275-b4a7-48f6-8088-dd4760d3c2e4')
('e909d394-b371-4943-a6dc-17ca0d8ad794', '4903b275-b4a7-48f6-8088-dd4760d3c2e4', '4903b275-b4a7-48f6-8088-dd4760d3c2e4')
('e909d394-b371-4943-a6dc-17ca0d8ad794', '4903b275-b4a7-48f6-8088-dd4760d3c2e4', '4903b275-b4a7-48f6-8088-dd4760d3c2e4')
('85e85045-1d5a-4c6f-ae55-13e87ce873e2', 'b7323878-1699-4949-9c05-b81e7256c141', 'b7323878-1699-4949-9c05-b81e7256c141')
('88fb0b8b-139e-42c7-bb53-f3e6ac9a473a', '5474e705-4e13-4782-ae0c-9cb5045b513a', '5474e705-4e13-4782-ae0c-9cb5045b513a')
('88fb0b8b-139e-42c7-bb53-f3e6ac9a473a', '5474e705-4e13-4782-ae0c-9cb5045b513a', '5474e705-4e13-4782-ae0c-9cb5045b513a')
('57ade3ed-633e-4f62-8054-74a0d4