In [1]:
import json

# Name of the JSON file to load
json_filename = 'pdf_restric.json'

# Parse the JSON file into a Python object.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(json_filename, 'r', encoding='utf-8') as json_file:
    pdf_restric_loaded = json.load(json_file)

# Display the loaded object as the cell output to verify it
pdf_restric_loaded

{'Abrasives': [33, 34],
 'Aluminum': [35, 36],
 'Antimony': [37, 38],
 'Arsenic': [39, 40],
 'Asbestos': [41, 42],
 'Barite': [43, 44]}

In [2]:
import pdfplumber

def extract_word_positions(pdf_path, pages):
    """Collect the bounding box of every word found on the given PDF pages.

    NOTE: another function with this same name is defined later in this
    cell; after the cell runs, the later definition is the one bound to
    ``extract_word_positions``.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.
        pages: Iterable of zero-based indexes into ``pdf.pages``.

    Returns:
        A list with one dict per word, in the order the pages were given and
        the order ``extract_words()`` returned the words. Each dict has the
        keys ``text``, ``x0``, ``top``, ``x1``, ``bottom`` and
        ``page_number`` (the index plus one).
    """
    kept_fields = ('text', 'x0', 'top', 'x1', 'bottom')

    with pdfplumber.open(pdf_path) as pdf:
        return [
            {
                **{field: word[field] for field in kept_fields},
                'page_number': page_index + 1,  # to reflect the actual page number
            }
            for page_index in pages
            for word in pdf.pages[page_index].extract_words()
        ]

def extract_word_positions(pdf_path, page_number):
    """Return the words pdfplumber extracts from a single page of a PDF.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.
        page_number: One-based page number; page ``n`` is read from
            ``pdf.pages[n - 1]``.

    Returns:
        The list produced by ``page.extract_words()``, unchanged.

    Raises:
        ValueError: If ``page_number`` is smaller than 1.
        IndexError: If ``page_number`` is past the last page of the PDF.
    """
    # Without this guard, page_number=0 would become index -1 and silently
    # return the LAST page of the document instead of failing.
    if page_number < 1:
        raise ValueError(
            f"page_number must be >= 1 (one-based), got {page_number!r}"
        )

    with pdfplumber.open(pdf_path) as pdf:
        # Convert the one-based page number to the zero-based index used by pdf.pages
        page = pdf.pages[page_number - 1]
        return page.extract_words()

def extract_positions_for_elements(pdf_path, elements_pages):
    """Extract the words of the second listed page for every element.

    Args:
        pdf_path: Path forwarded to ``extract_word_positions``.
        elements_pages: Mapping of element name to a sequence of page
            indexes; only the entry at position 1 of each sequence is used.

    Returns:
        A dict keyed by element name. Each value is a dict with the keys
        ``element_name``, ``pages`` (the sequence as given) and ``content``
        (whatever ``extract_word_positions`` returned for that page).
    """
    def _entry_for(name, page_list):
        # The second listed page index, shifted by one because the value is
        # handed over as a page number rather than an index.
        page_number = page_list[1] + 1
        return {
            'element_name': name,
            'pages': page_list,
            'content': extract_word_positions(pdf_path, page_number),
        }

    return {
        name: _entry_for(name, page_list)
        for name, page_list in elements_pages.items()
    }

In [4]:
# Path of the PDF that is passed to the extraction helpers in the cells below
pdf_path = 'mcs2024.pdf'

In [5]:
# Inspect the words returned for page number 33 of the PDF
extract_word_positions(pdf_path, 33)

[{'text': '29',
  'x0': 567.1029897,
  'x1': 575.9989896999999,
  'top': 27.97199999999998,
  'doctop': 23030.30556,
  'bottom': 35.97199999999998,
  'upright': True,
  'height': 8.0,
  'width': 8.895999999999958,
  'direction': 'ltr'},
 {'text': 'Figure',
  'x0': 76.9265,
  'x1': 119.62002200000002,
  'top': 49.28408000000002,
  'doctop': 23051.61764,
  'bottom': 63.264080000000035,
  'upright': True,
  'height': 13.980000000000018,
  'width': 42.693522000000016,
  'direction': 'ltr'},
 {'text': '14.—Relation',
  'x0': 123.42677600000002,
  'x1': 212.00825,
  'top': 49.28408000000002,
  'doctop': 23051.61764,
  'bottom': 63.264080000000035,
  'upright': True,
  'height': 13.980000000000018,
  'width': 88.58147399999999,
  'direction': 'ltr'},
 {'text': 'Between',
  'x0': 215.70596,
  'x1': 273.272804,
  'top': 49.28408000000002,
  'doctop': 23051.61764,
  'bottom': 63.264080000000035,
  'upright': True,
  'height': 13.980000000000018,
  'width': 57.566844,
  'direction': 'ltr'},
 {'te

In [19]:
# Run the per-element extraction over the mapping loaded from pdf_restric.json
positions = extract_positions_for_elements(pdf_path, pdf_restric_loaded)

In [20]:
# Display the entry that was built for the 'Abrasives' key
positions['Abrasives']

{'element_name': 'Abrasives',
 'pages': [33, 34],
 'content': [{'text': '31',
   'x0': 567.1029897,
   'top': 27.97199999999998,
   'x1': 575.9989896999999,
   'bottom': 35.97199999999998},
  {'text': 'ABRASIVES',
   'x0': 214.32,
   'top': 37.452,
   'x1': 284.30400000000003,
   'bottom': 49.452},
  {'text': '(MANUFACTURED)',
   'x0': 287.64,
   'top': 37.452,
   'x1': 397.596,
   'bottom': 49.452},
  {'text': 'Depletion',
   'x0': 45.36504000000002,
   'top': 60.711000000000126,
   'x1': 90.97507800000002,
   'bottom': 70.73100000000011},
  {'text': 'Allowance:',
   'x0': 93.73458600000002,
   'top': 60.711000000000126,
   'x1': 146.56303199999996,
   'bottom': 70.73100000000011},
  {'text': 'None.',
   'x0': 149.27244000000002,
   'top': 60.199980000000096,
   'x1': 175.99978800000002,
   'bottom': 70.21998000000008},
  {'text': 'Government',
   'x0': 45.36,
   'top': 82.66751999999997,
   'x1': 104.318682,
   'bottom': 92.68751999999995},
  {'text': 'Stockpile:',
   'x0': 107.04211

In [21]:
import json

# Name of the JSON file to write
json_filename = 'abrassive_test.json'

# Write only the 'Abrasives' entry of `positions` to the JSON file.
# The encoding is given explicitly so the file does not depend on the
# platform's default locale encoding.
with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(positions['Abrasives'], json_file, indent=4)

print(f"Data saved to {json_filename}")

Data saved to abrassive_test.json


### Detect table based on coordinates

In [26]:
import json

# Name of the JSON file to load
json_filename = 'abrassive_test.json'

# Parse the JSON file into a Python object.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(json_filename, 'r', encoding='utf-8') as json_file:
    test_abrassive = json.load(json_file)

# Display the loaded object as the cell output to verify it
test_abrassive

{'element_name': 'Abrasives',
 'pages': [33, 34],
 'content': [{'text': '31',
   'x0': 567.1029897,
   'top': 27.97199999999998,
   'x1': 575.9989896999999,
   'bottom': 35.97199999999998},
  {'text': 'ABRASIVES',
   'x0': 214.32,
   'top': 37.452,
   'x1': 284.30400000000003,
   'bottom': 49.452},
  {'text': '(MANUFACTURED)',
   'x0': 287.64,
   'top': 37.452,
   'x1': 397.596,
   'bottom': 49.452},
  {'text': 'Depletion',
   'x0': 45.36504000000002,
   'top': 60.711000000000126,
   'x1': 90.97507800000002,
   'bottom': 70.73100000000011},
  {'text': 'Allowance:',
   'x0': 93.73458600000002,
   'top': 60.711000000000126,
   'x1': 146.56303199999996,
   'bottom': 70.73100000000011},
  {'text': 'None.',
   'x0': 149.27244000000002,
   'top': 60.199980000000096,
   'x1': 175.99978800000002,
   'bottom': 70.21998000000008},
  {'text': 'Government',
   'x0': 45.36,
   'top': 82.66751999999997,
   'x1': 104.318682,
   'bottom': 92.68751999999995},
  {'text': 'Stockpile:',
   'x0': 107.04211

In [60]:
# Keep just the list stored under the 'content' key of the reloaded JSON
word_positions = test_abrassive['content']

# Display the list as the cell output
word_positions

[{'text': '31',
  'x0': 567.1029897,
  'top': 27.97199999999998,
  'x1': 575.9989896999999,
  'bottom': 35.97199999999998},
 {'text': 'ABRASIVES',
  'x0': 214.32,
  'top': 37.452,
  'x1': 284.30400000000003,
  'bottom': 49.452},
 {'text': '(MANUFACTURED)',
  'x0': 287.64,
  'top': 37.452,
  'x1': 397.596,
  'bottom': 49.452},
 {'text': 'Depletion',
  'x0': 45.36504000000002,
  'top': 60.711000000000126,
  'x1': 90.97507800000002,
  'bottom': 70.73100000000011},
 {'text': 'Allowance:',
  'x0': 93.73458600000002,
  'top': 60.711000000000126,
  'x1': 146.56303199999996,
  'bottom': 70.73100000000011},
 {'text': 'None.',
  'x0': 149.27244000000002,
  'top': 60.199980000000096,
  'x1': 175.99978800000002,
  'bottom': 70.21998000000008},
 {'text': 'Government',
  'x0': 45.36,
  'top': 82.66751999999997,
  'x1': 104.318682,
  'bottom': 92.68751999999995},
 {'text': 'Stockpile:',
  'x0': 107.04211800000002,
  'top': 82.66751999999997,
  'x1': 154.89262799999997,
  'bottom': 92.68751999999995},

### test using gutt