In [1]:
import json
import os

import pandas as pd
import pdfplumber

os.getcwd()

'c:\\Users\\joost\\ReposWindows\\bedrock-web\\sandbox'

In [2]:
df1 = pd.read_csv('ags3_groups_and_headings.tsv', sep='\t')
df2 = pd.read_csv('ags3_groups_and_headings2.tsv', sep='\t')

if df1.equals(df2):
    print("The files are identical.")
else:
    print("The files are different.")
    print(df1.compare(df2))

The files are different.


ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects

In [4]:
df1

Unnamed: 0,group_name,contents,parent_group
0,ABBR,Abbreviation Definitions,-
1,BKFL,Backfill Details,HOLE
2,CBRG,CBR Test - General,SAMP
3,CBRT,CBR Test,CBRG
4,CDIA,Casing Diameter by Depth,HOLE
...,...,...,...
69,TRIG,Triaxial Test - General,SAMP
70,TRIX,Triaxial Test,TRIG
71,UNIT,Definition of <UNITS> and CNMT_UNIT,-
72,WETH,Weathering Grades,HOLE


In [5]:
df2

Unnamed: 0,group_name,contents,parent_group
0,ABBR,Abbreviation Definitions,-
1,BKFL,Backfill Details,HOLE
2,CBRG,CBR Test - General,SAMP
3,CBRT,CBR Test,CBRG
4,CDIA,Casing Diameter by Depth,HOLE
...,...,...,...
70,TRIG,Triaxial Test - General,SAMP
71,TRIX,Triaxial Test,TRIG
72,UNIT,Definition of Units,-
73,WETH,Weathering Grades,HOLE


In [3]:
df1.compare(df2)

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects

In [8]:
# Concatenate the two DataFrames
df_concat = pd.concat([df1, df2])

# Find the duplicate rows
duplicates = df_concat.duplicated(keep=False)

# Find the rows that are not duplicates (i.e., the rows that are unique to one DataFrame)
unique_rows = df_concat[~duplicates]

In [9]:
unique_rows

Unnamed: 0,group_name,contents,parent_group
32,HPGI,Horizontal Profile Gauge Installation Details,HOLE
37,IFID,On Site Volatile Headspace Testing Using Flame...,HOLE
38,INST,Single Point Instrument Installation Details,HOLE
71,UNIT,Definition of <UNITS> and CNMT_UNIT,-
32,HPGI,Horizontal Profile Gauge Installation,HOLE
37,IFID,On Site Volatile Headspace Testing (FID),HOLE
38,INST,Single Point Instrument Installation,HOLE
40,IPID,On Site Volatile Headspace Testing (PID),HOLE
72,UNIT,Definition of Units,-


In [9]:
def extract_ags3_data_dict_table(table):
    headings = []
    for row in table[2:]:  # Skip first 2 rows: 1st = title, 2nd = headings
        headings.append(
            {
                "status": None if row[0] == "" else row[0].strip(),
                "heading": row[1].strip(),
                "unit": None if row[2] == "" else row[2].strip().replace("\n", " "),
                "description": row[3].strip().replace("\n", " "),
                "example": None if row[4] == "" else row[4].strip().replace("\n", " "),
            }
        )
    return headings


def extract_ags4_data_dict_table(table):
    # Skip rows that don't contain data
    for i, row in enumerate(table):
        if "Suggested\nUnit / Type" in row or "Unit / Type" in row:
            first_data_row = i + 1
            break

    headings = []
    for row in table[first_data_row:]:
        row = [x for x in row if x is not None]
        headings.append(
            {
                "status": None if row[0] == "" else row[0].strip(),
                "heading": row[1].strip(),
                "unit": None if row[2] == "" else row[2].strip().replace("\n", ""),
                "type": row[3].strip(),
                "description": row[4].strip().replace("\n", " "),
                "example": None if row[5] == "" else row[5].strip().replace("\n", " "),
            }
        )
    return headings

In [10]:
ags_version = 3  # AGS version 3 or 4
pdf_dict = {
    3: {"pdf_file": "AGS3_v3-1-2005.pdf", "from_page": 22, "to_page": 68},
    4: {"pdf_file": "AGS4-v4-1-1-2022.pdf", "from_page": 18, "to_page": 159},
}


In [11]:
pdf_file, from_page, to_page = pdf_dict[ags_version].values()

# List to store extracted data for each group
extracted_data = []
previous_group_name = ""
with pdfplumber.open(pdf_file) as pdf:
    # Adjust the page range based on where the tables are located
    for page_number in range(from_page, to_page):
        page = pdf.pages[page_number - 1]  # pdfplumber is 0-based, so subtract 1
        tables_on_current_page = page.extract_tables()  # Extract tables from the page

        # Iterate through all tables found on the page
        for table in tables_on_current_page:
            if ags_version == 3:
                table_title = table[0][0].strip()  # Get table title from AGS3
            elif ags_version == 4:
                table_title = table[0][1].strip()  # Get table title from AGS4
            print(table_title)

            parts = table_title.split(": ", 1)  # Split on the first occurrence of ': '
            if "Group Name" in parts[0]:
                group_name = parts[1].split(" - ")[0]
                group_description = " - ".join(parts[1].split(" - ")[1:])
                group_description = group_description.replace("\n", " ")
                if ags_version == 3:
                    headings = extract_ags3_data_dict_table(table)
                elif ags_version == 4:
                    headings = extract_ags4_data_dict_table(table)

                if group_name == previous_group_name:
                    extracted_data[-1]["headings"].extend(headings)
                else:
                    extracted_data.append(
                        {
                            "group_name": group_name,
                            "group_description": group_description,
                            "headings": headings,
                        }
                    )
                previous_group_name = group_name


Group Name : PROJ - Project Information
Group Name : ABBR - Abbreviation Definitions
Group Name : ?BKFL - Backfill Details
Group Name : CBRG - CBR Test – General
Group Name : CBRT - CBR Test
Group Name : CDIA - Casing Diameter by Depth
Group Name : CHEM - Chemical Tests
N.B. Provision for reporting of this data is included in groups CNMT and ?ICCT. These groups should be used as
appropriate in preference.
Group CHEM will be deleted from future editions.
Group Name : CHIS - Chiselling Details
Group Name : CHLK - Chalk Tests
Group Name : CLSS - Classification Tests
Group Name : CMPG - Compaction Tests - General
Group Name : CMPT - Compaction Tests
Group Name : CNMT - Contaminant and Chemical Testing
NB. in situ measurement and monitoring of contamination and chemicals shoud be recorded in Group ?ICCT
Group Name : CODE - Chemical Testing Codes
Group Name : CONG - Consolidation Test - General
Group Name : CONS - Consolidation Test
Group Name : CORE - Rotary Core Information
Group Name : DE

In [12]:
# Save the extracted data to a JSON file
with open(f"ags{ags_version}_data_dict_p{from_page}-{to_page}.json", "w") as json_file:
    json.dump(extracted_data, json_file, indent=2)

In [13]:
f"The number of groups in the AGS{ags_version} data dictionary is {len(extracted_data)}"

'The number of groups in the AGS3 data dictionary is 73'

In [10]:
extracted_data

[{'group_name': 'RELD',
  'group_description': 'Relative Density Test',
  'headings': [{'status': '*',
    'heading': 'HOLE_ID',
    'unit': None,
    'description': 'Exploratory hole or location equivalent',
    'example': '6431/A'},
   {'status': '*',
    'heading': 'SAMP_TOP',
    'unit': 'm',
    'description': 'Depth to TOP of test sample',
    'example': '8.50'},
   {'status': '*',
    'heading': 'SAMP_REF',
    'unit': None,
    'description': 'Sample reference number',
    'example': '16'},
   {'status': '*',
    'heading': 'SAMP_TYPE',
    'unit': None,
    'description': 'Sample type',
    'example': 'LB (See Appendix 1)'},
   {'status': '*',
    'heading': 'SPEC_REF',
    'unit': None,
    'description': 'Specimen reference number',
    'example': '2'},
   {'status': '*',
    'heading': 'SPEC_DPTH',
    'unit': 'm',
    'description': 'Specimen depth',
    'example': '8.50'},
   {'status': None,
    'heading': 'RELD_REM',
    'unit': None,
    'description': 'Method of test'