In [1]:
import tatqa_utils
import pandas as pd

In [2]:
devdf = pd.read_json('dataset_raw/tatqa_dataset_dev.json')

In [3]:
for i in range(10):
    print(devdf.loc(0)[i]['table']['table'])

[['', '', 'Years Ended September 30,', ''], ['', '2019', '2018', '2017'], ['Fixed Price', '$  1,452.4', '$  1,146.2', '$  1,036.9'], ['Other', '44.1', '56.7', '70.8'], ['Total sales', '$1,496.5', '$1,202.9', '$1,107.7']]
[['', '', 'Fiscal', ''], ['', '2019', '2018', '2017'], ['', '', '(in millions)', ''], ['Transportation Solutions:', '', '', ''], ['Automotive', '$ 5,686', '$ 6,092', '$  5,228'], ['Commercial transportation', '1,221', '1,280', '997'], ['Sensors', '914', '918', '814'], ['Total Transportation Solutions', '7,821', '8,290', '7,039'], ['Industrial Solutions:', '', '', ''], ['Industrial equipment', '1,949', '1,987', '1,747'], ['Aerospace, defense, oil, and gas', '1,306', '1,157', '1,075'], ['Energy', '699', '712', '685'], ['Total Industrial Solutions', '3,954', '3,856', '3,507'], ['Communications Solutions:', '', '', ''], ['Data and devices', '993', '1,068', '963'], ['Appliances', '680', '774', '676'], ['Total Communications Solutions', '1,673', '1,842', '1,639'], ['Total', 

In [4]:
import re

def extract_table_data(table):
    # Helper function to check if a string contains a numeric value
    def contains_number(s):
        return bool(re.search(r'\d', s))
    
    # Identify header rows (rows with years or numeric values)
    header_rows = []
    for i, row in enumerate(table):
        if any(contains_number(cell) for cell in row):
            header_rows.append(i)
    
    if len(header_rows) < 1:
        print("Invalid table format")
        return []
    
    # Extract metadata from the header rows
    years_row = table[header_rows[0]]  # Assume the first header row with numbers is the years row
    metadata_row = table[header_rows[1]] if len(header_rows) > 1 else None  # Optional second metadata row
    category_row = table[header_rows[2]] if len(header_rows) > 2 else None  # Optional third metadata row

    years = [cell.strip() for cell in years_row if contains_number(cell)]  # Extract years
    metadata = [cell.strip() for cell in metadata_row] if metadata_row else [''] * len(years)  # Extract metadata, if present

    extracted_data = []
    
    # Iterate through the data rows below the headers
    for row in table[header_rows[-1] + 1:]:
        category = row[0].strip()  # First cell is the category
        values = row[1:]           # The rest are values
        
        # Skip empty rows
        if not any(values):
            continue
        
        # Combine category and value, adding appropriate year and metadata
        for i, (year, meta, value) in enumerate(zip(years, metadata, values)):
            value = value.strip()
            if value:  # Only add non-empty values
                extracted_data.append({
                    "Category": category,
                    "Year": year,
                    "Metadata": meta,
                    "Value": value
                })
    
    return extracted_data

In [9]:
for i in range(10):
    print(extract_table_data(devdf.loc(0)[i]['table']['table']))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [14]:
extract_table_data(devdf.loc(0)[i]['table']['table'])

[]

In [16]:
extract_header_hierarchy(devdf.loc(0)[i]['table']['table'])

([['Acquisition-related charges:',
   'Acquisition and integration costs',
   'Charges associated with the amortization of acquisition related fair value adjustments',
   'Restructuring and other charges, net',
   'Other items(1)',
   'Total'],
  ['2019', '$  27', '3', '30', '255', '17', '$  302'],
  ['Fiscal',
   '2018',
   '(in millions)',
   '$  14',
   '8',
   '22',
   '126',
   '—',
   '$  148']],
 [])

In [17]:
pd.DataFrame(devdf.loc(0)[i]['table']['table'])

Unnamed: 0,0,1,2
0,,,Fiscal
1,,2019,2018
2,,,(in millions)
3,Acquisition-related charges:,,
4,Acquisition and integration costs,$ 27,$ 14
5,Charges associated with the amortization of ac...,3,8
6,,30,22
7,"Restructuring and other charges, net",255,126
8,Other items(1),17,—
9,Total,$ 302,$ 148


In [15]:
def extract_header_hierarchy(table):
    # To store the hierarchy of headers
    horizontal_hierarchy = []
    vertical_hierarchy = []
    
    # Determine the number of rows and columns
    num_rows = len(table)
    num_cols = len(table[0]) if num_rows > 0 else 0
    
    # Extract horizontal headers (assumed to be at the top)
    horizontal_headers = []
    for row in table:
        if any(cell.strip() for cell in row):  # Consider non-empty rows
            horizontal_headers.append(row)
        else:
            break  # Stop when reaching the data rows

    # Build horizontal header hierarchy
    max_depth = len(horizontal_headers)
    for col_idx in range(num_cols):
        column_hierarchy = []
        for row_idx in range(max_depth):
            cell_value = horizontal_headers[row_idx][col_idx].strip()
            if cell_value:  # Only add non-empty values
                column_hierarchy.append(cell_value)
        horizontal_hierarchy.append(column_hierarchy)
    
    # Extract vertical headers (assumed to be in the first column or first few columns)
    for row in table[max_depth:]:  # Start after the horizontal header rows
        row_hierarchy = []
        for col_idx in range(num_cols):
            cell_value = row[col_idx].strip()
            if cell_value:  # Only add non-empty values
                row_hierarchy.append(cell_value)
            else:
                break  # Stop when empty cells are encountered in a row
        vertical_hierarchy.append(row_hierarchy)

    return horizontal_hierarchy, vertical_hierarchy

In [19]:
t = devdf.loc(0)[i]['table']['table']
t

[['', '', 'Fiscal'],
 ['', '2019', '2018'],
 ['', '', '(in millions)'],
 ['Acquisition-related charges:', '', ''],
 ['Acquisition and integration costs', '$  27', '$  14'],
 ['Charges associated with the amortization of acquisition related fair value adjustments',
  '3',
  '8'],
 ['', '30', '22'],
 ['Restructuring and other charges, net', '255', '126'],
 ['Other items(1)', '17', '—'],
 ['Total', '$  302', '$  148']]

In [32]:
import re
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))
    
nrows = len(t)
ncols =  len(t[0])

left_boundary = ncols - 1
top_boundary = nrows - 1

for i in reversed(range(0, nrows)):
    for j in reversed(range(0, ncols)):
        c = t[i][j]
        if not ( has_numbers(c) or c.strip() == '' or c == None or c == '-'):          
            break
    left_boundary = min(top_boundary, i)
    
top_boundary = 

left_boundary, top_boundary



(0, 0)

In [24]:
res = []
vhs = 1
hhs = 1
ncol = len(t[0])

for i, r in enumerate(t):
    if i == 0:
            
        print(c)
    



Fiscal

2019
2018


(in millions)
Acquisition-related charges:


Acquisition and integration costs
$  27
$  14
Charges associated with the amortization of acquisition related fair value adjustments
3
8

30
22
Restructuring and other charges, net
255
126
Other items(1)
17
—
Total
$  302
$  148


In [38]:
t =  devdf.loc(0)[0]['table']['table']
t

[['', '', 'Years Ended September 30,', ''],
 ['', '2019', '2018', '2017'],
 ['Fixed Price', '$  1,452.4', '$  1,146.2', '$  1,036.9'],
 ['Other', '44.1', '56.7', '70.8'],
 ['Total sales', '$1,496.5', '$1,202.9', '$1,107.7']]

In [39]:
pd.DataFrame(t)

Unnamed: 0,0,1,2,3
0,,,"Years Ended September 30,",
1,,2019,2018,2017
2,Fixed Price,"$ 1,452.4","$ 1,146.2","$ 1,036.9"
3,Other,44.1,56.7,70.8
4,Total sales,"$1,496.5","$1,202.9","$1,107.7"


In [63]:
import re

res = []
ncol = len(t[0])
hh_idxs = []
vh_idxs = []
hhs = 1
vhs = 1

def is_h_header(r):
    return r[0] == ''


def has_number(inputString):
    return bool(re.search(r'\d', inputString))
    
def is_v_header(r):
    return all([not has_number(c) for c in r])
    
for i, r in enumerate(t):
    if is_h_header(r): # 
        hh_idxs.append(i)    
        if i > 0 and (i - 1) in hh_idxs: 
            hhs = hhs + 1

tt = list(map(list, zip(*t[hhs:])))

for i, c in enumerate(tt):
    if is_v_header(c): # 
        vh_idxs.append(i)
        if i > 0 and (i - 1) in hh_idxs: 
            vhs = vhs + 1
             
hh_idxs, vh_idxs

([0, 1], [0])

In [48]:
pd.DataFrame(tt)

Unnamed: 0,0,1,2
0,Fixed Price,Other,Total sales
1,"$ 1,452.4",44.1,"$1,496.5"
2,"$ 1,146.2",56.7,"$1,202.9"
3,"$ 1,036.9",70.8,"$1,107.7"


In [57]:
[not has_number(c) for c in tt]

TypeError: expected string or bytes-like object

In [62]:
all([not has_number(c) for c in tt[0]])

True