In [31]:
%load_ext autoreload
%autoreload 2


In [16]:
import re
from typing import Optional, Tuple

def extract_number(input_str: str) -> Optional[Tuple[float, str]]:
    """
    Extract the first numeric value found in a string.

    Handles integers, floats, thousand separators, an optional sign, an optional
    leading '$', a scale suffix ('k' = thousand, 'm' = million, case-insensitive)
    and a trailing percent sign. The scale suffix is applied to the returned number
    AND reported in the second tuple element.

    Args:
        input_str (str): The input string. Non-string values yield None.

    Returns:
        Optional[Tuple[float, str]]: ``(number, other_characters)`` where
        ``other_characters`` is the percent sign (if any) followed by the
        lower-cased scale letter (if any); None when the string contains no digits.
    """
    if not isinstance(input_str, str):
        return None

    # Every alternative of <number> requires at least one digit, so the search
    # skips leading text ("Revenue is $...") instead of matching the empty string.
    pattern = r"""
        (?:(?<!\w)(?P<sign>[-+])\s*)?        # Optional sign, ignored when glued to a preceding word
        (?:\$\s*)?                           # Optional currency symbol (consumed, not reported)
        (?P<number>
            \d{1,3}(?:,\d{3})+(?:\.\d+)?     # Thousand-separated integer, optional decimals
            |\d+(?:\.\d+)?                   # Plain integer of any length, optional decimals
            |\.\d+                           # Decimal without a leading integer part
        )
        (?:(?P<scale>[kKmM])(?![A-Za-z]))?   # Optional scale letter, not the start of a longer word
        (?:\s*(?P<percent>%))?               # Optional percent sign
    """

    match = re.search(pattern, input_str, re.VERBOSE)
    if match is None:
        return None

    sign = match.group("sign") or ''
    scale = (match.group("scale") or '').lower()
    percent = match.group("percent") or ''

    # The pattern guarantees a float()-parsable token once the commas are removed.
    number = float(sign + match.group("number").replace(',', ''))

    # Adjust number based on scale
    if scale == 'k':
        number *= 1_000
    elif scale == 'm':
        number *= 1_000_000

    # Same ordering as before: percent sign first, then the scale letter.
    other_characters = f"{percent}{scale}"

    return number, other_characters


In [17]:
# Sample table: three header rows (blank first cell) followed by three
# labelled rows whose remaining cells are percentage strings or blanks.
table = [
    ['', 'Domestic', '', 'International', ''],
    ['', 'September 30,', '', 'September 30,', ''],
    ['', '2019', '2018', '2019', '2018'],
    ['Discount rate', '4.00%', '3.75%', '1.90%', '2.80%'],
    ['Expected return on plan assets', '', '', '3.40%', '3.70%'],
    ['Rate of compensation increase', '', '', '- - %', '- - %'],
]


In [18]:
def fill_column_headers(row):
    """
    Spread each header label of a header row across the columns it spans.

    A row is treated as a header row when its first cell is ''. The cells after
    the first are split into equally sized groups (one group per non-empty
    label); every cell of a group is set to the first non-empty label found
    anywhere in that group. Groups without any label are left untouched.

    Args:
        row (list): One table row. It is modified in place.

    Returns:
        list: The same ``row`` object. Rows that are empty, that start with a
        non-empty cell, or that contain no label at all are returned unchanged.
    """
    if not row or row[0] != '':
        return row

    col_num = sum(1 for cell in row if cell != '')
    if col_num == 0:
        # Nothing to propagate (and avoids dividing by zero below).
        return row

    step_size = (len(row) - 1) // col_num
    for i in range(col_num):
        start = 1 + i * step_size
        # Look at every cell of the group, not only the first one, so labels
        # that sit in the middle of their span are found as well.
        col_name = next(
            (row[start + j] for j in range(step_size) if row[start + j] != ''),
            None,
        )
        if col_name is None:
            continue
        for j in range(step_size):
            row[start + j] = col_name
    return row

# Show every row after header propagation; fill_column_headers returns the
# row it was given, so the rows of `table` are updated as a side effect.
for idx, r in enumerate(table):
    filled_row = fill_column_headers(r)
    print(filled_row)

['', 'Domestic', 'Domestic', 'International', 'International']
['', 'September 30,', 'September 30,', 'September 30,', 'September 30,']
['', '2019', '2018', '2019', '2018']
['Discount rate', '4.00%', '3.75%', '1.90%', '2.80%']
['Expected return on plan assets', '', '', '3.40%', '3.70%']
['Rate of compensation increase', '', '', '- - %', '- - %']


In [19]:
def fill_table_headers(table):
    """
    Locate the header rows of a table and propagate their labels.

    The table is scanned from the bottom; the first row met whose first cell is
    '' is taken as the last header row. Every row from the top down to that row
    is passed through ``fill_column_headers`` and stored back into ``table``.

    Args:
        table (list): List of rows (lists of cell strings). Modified in place.

    Returns:
        tuple: ``(table, first_value_col_idx, first_value_row_idx)`` where
        ``first_value_col_idx`` is always 1 and ``first_value_row_idx`` is the
        index of the first row below the header rows (0 when the table has no
        header row or is empty).
    """
    first_value_col_idx = 1
    first_value_row_idx = 0  # default: no header rows found

    # Walk upwards using real row indices; the position inside reversed(table)
    # would count data rows instead of giving the header boundary.
    for idx in range(len(table) - 1, -1, -1):
        row = table[idx]
        if row and row[0] == '':  ## header end
            first_value_row_idx = idx + 1
            break

    for idx in range(first_value_row_idx):
        table[idx] = fill_column_headers(table[idx])

    return (table, first_value_col_idx, first_value_row_idx)
    
# Run header propagation on `table`; the cell displays the returned
# (table, first_value_col_idx, first_value_row_idx) tuple.
fill_table_headers(table)

([['', 'Domestic', 'Domestic', 'International', 'International'],
  ['', 'September 30,', 'September 30,', 'September 30,', 'September 30,'],
  ['', '2019', '2018', '2019', '2018'],
  ['Discount rate', '4.00%', '3.75%', '1.90%', '2.80%'],
  ['Expected return on plan assets', '', '', '3.40%', '3.70%'],
  ['Rate of compensation increase', '', '', '- - %', '- - %']],
 1,
 3)

In [25]:
def convert_table(table):
    """
    Flatten a table into one record per numeric cell.

    Header labels are first propagated with ``fill_table_headers``. Each cell
    in the value area is then parsed with ``extract_number``; cells for which
    it returns None are skipped.

    Args:
        table (list): List of rows (lists of cell strings).

    Returns:
        list: Dicts with keys 'number_value' (the parsed number), 'scale' (the
        second element returned by ``extract_number``) and 'metadata' (column
        header cells from the nearest header row upwards, followed by the
        row-label cells from right to left).
    """
    table, first_value_col_idx, first_value_row_idx = fill_table_headers(table)

    records = []
    for row_idx in range(first_value_row_idx, len(table)):
        current_row = table[row_idx]
        # Column count is taken from the first row, as for every data row.
        for col_idx in range(first_value_col_idx, len(table[0])):
            parsed = extract_number(current_row[col_idx])
            if parsed is None:
                continue
            number, other_chars = parsed

            # Header cells above this column, nearest header row first.
            column_labels = [
                table[header_row][col_idx]
                for header_row in range(first_value_row_idx - 1, -1, -1)
            ]
            # Label cells to the left of the value area, right-most first.
            row_labels = [
                current_row[label_col]
                for label_col in range(first_value_col_idx - 1, -1, -1)
            ]

            records.append({
                'number_value': number,
                'scale': other_chars,
                'metadata': column_labels + row_labels,
            })
    return records
# Flatten `table` with the notebook-local convert_table; the returned list of
# records is displayed as the cell output.
convert_table(table)

[{'number_value': 4.0,
  'scale': '%',
  'metadata': ['2019', 'September 30,', 'Domestic', 'Discount rate']},
 {'number_value': 3.75,
  'scale': '%',
  'metadata': ['2018', 'September 30,', 'Domestic', 'Discount rate']},
 {'number_value': 1.9,
  'scale': '%',
  'metadata': ['2019', 'September 30,', 'International', 'Discount rate']},
 {'number_value': 2.8,
  'scale': '%',
  'metadata': ['2018', 'September 30,', 'International', 'Discount rate']},
 {'number_value': 3.4,
  'scale': '%',
  'metadata': ['2019',
   'September 30,',
   'International',
   'Expected return on plan assets']},
 {'number_value': 3.7,
  'scale': '%',
  'metadata': ['2018',
   'September 30,',
   'International',
   'Expected return on plan assets']}]

In [27]:
import table_convert

# Same conversion through the imported `table_convert` module (presumably the
# module copy of the notebook functions — TODO confirm against the module).
table_convert.convert_table(table)

[{'number_value': 4.0,
  'scale': '%',
  'metadata': ['2019', 'September 30,', 'Domestic', 'Discount rate']},
 {'number_value': 3.75,
  'scale': '%',
  'metadata': ['2018', 'September 30,', 'Domestic', 'Discount rate']},
 {'number_value': 1.9,
  'scale': '%',
  'metadata': ['2019', 'September 30,', 'International', 'Discount rate']},
 {'number_value': 2.8,
  'scale': '%',
  'metadata': ['2018', 'September 30,', 'International', 'Discount rate']},
 {'number_value': 3.4,
  'scale': '%',
  'metadata': ['2019',
   'September 30,',
   'International',
   'Expected return on plan assets']},
 {'number_value': 3.7,
  'scale': '%',
  'metadata': ['2018',
   'September 30,',
   'International',
   'Expected return on plan assets']}]

In [71]:
# Second sample table: two header rows (blank first cell) followed by three
# labelled rows; two of those rows hold '$'-prefixed amounts.
table = [
    ['', '', 'Years Ended September 30,', ''],
    ['', '2019', '2018', '2017'],
    ['Fixed Price', '$  1,452.4', '$  1,146.2', '$  1,036.9'],
    ['Other', '44.1', '56.7', '70.8'],
    ['Total sales', '$1,496.5', '$1,202.9', '$1,107.7'],
]

# Convert the table defined just above via the `table_convert` module.
# NOTE(review): the recorded output contains records for the 'Other' row only;
# none of the '$'-prefixed cells produced a record.
table_convert.convert_table(table)


i: 2
1
NaN
2
NaN
3
NaN
i: 3
1
2
3
i: 4
1
NaN
2
NaN
3
NaN


[{'number_value': 44.1,
  'scale': '',
  'metadata': ['2019', 'Years Ended September 30,', 'Other']},
 {'number_value': 56.7,
  'scale': '',
  'metadata': ['2018', 'Years Ended September 30,', 'Other']},
 {'number_value': 70.8,
  'scale': '',
  'metadata': ['2017', 'Years Ended September 30,', 'Other']}]

In [52]:
# Convert whatever `table` is currently bound to.
# NOTE(review): the recorded output matches the Domestic/International rates
# table, so this cell's result depends on the order the cells were executed in.
table_convert.convert_table(table)

[{'number_value': 4.0,
  'scale': '%',
  'metadata': ['2019', 'September 30,', 'Domestic', 'Discount rate']},
 {'number_value': 3.75,
  'scale': '%',
  'metadata': ['2018', 'September 30,', 'Domestic', 'Discount rate']},
 {'number_value': 1.9,
  'scale': '%',
  'metadata': ['2019', 'September 30,', 'International', 'Discount rate']},
 {'number_value': 2.8,
  'scale': '%',
  'metadata': ['2018', 'September 30,', 'International', 'Discount rate']},
 {'number_value': 3.4,
  'scale': '%',
  'metadata': ['2019',
   'September 30,',
   'International',
   'Expected return on plan assets']},
 {'number_value': 3.7,
  'scale': '%',
  'metadata': ['2018',
   'September 30,',
   'International',
   'Expected return on plan assets']}]

In [64]:
# Display the current value of `table` (it may have been modified in place by
# earlier header-filling calls).
table

[['', '', 'Years Ended September 30,', ''],
 ['', '2019', '2018', '2017'],
 ['Fixed Price', '$  1,452.4', '$  1,146.2', '$  1,036.9'],
 ['Other', '44.1', '56.7', '70.8'],
 ['Total sales', '$1,496.5', '$1,202.9', '$1,107.7']]

In [51]:
# Rebind `table` to the rates sample: three header rows (blank first cell)
# followed by three labelled rows of percentage strings or blanks.
table = [
    ['', 'Domestic', '', 'International', ''],
    ['', 'September 30,', '', 'September 30,', ''],
    ['', '2019', '2018', '2019', '2018'],
    ['Discount rate', '4.00%', '3.75%', '1.90%', '2.80%'],
    ['Expected return on plan assets', '', '', '3.40%', '3.70%'],
    ['Rate of compensation increase', '', '', '- - %', '- - %'],
]
# Header propagation through the `table_convert` module; the recorded output is
# a (table, 1, 3) tuple with the header labels spread across their columns.
table_convert.fill_table_headers(table)

([['', 'Domestic', 'Domestic', 'International', 'International'],
  ['', 'September 30,', 'September 30,', 'September 30,', 'September 30,'],
  ['', '2019', '2018', '2019', '2018'],
  ['Discount rate', '4.00%', '3.75%', '1.90%', '2.80%'],
  ['Expected return on plan assets', '', '', '3.40%', '3.70%'],
  ['Rate of compensation increase', '', '', '- - %', '- - %']],
 1,
 3)

In [76]:
# Probe the module's extract_number with a currency string embedded in text.
# NOTE(review): no output was recorded for this cell, which would be consistent
# with a None return — TODO confirm.
table_convert.extract_number("Revenue is $1.2m this year.")

In [93]:
# Demo calls; each comment gives the value shown in the recorded cell output.
# The scale suffix is reported in the second tuple element, not applied.
print(extract_number("Revenue is $1,234,567.89m this year."))
# Recorded output: (1234567.89, 'm')

print(extract_number("Profit: 25,500.75k%"))  
# Recorded output: (25500.75, '%k')  -- percent sign is placed before the scale letter

print(extract_number("Loss: -1,500.25k"))  
# Recorded output: (-1500.25, 'k')

print(extract_number("Gain: +3,200,000.50m"))  
# Recorded output: (3200000.5, 'm')

print(extract_number("Invalid string"))  
# Recorded output: None



(1234567.89, 'm')
(25500.75, '%k')
(-1500.25, 'k')
(3200000.5, 'm')
None


In [92]:
import re
from typing import Optional, Tuple

def extract_number(input_str: str) -> Optional[Tuple[float, str]]:
    """
    Extract the first numeric value found in a string.

    Recognises integers, floats, thousand separators and a directly attached
    sign. A scale letter ('k'/'K'/'m'/'M') immediately after the digits and a
    following percent sign are reported in the second tuple element; the scale
    is NOT applied to the number.

    Args:
        input_str (str): The input string. Non-string values yield None.

    Returns:
        Optional[Tuple[float, str]]: ``(number, other_characters)`` where
        ``other_characters`` is the percent sign (if any) followed by the scale
        letter as written (if any); None when the string contains no digits.
    """
    if not isinstance(input_str, str):
        return None

    # Define a regex pattern for matching numbers
    pattern = r"""
        (?P<number>
            [-+]?                              # Optional sign
            (?:
                \d{1,3}(?:,\d{3})+(?:\.\d+)?   # Thousand-separated integer, optional decimals
                |
                \d+(?:\.\d+)?                  # Plain integer of any length, optional decimals
                |
                \.\d+                          # Decimal without a leading integer part
            )
        )
        (?P<scale>[kKmM]?)                     # Optional scale: k (thousand), m (million)
        (?P<percent>\s*%?)                     # Optional percentage symbol
    """

    # Compile the regex with the verbose flag
    regex = re.compile(pattern, re.VERBOSE)

    match = regex.search(input_str)
    if not match:
        return None

    # The grouped alternative requires at least one ",ddd" group, so a plain
    # run of digits such as "2019" falls through to the second alternative
    # and is captured whole instead of being cut after three digits.
    number = float(match.group("number").replace(',', ''))
    scale = match.group("scale") or ''
    percent = match.group("percent") or ''

    # <percent> may hold only trailing whitespace; strip() removes it.
    other_characters = f"{percent}{scale}".strip()

    return number, other_characters
