In [44]:
import pandas as pd 
from bs4 import BeautifulSoup
import os 
import re 
def extract_numeric_value(text):
    """Extract and normalize a numeric value, handling negative numbers in various formats"""
    if not text:
        return None
        
    # Clean the text
    cleaned_text = text.replace('$', '').replace(',', '').strip()
    
    # Handle case where opening parenthesis is present but closing one is missing
    # This happens when parentheses are split across cells
    if cleaned_text.startswith('(') and not cleaned_text.endswith(')'):
        match = re.search(r'\(?([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle normal parentheses case
    if '(' in cleaned_text and ')' in cleaned_text:
        # Extract the number inside parentheses
        match = re.search(r'\(([\d\.]+)\)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle numbers with explicit negative signs
    if re.search(r'^\s*[\-−–]', cleaned_text):  # Handle various dash characters
        match = re.search(r'([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Special case for closing parenthesis only - ignore it
    if cleaned_text == ')':
        return None
    
    # Normal number extraction
    match = re.search(r'([\d\.]+)', cleaned_text)
    if match:
        return match.group(1)
    
    return None
def check_eps_pattern(text):
    """Expanded pattern checking for EPS-related text"""
    text = text.lower().strip()
    patterns = [
        r'(?:basic|diluted)?\s*earnings\s*(?:\(loss\))?\s*per\s*(?:common|outstanding)?\s*share',
        r'(?:basic|diluted)?\s*loss\s*per\s*(?:common|outstanding)?\s*share',
        r'earnings\s*\(loss\)\s*per\s*(?:common|outstanding)?\s*share',
        r'net\s*(?:income|loss|earnings)\s*(?:attributable\s*to\s*[a-z\s]+)?\s*per\s*share',
        r'income\s*\(loss\)\s*per\s*share',
        r'\beps\b',
       # r'per\s*(?:common|outstanding)?\s*share',
        r'earnings\s*per\s*share'
    ]
    
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            # Exclude weighted average share patterns
            if re.search(r'weighted|average|shares\s*outstanding', text, re.IGNORECASE):
                continue
            return True
    
    return False
def is_basic_eps(text):
    """Check if text refers to basic EPS (not diluted)"""
    try:
        text = text.lower()
        has_basic = bool(re.search(r'\bbasic\b', text))
        has_diluted = bool(re.search(r'\bdiluted\b', text))
        return has_basic,has_diluted
    except Exception as e:
        print(f"Error in is_basic_eps: {e}")
        return False
def is_gaap_eps(text):
    """Check if text refers to GAAP (not non-GAAP/adjusted) EPS"""
    text = text.lower()
    return not re.search(r'non-gaap|non\s*gaap|adjusted', text)


In [45]:
file_path = 'Training_Filings/0000066570-20-000013.html'
print(file_path)
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
    html = f.read()
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')
result = []
for table in tables:
    # Look for rows containing EPS terms
    rows = table.find_all('tr')
    for i, row in enumerate(rows):
        basic,diluted = None
        row_text = row.get_text().lower().strip()
        if check_eps_pattern(row_text):
            # Look for cells containing a value
            cells = row.find_all('td')
            print(f"Found EPS pattern in row: {row_text[:100]}...")
            for cell in cells:
                value = extract_numeric_value(cell.get_text())
                if value is not None: 
                    ###RETURN BASIC AND DILUTED AND VALUE WE WANT THE FIRST VALUE OF A ROW I THINK 
                    print(value)
                    basic,diluted = is_basic_eps(row_text)
                    eps_values.append({
                        'basic': basic,
                        'diluted': diluted,
                        'value': numeric_value
                    })
                else:
                    next_row = rows[i + 1]
                    next_cells = next_row.find_all('td')
                    print(f"No values found in current row, checking next row...")
                    for cell in next_cells:
                        value = extract_numeric_value(cell.get_text())
                        if value is not None:
                            print(f"Found value in next row: {value}")
            
                    


Training_Filings/0000066570-20-000013.html
Found EPS pattern in row: earnings per share attributable to msa safety incorporated common shareholders:...
No values found in current row, checking next row...
Found value in next row: 1.12
Found value in next row: 0.60
No values found in current row, checking next row...
Found value in next row: 1.12
Found value in next row: 0.60
No values found in current row, checking next row...
Found value in next row: 1.12
Found value in next row: 0.60
No values found in current row, checking next row...
Found value in next row: 1.12
Found value in next row: 0.60


In [77]:
file_path = 'Training_Filings/0000066570-20-000013.html'
print(file_path)

# Initialize the results list
eps_values = []

with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')

for table_idx, table in enumerate(tables):
    # Get all rows for sequential access
    rows = table.find_all('tr')
    
    for i, row in enumerate(rows):
        row_text = row.get_text().lower().strip()
        
        if check_eps_pattern(row_text):
            # Look for cells containing a value in current row
            cells = row.find_all('td')
            print(f"Found EPS pattern in row: {row_text[:100]}...")
            
            # Get basic/diluted classification
            basic, diluted = is_basic_eps(row_text)
            
            # Get GAAP classification
            gaap = is_gaap_eps(row_text)
            
            # Check if this row has values
            found_value = False
            row_values = []
            
            for cell in cells:
                value = extract_numeric_value(cell.get_text())
                if value is not None:
                    found_value = True
                    row_values.append({
                        'value': value,
                        'basic': basic,
                        'diluted': diluted,
                        'gaap': gaap
                    })
                    print(f"Found value in current row: {value}")
            
            # If no values found in current row, check the next row if available
            if not found_value and i + 1 < len(rows):
                next_row = rows[i + 1]
                next_cells = next_row.find_all('td')
                next_row_text = next_row.get_text().lower().strip()
                basic, diluted = is_basic_eps(next_row_text)
                gaap = is_gaap_eps(next_row_text)
                print(f"No values found in current row, checking next row...")
                
                for cell in next_cells:
                    value = extract_numeric_value(cell.get_text())
                    if value is not None:
                        row_values.append({
                            'value': value,
                            'basic': basic,
                            'diluted': diluted,
                            'gaap': gaap
                        })
                        print(f"Found value in next row: {value}")
            
            # If we found at least one value (in either current or next row), record it
            if row_values:
                # Default to the first value entry
                selected_entry = row_values[0]
                
                # First try to find basic EPS
                basic_values = [item for item in row_values if item['basic']]
                if basic_values:
                    selected_entry = basic_values[0]
                    print(f"Selected basic EPS value: {selected_entry['value']}")
                else:
                    # If no basic found, try diluted
                    diluted_values = [item for item in row_values if item['diluted']]
                    if diluted_values:
                        selected_entry = diluted_values[0]
                        print(f"No basic EPS found, using diluted: {selected_entry['value']}")
                    else:
                        print(f"No specific classification found, using first value: {selected_entry['value']}")
                
                # Extract just the values for cleaner output
                value_list = [item['value'] for item in row_values]
                
                eps_values.append({
                    'table_idx': table_idx,
                    'row_text': row_text[:100],  # Truncate for readability
                    'basic': selected_entry['basic'],
                    'diluted': selected_entry['diluted'],
                    'gaap': selected_entry['gaap'],
                    'value': selected_entry['value'],  # Prioritized value
                    'all_values': value_list
                })


Training_Filings/0000066570-20-000013.html
Found EPS pattern in row: earnings per share attributable to msa safety incorporated common shareholders:...
No values found in current row, checking next row...
Found value in next row: 1.12
Found value in next row: 0.60
Selected basic EPS value: 1.12


In [78]:
eps_values

[{'table_idx': 5,
  'row_text': 'earnings per share attributable to msa safety incorporated common shareholders:',
  'basic': True,
  'diluted': False,
  'gaap': True,
  'value': '1.12',
  'all_values': ['1.12', '0.60']}]

Unnamed: 0,0,1,2,3,4,5,6,7
26,Earnings per share attributable to MSA Safety ...,,,,,,,
27,Basic,$,1.12,,,$,0.6,
28,Diluted,$,1.11,,,$,0.59,
