In [47]:
import pandas as pd 
from bs4 import BeautifulSoup
import os 
import re 
def extract_numeric_value(text):
    """Extract the first numeric value from a string and normalise it.

    Dollar signs and commas (thousands separators) are removed before
    matching. A number wrapped in parentheses, or a string that starts with
    an ASCII hyphen or a Unicode minus sign (U+2212), is reported as negative.

    Args:
        text: Raw text, e.g. "$ 1,234.56", "(0.08)" or "-0.26".

    Returns:
        The number as a string ("1234.56", "-0.08", ...), or None when
        `text` is empty/None or contains no digits.
    """
    if not text:
        return None

    # Strip currency symbols and thousands separators.
    text = text.replace('$', '').replace(',', '').strip()

    # Digits with an optional fraction, or a bare fraction such as ".5".
    # Requiring at least one digit keeps stray punctuation ("Inc.", "...")
    # from being returned as if it were a value.
    number = r'\d+(?:\.\d+)?|\.\d+'

    # Accounting-style negative: "(0.08)", also when padded as "( 0.08 )".
    match = re.search(rf'\(\s*({number})\s*\)', text)
    if match:
        return f"-{match.group(1)}"

    match = re.search(number, text)
    if match is None:
        return None

    # A leading ASCII hyphen or Unicode minus marks a negative value.
    if re.match(r'\s*[\-−]', text):
        return f"-{match.group(0)}"
    return match.group(0)
def check_eps_pattern(text):
    """Return True when `text` matches one of the EPS label patterns.

    The text is lower-cased and stripped before matching. Any text that
    mentions "weighted", "average" or "shares outstanding" is rejected,
    regardless of which EPS pattern it would otherwise match.
    """
    normalized = text.lower().strip()

    # The exclusion does not depend on the individual pattern, so test it once.
    if re.search(r'weighted|average|shares\s*outstanding', normalized, re.IGNORECASE):
        return False

    eps_patterns = (
        r'(?:basic|diluted)?\s*earnings\s*(?:\(loss\))?\s*per\s*(?:common|outstanding)?\s*share',
        r'(?:basic|diluted)?\s*loss\s*per\s*(?:common|outstanding)?\s*share',
        r'earnings\s*\(loss\)\s*per\s*(?:common|outstanding)?\s*share',
        r'net\s*(?:income|loss|earnings)\s*(?:attributable\s*to\s*[a-z\s]+)?\s*per\s*share',
        r'income\s*\(loss\)\s*per\s*share',
        r'\beps\b',
        # Looser pattern, currently disabled:
        # r'per\s*(?:common|outstanding)?\s*share',
        r'earnings\s*per\s*share',
    )

    return any(
        re.search(candidate, normalized, re.IGNORECASE)
        for candidate in eps_patterns
    )
def is_basic_eps(text):
    """Return True if `text` contains the word "basic" but not "diluted".

    Matching is case-insensitive and on whole words. Any exception (for
    example a non-string argument) is reported with print() and treated
    as "not basic".
    """
    try:
        lowered = text.lower()
        # A mention of "diluted" disqualifies the text even if "basic" is present.
        if re.search(r'\bdiluted\b', lowered):
            return False
        return re.search(r'\bbasic\b', lowered) is not None
    except Exception as e:
        print(f"Error in is_basic_eps: {e}")
        return False
def is_gaap_eps(text):
    """Return True unless `text` mentions non-GAAP or adjusted figures.

    The check is case-insensitive and looks for "non-gaap", "non gaap"
    (any amount of whitespace, including none) or "adjusted".
    """
    non_gaap_marker = re.search(r'non-gaap|non\s*gaap|adjusted', text.lower())
    return non_gaap_marker is None


In [None]:
# Parse one filing and print every numeric <td> value found on rows whose
# text matches an EPS label.
file_path = 'Training_Filings/0001564590-20-019726.html'
print(file_path)

# errors='replace' substitutes undecodable bytes instead of raising.
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')

for table in tables:
    for row in table.find_all('tr'):
        row_text = row.get_text().lower()
        # Skip rows that are not EPS-labelled.
        if not check_eps_pattern(row_text):
            continue

        cells = row.find_all('td')
        for cell in cells:
            value = extract_numeric_value(cell.get_text())
            if value is None:
                continue
            # TODO (original note): return basic/diluted together with the value;
            # possibly only the first value of each row is wanted.
            print(f"Is basic{is_basic_eps(row_text)}")
            print(value)


Training_Filings/0001564590-20-019726.html
Is basicTrue
0.08
Is basicTrue
0.65
Is basicTrue
0.67
Is basicTrue
0.59
Is basicTrue
0.26
Is basicFalse
0.08
Is basicFalse
0.64
Is basicFalse
0.66
Is basicFalse
0.58
Is basicFalse
0.26
Is basicTrue
0.08
Is basicTrue
0.26
Is basicFalse
0.08
Is basicFalse
0.26


In [38]:
# Dump the stripped <td> text of every row in every table (no filtering),
# printing each row and collecting it in `rows` for later inspection.
tables = soup.find_all('table')
rows = []
for table in tables:
    print('NEW TABLE')
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        print(cells)
        # Without this append `rows` stays empty, and the cell that displays
        # `rows` would show [] on a fresh Restart & Run All.
        rows.append(cells)

NEW TABLE
['', '•', 'Revenue was $446.4 million, an 8.7% increase']
NEW TABLE
['', '•', 'Operating income was $38.8 million, a 35.4% increase']
NEW TABLE
['', '•', 'Operating ratio of 91.3 compared to 93.0']
NEW TABLE
['', '•', 'LTL shipments per workday rose 2.3%']
NEW TABLE
['', '•', 'LTL tonnage per workday increased by 4.0%']
NEW TABLE
['', '•', 'LTL revenue per hundredweight increased 3.1%']
NEW TABLE
['', '•', 'LTL revenue per shipment rose 4.9% to $241.61']
NEW TABLE
['Saia, Inc. and Subsidiaries', '']
['Condensed Consolidated Balance Sheets', '']
['(Amounts in thousands)', '']
['(Unaudited)', '']
['', '', '', '', '', '', '', '', '']
['', '', 'March 31, 2020', '', '', 'December 31, 2019', '']
['ASSETS', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '']
['CURRENT ASSETS:', '', '', '', '', '', '', '', '']
['Cash and cash equivalents', '', '$', '46,909', '', '', '$', '248', '']
['Accounts receivable, net', '', '', '210,894', '', '', '', '196,119', '']
['Prepaid e

In [37]:
# Bare expression: the notebook renders the current value of `rows` as this cell's output.
rows

[['', '•', 'Revenue was $446.4 million, an 8.7% increase'],
 ['', '•', 'Operating income was $38.8 million, a 35.4% increase'],
 ['', '•', 'Operating ratio of 91.3 compared to 93.0'],
 ['', '•', 'LTL shipments per workday rose 2.3%'],
 ['', '•', 'LTL tonnage per workday increased by 4.0%'],
 ['', '•', 'LTL revenue per hundredweight increased 3.1%'],
 ['', '•', 'LTL revenue per shipment rose 4.9% to $241.61'],
 ['Saia, Inc. and Subsidiaries', ''],
 ['Condensed Consolidated Balance Sheets', ''],
 ['(Amounts in thousands)', ''],
 ['(Unaudited)', ''],
 ['', '', '', '', '', '', '', '', ''],
 ['', '', 'March 31, 2020', '', '', 'December 31, 2019', ''],
 ['ASSETS', '', '', '', '', '', '', '', ''],
 ['', '', '', '', '', '', '', '', ''],
 ['CURRENT ASSETS:', '', '', '', '', '', '', '', ''],
 ['Cash and cash equivalents', '', '$', '46,909', '', '', '$', '248', ''],
 ['Accounts receivable, net', '', '', '210,894', '', '', '', '196,119', ''],
 ['Prepaid expenses and other', '', '', '48,105', '', '