In [91]:
import pandas as pd 
from bs4 import BeautifulSoup
import os 
import re

def extract_numeric_value(text):
    """
    Extract and normalize a numeric value, handling negative numbers in various formats.
    
    Args:
        text: Text containing a potential numeric value
        
    Returns:
        Normalized numeric value as a string, or None if no valid number found
    """
    if not text:
        return None
        
    # Clean the text
    cleaned_text = text.replace('$', '').replace(',', '').strip()
    
    # Handle case where opening parenthesis is present but closing one is missing
    # This happens when parentheses are split across cells
    if cleaned_text.startswith('(') and not cleaned_text.endswith(')'):
        match = re.search(r'\(?([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle normal parentheses case
    if '(' in cleaned_text and ')' in cleaned_text:
        # Extract the number inside parentheses
        match = re.search(r'\(([\d\.]+)\)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle numbers with explicit negative signs
    if re.search(r'^\s*[\-−–]', cleaned_text):  # Handle various dash characters
        match = re.search(r'([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Special case for closing parenthesis only - ignore it
    if cleaned_text == ')':
        return None
    
    # Normal number extraction
    match = re.search(r'([\d\.]+)', cleaned_text)
    if match:
        return match.group(1)
    
    return None

def check_eps_pattern(text):
    """
    Check if text contains patterns indicating EPS (Earnings Per Share) information.
    
    Args:
        text: Text to check for EPS patterns
        
    Returns:
        Boolean indicating if an EPS pattern was found
    """
    text = text.lower().strip()
    patterns = [
        r'(?:basic|diluted)?\s*earnings\s*(?:\(loss\))?\s*per\s*(?:common|outstanding|ordinary)?\s*share',
        r'(?:basic|diluted)?\s*loss\s*per\s*(?:common|outstanding|ordinary)?\s*share',
        r'earnings\s*\(loss\)\s*per\s*(?:common|outstanding|ordinary)?\s*share',
        r'net\s*(?:income|loss|earnings)\s*(?:attributable\s*to\s*[a-z\s]+)?\s*per\s*share',
        r'income\s*\(loss\)\s*per\s*share',
        r'\beps\b',
        r'earnings\s*per\s*share',
        r'net\s+income\s+available\s+to\s+common\s+stockholders\s+per\s+share',
        r'net\s+income\s+per\s+common\s+share',
        r'net\s*(?:\(loss\)\s*income|income\s*\(loss\))\s*per\s*share'

    ]
    
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            # Exclude weighted average share patterns
            if re.search(r'weighted|average|shares\s*outstanding', text, re.IGNORECASE):
                continue
            return True
    
    return False

def is_basic_eps(text):
    """
    Check if text refers to basic EPS, diluted EPS, or both.
    
    Args:
        text: Text to check for basic/diluted indicators
        
    Returns:
        Tuple of (is_basic, is_diluted) booleans
    """
    try:
        text = text.lower()
        has_basic = bool(re.search(r'\bbasic\b', text))
        has_diluted = bool(re.search(r'\bdiluted\b', text))
                # Special case: "basic and diluted" or "basic & diluted" should count as both
        if re.search(r'basic\s+(?:and|&)\s+diluted', text) or re.search(r'diluted\s+(?:and|&)\s+basic', text):
            has_basic = True
            has_diluted = True
            print('HERE IN basic')
        return has_basic, has_diluted
    except Exception as e:
        print(f"Error in is_basic_eps: {e}")
        return False, False

def is_gaap_eps(text):
    """
    Check if text refers to GAAP (not non-GAAP/adjusted) EPS.
    
    Args:
        text: Text to check for GAAP/non-GAAP indicators
        
    Returns:
        Boolean indicating if EPS is GAAP (True) or non-GAAP (False)
    """
    text = text.lower()
    return not re.search(r'non-gaap|non\s*gaap|adjusted', text)

def select_eps_value(row_values, row_text, table_idx):
    """
    Select the appropriate EPS value based on priority rules and create the final EPS entry.
    
    Args:
        row_values: List of dictionaries containing EPS values and classifications
        row_text: Text from the row where EPS pattern was found
        table_idx: Table index for reference
        
    Returns:
        Dictionary with selected EPS information
    """
    if not row_values:
        return None
        
    # Default to the first value entry
    selected_entry = row_values[0]
    
    # First try to find basic EPS (highest priority)
    basic_values = [item for item in row_values if item['basic']]
    if basic_values:
        selected_entry = basic_values[0]
    else:
        # If no basic found, try diluted
        diluted_values = [item for item in row_values if item['diluted']]
        if diluted_values:
            selected_entry = diluted_values[0]
    
    # Extract just the values for cleaner output
    value_list = [item['value'] for item in row_values]
    
    # Create the final EPS entry
    return {
        'table_idx': table_idx,
        'row_text': row_text[:100],  # Truncate for readability
        'basic': selected_entry['basic'],
        'diluted': selected_entry['diluted'],
        'gaap': selected_entry['gaap'],
        'value': selected_entry['value'],  # Prioritized value
        'all_values': value_list
    }

def extract_eps_from_filing(file_path, verbose=False):
    """
    Extract EPS values from an HTML financial filing.
    
    Args:
        file_path: Path to the HTML filing
        verbose: Whether to print detailed information during extraction
        
    Returns:
        List of dictionaries containing extracted EPS information
    """
    if verbose:
        print(f"Processing file: {file_path}")
    
    # Initialize the results list
    eps_values = []
    
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        html = f.read()
    
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    
    for table_idx, table in enumerate(tables):
        # Get all rows for sequential access
        rows = table.find_all('tr')
        
        for i, row in enumerate(rows):
            row_text = row.get_text().lower().strip().replace(':', '')
            
            if check_eps_pattern(row_text):
                # Look for cells containing a value in current row
                cells = row.find_all('td')
                if verbose:
                    print(f"Found EPS pattern in row: {row_text[:100]}...")
                
                # Get basic/diluted classification
                basic, diluted = is_basic_eps(row_text)
                
                # Get GAAP classification
                gaap = is_gaap_eps(row_text)
                
                # Check if this row has values
                found_value = False
                row_values = []
                
                for cell in cells:
                    cell_text = cell.get_text().strip()
                    
                    value = extract_numeric_value(cell_text)
    
                    if value is not None:
                        found_value = True
                        
                        row_values.append({
                            'value': value,
                            'basic': basic,
                            'diluted': diluted,
                            'gaap': gaap
                        })
                        
                        if verbose:
                            print(f"Found value in current row: {value}")
                
                # If no values found in current row or we suspect partial parentheses, check the next row
                while (not found_value and i + 1 < len(rows)):
                    next_row = rows[i + 1]
                    next_cells = next_row.find_all('td')
                    next_row_text = next_row.get_text().lower().strip().replace(':', '')
                    
                    # Get classifications from next row
                    if not(basic and diluted):
                        basic, diluted = is_basic_eps(next_row_text)
                    gaap = is_gaap_eps(next_row_text)
                    
                    if verbose:
                        print(f"Checking next row for values...")
                        print(next_row_text)
                    for cell in next_cells:
                        cell_text = cell.get_text().strip()
                        value = extract_numeric_value(cell_text)
                        
                        if value is not None:
                            found_value = True
                            row_values.append({
                                'value': value,
                                'basic': basic,
                                'diluted': diluted,
                                'gaap': gaap
                            })
                            
                            if verbose:
                                print(f"Found value in next row: {value}")
                    i+=1
                # If we found at least one value, use helper method to select and create entry
                if row_values:
                    eps_entry = select_eps_value(row_values, row_text, table_idx)
                    eps_values.append(eps_entry)
    
    return eps_values


def select_final_eps(eps_values):
    """
    Select the final EPS value from all extracted values based on row text patterns and priority rules.
    
    Args:
        eps_values: List of dictionaries containing extracted EPS information
        
    Returns:
        Single EPS value or None if no valid value found
    """
    if not eps_values:
        return None
    
    # If there's only one value, return it directly
    if len(eps_values) == 1:
        return eps_values[0]['value']
    
    # Define pattern priority list (highest priority first)
    # Each entry is (regex pattern, score adjustment)
    pattern_priorities = [
        # Basic net income/earnings per share (highest priority)
        (r'basic\s+(?:and|&)\s+diluted\s+(?:loss|earnings|income)\s+per\s+share', 1200),
        (r'(?:loss|earnings|income)\s+per\s+share\s+[-–]\s+basic\s+(?:and|&)\s+diluted', 1200),
        (r'basic.*net\s+(?:income|earnings).*per\s+share', 1000),
        
        # Basic EPS patterns (very high priority)
        (r'basic\s+earnings\s+per\s+(?:common\s+)?share', 900),
        (r'earnings\s+per\s+(?:common\s+)?share.*basic', 900),
        (r'net\s+income.*per\s+(?:common\s+)?share.*basic', 900),
        
        # Important: Add patterns for "earnings (loss)" format (high priority)
        (r'earnings\s*\(loss\)\s*per\s+(?:common\s+)?share', 850),
        (r'net\s+earnings\s*\(loss\)\s*per\s+share', 850),
        (r'\(loss\)\s*earnings\s*per\s+(?:common\s+)?share', 850),
        
        # Income/earnings per share (likely basic if not specified)
        (r'net\s+(?:income|earnings).*per\s+share', 800),
        (r'net\s+income\s+per\s+common\s+share', 800),  # High priority
        (r'income.*per\s+share', 700),
        (r'earnings\s+per\s+(?:common\s+)?share', 700),
        
        # Loss per share (still high priority)
        (r'basic.*loss\s+per\s+share', 650),
        (r'net\s+loss.*per\s+share', 650),
        (r'loss\s+per\s+share', 650),
        
        # GAAP EPS terms
        (r'gaap.*earnings\s+per\s+share', 600),
        
        # Specific types of basic EPS
        (r'basic\s+.*per\s+share', 550),
        
        # Diluted net income/earnings per share (medium priority)
        (r'diluted.*net\s+(?:income|earnings).*per\s+share', 500),
        
        # Diluted EPS patterns
        (r'diluted\s+earnings\s+per\s+(?:common\s+)?share', 400),
        (r'earnings\s+per\s+(?:common\s+)?share.*diluted', 400),
        
        # Generic diluted patterns
        (r'diluted\s+.*per\s+share', 300),
        
        # Non-GAAP or adjusted terms (lower priority)
        (r'adjusted\s+(?:basic\s+)?earnings\s+per\s+share', 200),
        (r'non-gaap.*earnings\s+per\s+share', 100),
        (r'adjusted', 50),
        
        # Generic EPS terms (lowest priority, but still valid)
        (r'per\s+share', 25),
        (r'eps', 20),
    ]
    
    scored_values = []
    for idx, entry in enumerate(eps_values):
        # Start with a base score
        score = 0
        row_text = entry['row_text'].lower().replace(':', '')
        
        # Debug - print the exact row text being matched
        print(f"Matching text: '{row_text}'")
        
        # Add score based on pattern matches
        matched_pattern = None
        for pattern, pattern_score in pattern_priorities:
            if re.search(pattern, row_text):
                score += pattern_score
                matched_pattern = pattern
                break  # Only apply the highest matching pattern
        
        # Add additional score components
        # 1. Prioritize basic over diluted
        if entry['basic']:
            score += 100
        elif entry['diluted']:
            score += 10
            
        # 2. Prioritize GAAP over non-GAAP
        if entry['gaap']:
            score += 20
        
        # 3. Check numerical reasonableness (EPS values are typically between -100 and 100)
        try:
            value_float = float(entry['value'])
            if -20 <= value_float <= 20:
                # Most reasonable EPS range
                score += 100  # Increased from 15 to 100
            elif -100 <= value_float <= 100:
                # Wider but still reasonable EPS range
                score += 50
            elif -1000 <= value_float <= 1000:
                # Unusual but possible range
                score -= 100  # Increased penalty
            else:
                # Very likely not an EPS value (e.g., shares outstanding, total earnings in millions)
                score -= 500  # Much stronger penalty
        except ValueError:
            score -= 50  
            
        scored_values.append({
            'index': idx,
            'score': score,
            'value': entry['value'],
            'original': entry,
            'row_text': row_text,
            'matched_pattern': matched_pattern
        })
    
    # Sort by score in descending order
    scored_values.sort(key=lambda x: x['score'], reverse=True)
    
    # For debugging
    for i, sv in enumerate(scored_values[:3]):  # Show top 3
        print(f"Candidate {i+1}: Score {sv['score']}, Value: {sv['value']}")
        print(f"  Row text: {sv['row_text'][:80]}...")
        print(f"  Matched pattern: {sv['matched_pattern']}")
    
    # Return the highest scoring value
    if scored_values:
        top_score = scored_values[0]['score']
        top_row_text = scored_values[0]['row_text']
        top_row_value = scored_values[0]['value']
        
        # Find all entries with the same row text and score
        identical_rows = [
            sv for sv in scored_values 
            if (sv['score'] == top_score and sv['row_text'] == top_row_text) and sv['value'] !=top_row_value
        ]
        
        if len(identical_rows) > 0:
            print(f"Found {len(identical_rows)} rows with identical text and score: '{top_row_text[:50]}...'")
            
            # Sum the values
            total_value = float(top_row_value)
            for row in identical_rows:
                try:
                    total_value += float(row['value'])
                except ValueError:
                    # Skip non-numeric values
                    print(f"Warning: Could not convert '{row['value']}' to float for summation")
            
            # Return the sum as a string with the same precision as the original values
            # (Get decimal places from the first value as a reference)
            try:
                decimal_places = len(identical_rows[0]['value'].split('.')[-1]) if '.' in identical_rows[0]['value'] else 0
                return f"{total_value:.{decimal_places}f}"
            except:
                # Fallback to basic formatting if precision detection fails
                return str(total_value)
        
        # If no identical rows, return the highest scoring value
        return scored_values[0]['value']

    
    return None
def process_directory(directory_path, verbose=False):
    """
    Process all HTML files in a directory to extract EPS values.
    
    Args:
        directory_path: Path to directory containing HTML filings
        verbose: Whether to print detailed information during extraction
        
    Returns:
        DataFrame containing all extracted EPS information
    """
    all_results = []
    
    for filename in os.listdir(directory_path):
        if filename.endswith('.html'):
            file_path = os.path.join(directory_path, filename)
            results = extract_eps_from_filing(file_path, verbose=verbose)
            value = select_final_eps(results)

            all_results.append({ "filename": filename,"eps": value})
    
    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(all_results)
    return df
def extract_numeric_value(text):
    """
    Extract and normalize a numeric value, handling negative numbers in various formats.
    Prioritizes decimal numbers over likely footnote references (small integers).
    
    Args:
        text: Text containing a potential numeric value
        
    Returns:
        Normalized numeric value as a string, or None if no valid number found
    """
    if not text:
        return None
        
    # Clean the text
    cleaned_text = text.replace('$', '').replace(',', '').strip()
    
    # First try to find decimal numbers (more likely to be actual values)
    decimal_match = re.search(r'([\d]+\.[\d]+)', cleaned_text)
    if decimal_match:
        # Found a number with decimal places - likely the real value
        
        # Check if it's in parentheses (negative)
        if (f"({decimal_match.group(1)})" in cleaned_text or 
            f"( {decimal_match.group(1)} )" in cleaned_text):
            return f"-{decimal_match.group(1)}"
            
        # Check for explicit negative signs
        if re.search(r'^\s*[\-−–]', cleaned_text):
            return f"-{decimal_match.group(1)}"
            
        # Check for opening parenthesis without closing (split across cells)
        if cleaned_text.startswith('(') and not cleaned_text.endswith(')'):
            return f"-{decimal_match.group(1)}"
            
        return decimal_match.group(1)
    
    # Handle case where opening parenthesis is present but closing one is missing
    if cleaned_text.startswith('(') and not cleaned_text.endswith(')'):
        match = re.search(r'\(?([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle normal parentheses case
    if '(' in cleaned_text and ')' in cleaned_text:
        # Extract the number inside parentheses
        match = re.search(r'\(([\d\.]+)\)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle numbers with explicit negative signs
    if re.search(r'^\s*[\-−–]', cleaned_text):  # Handle various dash characters
        match = re.search(r'([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Special case for closing parenthesis only - ignore it
    if cleaned_text == ')':
        return None
    
    # Look for integers, but filter out likely footnote references
    int_matches = re.findall(r'\b(\d+)\b', cleaned_text)
    if int_matches:
        # Filter out small integers that are likely footnotes
        valid_ints = [n for n in int_matches if len(n) > 2 or int(n) > 20]
        if valid_ints:
            # Return the first valid integer
            return valid_ints[0]
        
        # If only small integers found, return the largest one as a fallback
        # (Less likely to be a footnote reference)
        if int_matches:
            largest = max(int_matches, key=lambda x: int(x))
            # Only return if it's part of a longer text (not standalone footnote)
            if len(cleaned_text) > len(largest) + 3:
                return largest
    
    return None



In [98]:
# Process a single file
file_path = 'Training_Filings_test/0000046080-20-000050.html'
results = extract_eps_from_filing(file_path, verbose=True)


results

Processing file: Training_Filings_test/0000046080-20-000050.html
Found EPS pattern in row: reconciliation of net earnings and earnings per share...
Checking next row for values...
quarter ended
Checking next row for values...
(all adjustments reported after-tax)march 29, 2020diluted per share amountpro formamarch 31, 2019pro forma diluted per share amount (1)
Found value in next row: 29
Found value in next row: 31
Found value in next row: -1


[{'table_idx': 9,
  'row_text': 'reconciliation of net earnings and earnings per share',
  'basic': False,
  'diluted': True,
  'gaap': True,
  'value': '29',
  'all_values': ['29', '31', '-1']}]

In [93]:
select_final_eps(results)

'-0.03'

In [94]:
df =process_directory('Training_Filings_test', verbose=False)

df

Matching text: 'net earnings per share – basic $0.78 $1.23 (36.6)%'
Matching text: 'net earnings per share – diluted 0.78 1.23 (36.6)'
Matching text: 'shares used to compute earnings per share (000)'
Candidate 1: Score 1120, Value: 0.78
  Row text: net earnings per share – basic $0.78 $1.23 (36.6)%...
  Matched pattern: earnings\s+per\s+(?:common\s+)?share.*basic
Candidate 2: Score 930, Value: 0.78
  Row text: net earnings per share – diluted 0.78 1.23 (36.6)...
  Matched pattern: net\s+(?:income|earnings).*per\s+share
Candidate 3: Score 820, Value: -000
  Row text: shares used to compute earnings per share (000)...
  Matched pattern: earnings\s+per\s+(?:common\s+)?share
Matching text: 'earnings (loss) per common share'
Matching text: 'gaap net income (loss) and gaap diluted earnings (loss) per share$(10,643) $(0.41) $48,234 $1.84'
Matching text: 'non-gaap net income and diluted earnings per share$12,369 $0.47 $71,246 $2.71'
Candidate 1: Score 1070, Value: -0.41
  Row text: earnings (l

Unnamed: 0,filename,eps
0,0000004977-20-000054.html,0.78
1,0000008947-20-000044.html,-0.41
2,0000046080-20-000050.html,29.0
3,0000066570-20-000013.html,1.12
4,0000314808-20-000062.html,-15.19
5,0000706129-20-000012.html,0.26
6,0000846617-20-000024.html,0.47
7,0000874766-20-000033.html,0.74
8,0000875320-20-000014.html,2.32
9,0000892537-20-000010.html,0.71


In [95]:
actual_df= pd.read_csv('actual_values.csv')
actual_df = actual_df[['filename', 'actual_value']]
merged_df = pd.merge(df, actual_df, on='filename', how='inner')
merged_df['eps'] = merged_df['eps'].astype(float)
merged_df['actual_value'] = merged_df['actual_value'].astype(float)
exact_matches = (merged_df['eps'] == merged_df['actual_value']).sum()
exact_matches

49

In [96]:
mismatches = merged_df[merged_df['eps'] != merged_df['actual_value']]
mismatches

Unnamed: 0,filename,eps,actual_value
2,0000046080-20-000050.html,29.0,-0.51


## BELOW HERE IS TESTING CODE !!!!

In [47]:
merged_df

Unnamed: 0,filename,eps,actual_value
0,0000004977-20-000054.html,0.78,0.78
1,0000008947-20-000044.html,-0.41,-0.41
2,0000046080-20-000050.html,,-0.51
3,0000066570-20-000013.html,1.12,1.12
4,0000314808-20-000062.html,-15.19,-15.19
5,0000706129-20-000012.html,0.26,0.26
6,0000846617-20-000024.html,0.47,0.47
7,0000874766-20-000033.html,1.0,0.74
8,0000875320-20-000014.html,2.32,2.32
9,0000892537-20-000010.html,0.71,0.71


In [1]:
import pandas as pd 
from bs4 import BeautifulSoup
import os 
import re 
def extract_numeric_value(text):
    """Extract and normalize a numeric value, handling negative numbers in various formats"""
    if not text:
        return None
        
    # Clean the text
    cleaned_text = text.replace('$', '').replace(',', '').strip()
    
    # Handle case where opening parenthesis is present but closing one is missing
    # This happens when parentheses are split across cells
    if cleaned_text.startswith('(') and not cleaned_text.endswith(')'):
        match = re.search(r'\(?([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle normal parentheses case
    if '(' in cleaned_text and ')' in cleaned_text:
        # Extract the number inside parentheses
        match = re.search(r'\(([\d\.]+)\)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Handle numbers with explicit negative signs
    if re.search(r'^\s*[\-−–]', cleaned_text):  # Handle various dash characters
        match = re.search(r'([\d\.]+)', cleaned_text)
        if match:
            return f"-{match.group(1)}"
    
    # Special case for closing parenthesis only - ignore it
    if cleaned_text == ')':
        return None
    
    # Normal number extraction
    match = re.search(r'([\d\.]+)', cleaned_text)
    if match:
        return match.group(1)
    
    return None
def check_eps_pattern(text):
    """Expanded pattern checking for EPS-related text"""
    text = text.lower().strip()
    patterns = [
        r'(?:basic|diluted)?\s*earnings\s*(?:\(loss\))?\s*per\s*(?:common|outstanding)?\s*share',
        r'(?:basic|diluted)?\s*loss\s*per\s*(?:common|outstanding)?\s*share',
        r'earnings\s*\(loss\)\s*per\s*(?:common|outstanding)?\s*share',
        r'net\s*(?:income|loss|earnings)\s*(?:attributable\s*to\s*[a-z\s]+)?\s*per\s*share',
        r'income\s*\(loss\)\s*per\s*share',
        r'\beps\b',
       # r'per\s*(?:common|outstanding)?\s*share',
        r'earnings\s*per\s*share'
    ]
    
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            # Exclude weighted average share patterns
            if re.search(r'weighted|average|shares\s*outstanding', text, re.IGNORECASE):
                continue
            return True
    
    return False
def is_basic_eps(text):
    """Check if text refers to basic EPS (not diluted)"""
    try:
        text = text.lower()
        has_basic = bool(re.search(r'\bbasic\b', text))
        has_diluted = bool(re.search(r'\bdiluted\b', text))
        return has_basic,has_diluted
    except Exception as e:
        print(f"Error in is_basic_eps: {e}")
        return False
def is_gaap_eps(text):
    """Check if text refers to GAAP (not non-GAAP/adjusted) EPS"""
    text = text.lower()
    return not re.search(r'non-gaap|non\s*gaap|adjusted', text)
def select_eps_value(row_values, row_text, table_idx):
    """
    Helper method to select the appropriate EPS value based on priority rules
    and create the final EPS entry dictionary.
    
    Args:
        row_values: List of dictionaries containing EPS values and classifications
        row_text: Text from the row where EPS pattern was found
        table_idx: Table index for reference
        
    Returns:
        Dictionary with selected EPS information
    """
    if not row_values:
        return None
        
    # Default to the first value entry
    selected_entry = row_values[0]
    
    # First try to find basic EPS
    basic_values = [item for item in row_values if item['basic']]
    if basic_values:
        selected_entry = basic_values[0]
        print(f"Selected basic EPS value: {selected_entry['value']}")
    else:
        # If no basic found, try diluted
        diluted_values = [item for item in row_values if item['diluted']]
        if diluted_values:
            selected_entry = diluted_values[0]
            print(f"No basic EPS found, using diluted: {selected_entry['value']}")
        else:
            print(f"No specific classification found, using first value: {selected_entry['value']}")
    
    # Extract just the values for cleaner output
    value_list = [item['value'] for item in row_values]
    
    # Create the final EPS entry
    return {
        'table_idx': table_idx,
        'row_text': row_text[:100],  # Truncate for readability
        'basic': selected_entry['basic'],
        'diluted': selected_entry['diluted'],
        'gaap': selected_entry['gaap'],
        'value': selected_entry['value'],  # Prioritized value
        'all_values': value_list
    }


In [2]:
file_path = 'Training_Filings/0000066570-20-000013.html'
print(file_path)
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
    html = f.read()
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')
result = []
for table in tables:
    # Look for rows containing EPS terms
    rows = table.find_all('tr')
    for i, row in enumerate(rows):
        basic,diluted = None
        row_text = row.get_text().lower().strip()
        if check_eps_pattern(row_text):
            # Look for cells containing a value
            cells = row.find_all('td')
            print(f"Found EPS pattern in row: {row_text[:100]}...")
            for cell in cells:
                value = extract_numeric_value(cell.get_text())
                if value is not None: 
                    ###RETURN BASIC AND DILUTED AND VALUE WE WANT THE FIRST VALUE OF A ROW I THINK 
                    print(value)
                    basic,diluted = is_basic_eps(row_text)
                    eps_values.append({
                        'basic': basic,
                        'diluted': diluted,
                        'value': numeric_value
                    })
                else:
                    next_row = rows[i + 1]
                    next_cells = next_row.find_all('td')
                    print(f"No values found in current row, checking next row...")
                    for cell in next_cells:
                        value = extract_numeric_value(cell.get_text())
                        if value is not None:
                            print(f"Found value in next row: {value}")
            
                    


Training_Filings/0000066570-20-000013.html


TypeError: cannot unpack non-iterable NoneType object

In [33]:
file_path = 'Training_Filings_test/0000314808-20-000062.html'
print(file_path)

# Initialize the results list
eps_values = []

with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')

for table_idx, table in enumerate(tables):
    # Get all rows for sequential access
    rows = table.find_all('tr')
    
    for i, row in enumerate(rows):
        row_text = row.get_text().lower().strip()
        
        if check_eps_pattern(row_text):
            # Look for cells containing a value in current row
            cells = row.find_all('td')
            print(f"Found EPS pattern in row: {row_text[:100]}...")
            
            # Get basic/diluted classification
            basic, diluted = is_basic_eps(row_text)
            
            # Get GAAP classification
            gaap = is_gaap_eps(row_text)
            
            # Check if this row has values
            found_value = False
            row_values = []
            
            for cell in cells:
                value = extract_numeric_value(cell.get_text())
                if value is not None:
                    found_value = True
                    row_values.append({
                        'value': value,
                        'basic': basic,
                        'diluted': diluted,
                        'gaap': gaap
                    })
                    print(f"Found value in current row: {value}")
            
            # If no values found in current row, check the next row if available
            if not found_value and i + 1 < len(rows):
                next_row = rows[i + 1]
                next_cells = next_row.find_all('td')
                next_row_text = next_row.get_text().lower().strip()
                basic, diluted = is_basic_eps(next_row_text)
                gaap = is_gaap_eps(next_row_text)
                print(f"No values found in current row, checking next row...")
                
                for cell in next_cells:
                    value = extract_numeric_value(cell.get_text())
                    if value is not None:
                        row_values.append({
                            'value': value,
                            'basic': basic,
                            'diluted': diluted,
                            'gaap': gaap
                        })
                        print(f"Found value in next row: {value}")
            
            # If we found at least one value (in either current or next row), record it
            if row_values:
                eps_entry = select_eps_value(row_values, row_text, table_idx)
                eps_values.append(eps_entry)


Training_Filings_test/0000314808-20-000062.html
Found EPS pattern in row: loss per share - basic and diluted$(15.19) $(1.09)...
Found value in current row: -15.19
Found value in current row: -1.09
Found EPS pattern in row: loss per share reconciliation(1):three months ended...
Found value in current row: -1
Found EPS pattern in row: income (loss) from continuing operations attributable to valaris shares earnings (loss) per share fr...
No values found in current row, checking next row...
Found value in next row: -3006.3
Found value in next row: -15.19
Found value in next row: -216.0
Found value in next row: -1.09


In [34]:
eps_values

[{'table_idx': 7,
  'row_text': 'loss per share - basic and diluted$(15.19)\xa0$(1.09)',
  'basic': True,
  'diluted': True,
  'gaap': True,
  'value': '-15.19',
  'all_values': ['-15.19', '-1.09']},
 {'table_idx': 19,
  'row_text': 'loss per share reconciliation(1):three months ended',
  'basic': False,
  'diluted': False,
  'gaap': True,
  'value': '-1',
  'all_values': ['-1']},
 {'table_idx': 19,
  'row_text': 'income (loss) from continuing operations attributable to valaris shares\xa0earnings (loss) per share fr',
  'basic': False,
  'diluted': False,
  'gaap': True,
  'value': '-3006.3',
  'all_values': ['-3006.3', '-15.19', '-216.0', '-1.09']}]

Unnamed: 0,0,1,2,3,4,5,6,7
26,Earnings per share attributable to MSA Safety ...,,,,,,,
27,Basic,$,1.12,,,$,0.6,
28,Diluted,$,1.11,,,$,0.59,
