In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup

In [2]:
# Sample HTML for one page of an ECG report, as emitted by a PDF-to-HTML extraction
# step (in practice, load this string from the extraction output instead).
# Each <p> carries absolute `top`/`left` offsets in points. Note that in this sample
# a measurement label and its value share the same `left` offset and differ in `top`
# (e.g. "QT/QTc" and "492/425" are both at left:68.6pt).
html_content = """
<div id="page0" style="width:612.0pt;height:792.0pt">
<p style="top:754.1pt;left:198.8pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">I </span></p>
<p style="top:754.1pt;left:268.7pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">II </span></p>
<p style="top:754.1pt;left:338.6pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">III </span></p>
<p style="top:576.9pt;left:198.8pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">aVR</span></p>
<p style="top:576.9pt;left:268.7pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">aVL</span></p>
<p style="top:576.9pt;left:338.6pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">aVF</span></p>
<p style="top:399.7pt;left:198.8pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V1 </span></p>
<p style="top:399.7pt;left:268.7pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V2 </span></p>
<p style="top:399.7pt;left:338.6pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V3 </span></p>
<p style="top:222.6pt;left:198.8pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V4 </span></p>
<p style="top:222.6pt;left:268.7pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V5 </span></p>
<p style="top:222.6pt;left:338.6pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V6 </span></p>
<p style="top:754.1pt;left:408.4pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V1 </span></p>
<p style="top:754.1pt;left:478.3pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">II </span></p>
<p style="top:754.1pt;left:548.2pt;line-height:9.1pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">V5 </span></p>
<p style="top:446.0pt;left:141.2pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">Referred by:    </span></p>
<p style="top:236.1pt;left:141.2pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">Electronically Signed By </span></p>
<p style="top:693.4pt;left:141.4pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">Indicated on </span></p>
<p style="top:511.7pt;left:41.4pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">45</span></p>
<p style="top:630.2pt;left:41.4pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">Vent. rate</span></p>
<p style="top:484.9pt;left:50.5pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">bpm</span></p>
<p style="top:516.3pt;left:50.5pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">i154</span></p>
<p style="top:630.2pt;left:50.5pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">PR interval</span></p>
<p style="top:484.9pt;left:59.5pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">max/Corrected</span></p>
<p style="top:630.2pt;left:68.6pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">QT/QTc</span></p>
<p style="top:532.4pt;left:68.6pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">492/425</span></p>
<p style="top:478.8pt;left:77.7pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">Temp intervals</span></p>
<p style="top:510.2pt;left:77.7pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">-.9</span></p>
<p style="top:526.4pt;left:77.7pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">X14</span></p>
<p style="top:628.2pt;left:77.7pt"><span style="font-family:Times New Roman,serif;font-size:9.1pt;color:#000000">P-R-T axes</span></p>
</div>
"""

In [3]:
def parse_ecg_features(html_content, line_tolerance=1.0):
    """
    Parse positioned text from PDF-derived HTML and merge it into text lines.

    Every ``<p>`` tag whose ``style`` attribute carries ``top`` and ``left``
    offsets in points, and whose first ``<span>`` holds non-blank text, yields
    one text element. Elements are grouped into lines by their ``top`` offset
    and each line is joined left to right with single spaces.

    Args:
        html_content: HTML markup to parse.
        line_tolerance: An element joins the current line when its ``top``
            differs from the previous element's ``top`` by less than this many
            points. Defaults to 1.0.

    Returns:
        list[str]: One string per detected line, ordered by ascending ``top``.
    """
    def css_points(style, prop):
        # Anchor on a declaration boundary so e.g. "margin-top" is not read as
        # "top"; accept a leading minus and leading-dot decimals, and never
        # hand float() something like "1.2.3".
        found = re.search(
            r'(?:^|;)\s*' + prop + r'\s*:\s*(-?(?:\d+\.?\d*|\.\d+))pt', style
        )
        return float(found.group(1)) if found else None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Collect every piece of text together with its page position
    text_elements = []
    for p_tag in soup.find_all('p'):
        span = p_tag.find('span')
        if span is None:
            continue
        text = span.get_text().strip()
        if not text:
            continue
        style = p_tag.get('style', '')
        top = css_points(style, 'top')
        left = css_points(style, 'left')
        if top is None or left is None:
            continue
        text_elements.append({'text': text, 'top': top, 'left': left})

    # Walk the elements from the smallest `top` upwards, starting a new line
    # whenever the vertical gap to the previous element reaches the tolerance
    text_elements.sort(key=lambda item: (item['top'], item['left']))
    lines = []
    previous_top = None
    for elem in text_elements:
        if previous_top is not None and abs(elem['top'] - previous_top) < line_tolerance:
            lines[-1].append(elem)
        else:
            lines.append([elem])
        previous_top = elem['top']

    combined_lines = []
    for line in lines:
        # Elements on one line may differ slightly in `top`, so the global
        # (top, left) sort does not guarantee left-to-right order inside a line
        line.sort(key=lambda item: item['left'])
        combined_lines.append(' '.join(item['text'] for item in line))

    return combined_lines

# Run the line parser over the sample markup and print each line with a 1-based index
lines = parse_ecg_features(html_content)
print("Extracted lines:")
for i, line in enumerate(lines, start=1):
    print(f"{i}.", line)

Extracted lines:
1. V4 V5 V6
2. Electronically Signed By
3. V1 V2 V3
4. Referred by:
5. Temp intervals
6. bpm max/Corrected
7. -.9
8. 45
9. i154
10. X14
11. 492/425
12. aVR aVL aVF
13. P-R-T axes
14. Vent. rate PR interval QT/QTc
15. Indicated on
16. I II III V1 II V5


In [None]:
def extract_ecg_measurements_v2(html_content, tolerance=20):
    """
    Extract named ECG measurements by pairing each label with nearby values.

    Every ``<p>`` tag whose ``style`` attribute carries ``top`` and ``left``
    offsets in points, and whose first ``<span>`` holds non-blank text, yields
    one text element. For each recognised label the elements whose ``top``
    lies close to the label's ``top`` are scanned in document order and the
    first suitable value is taken. Proximity is measured on ``top`` only, so a
    label and its value must sit at a similar ``top`` offset to be paired.

    Args:
        html_content: HTML markup to parse.
        tolerance: Two elements count as "nearby" when their ``top`` offsets
            differ by less than this many points. Defaults to 20.

    Returns:
        dict: Measurement name -> number. Possible keys are
        ``Ventricular_Rate_bpm``, ``PR_Interval_ms``, ``QT_ms``, ``QTc_ms``,
        ``QRS_Duration_ms`` (ints) and ``P_Axis_degrees`` (float). A key is
        omitted when its label or a matching value is not found.
    """
    def css_points(style, prop):
        # Anchor on a declaration boundary so e.g. "margin-top" is not read as
        # "top"; accept a leading minus and leading-dot decimals.
        found = re.search(
            r'(?:^|;)\s*' + prop + r'\s*:\s*(-?(?:\d+\.?\d*|\.\d+))pt', style
        )
        return float(found.group(1)) if found else None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Collect every piece of text together with its page position
    text_elements = []
    for p_tag in soup.find_all('p'):
        span = p_tag.find('span')
        if span is None:
            continue
        text = span.get_text().strip()
        if not text:
            continue
        style = p_tag.get('style', '')
        top = css_points(style, 'top')
        left = css_points(style, 'left')
        if top is None or left is None:
            continue
        text_elements.append({'text': text, 'top': top, 'left': left})

    def nearby(anchor):
        # All elements (the anchor included) vertically close to `anchor`
        return [
            candidate for candidate in text_elements
            if abs(candidate['top'] - anchor['top']) < tolerance
        ]

    def first_whole_number(candidates):
        # isdecimal() rather than isdigit(): the latter also accepts characters
        # such as superscript digits, which int() refuses to parse
        for candidate in candidates:
            if candidate['text'].isdecimal():
                return int(candidate['text'])
        return None

    measurements = {}

    # A text may match more than one label, hence independent `if`s
    for elem in text_elements:
        text = elem['text']

        # Ventricular rate: find a 'bpm' marker near the label, then a bare
        # integer near that marker; stop at the first marker that has one
        if 'Vent. rate' in text:
            for unit_elem in nearby(elem):
                if 'bpm' not in unit_elem['text']:
                    continue
                rate = first_whole_number(nearby(unit_elem))
                if rate is not None:
                    measurements['Ventricular_Rate_bpm'] = rate
                    break

        # PR interval: first bare integer near the label
        if 'PR interval' in text:
            pr_value = first_whole_number(nearby(elem))
            if pr_value is not None:
                measurements['PR_Interval_ms'] = pr_value

        # QT/QTc: first nearby text starting with a pair like "492/425"
        if 'QT/QTc' in text:
            for other in nearby(elem):
                qt_match = re.match(r'(\d+)/(\d+)', other['text'])
                if qt_match:
                    measurements['QT_ms'] = int(qt_match.group(1))
                    measurements['QTc_ms'] = int(qt_match.group(2))
                    break

        # QRS duration: first bare integer near the label
        if 'QRS' in text and 'duration' in text.lower():
            qrs_value = first_whole_number(nearby(elem))
            if qrs_value is not None:
                measurements['QRS_Duration_ms'] = qrs_value

        # P-R-T axes: first number (optionally signed, optionally written
        # without a leading zero such as "-.9") found in a nearby text
        if 'P-R-T axes' in text:
            for other in nearby(elem):
                numbers = re.findall(r'-?(?:\d+\.?\d*|\.\d+)', other['text'])
                if numbers:
                    measurements['P_Axis_degrees'] = float(numbers[0])
                    break

    return measurements

# Run the label/value extractor on the sample markup and report every measurement found
measurements = extract_ecg_measurements_v2(html_content)
print("Extracted ECG Measurements:")
for key, value in measurements.items():
    print(f"{key}:", value)

Extracted ECG Measurements:


In [None]:
# Wrap the measurement dict in a one-row DataFrame and print it transposed,
# so that each measurement appears on its own line
df = pd.DataFrame([measurements])
print()
print("ECG Measurements DataFrame:")
print(df.transpose())

## Enhanced ECG Parser for Grid-Like Format

This parser will extract ECG measurements organized in a grid structure with named rows for easy access.

In [None]:
def parse_ecg_grid(html_content, tolerance=2):
    """
    Parse ECG measurements from positioned HTML text laid out as a grid.

    Every ``<p>`` tag whose ``style`` attribute carries ``top`` and ``left``
    offsets in points, and whose first ``<span>`` holds non-blank text, yields
    one text element. Elements are grouped into rows by ``top`` and ordered
    left to right inside each row; each row is then classified by the label
    text it contains and its values are parsed. Rows are keyed on ``top``
    only, so a label and its value must share a similar ``top`` to be paired.

    Args:
        html_content: HTML markup to parse.
        tolerance: An element joins the current row when its ``top`` is at
            most this many points away from the ``top`` of the element that
            started the row. Defaults to 2.

    Returns:
        dict: Row name -> parsed data. Every entry has ``'unit'`` and ``'raw'``
        (the row's joined text) plus:

        * ``'Ventricular_Rate'``, ``'PR_Interval'``, ``'QRS_Duration'``:
          ``'value'`` (int)
        * ``'QT_QTc'``: ``'QT'`` and ``'QTc'`` (ints)
        * ``'P_R_T_Axes'``: ``'P_axis'``, ``'R_axis'``, ``'T_axis'`` (ints)
          when at least three integers are found, otherwise ``'values'``
          (list of ints)
        * ``'RV5_SV1'``: ``'RV5'`` and/or ``'SV1'`` (floats)
        * ``'RV5_plus_SV1'``: ``'value'`` (float)

        A row whose label is recognised but whose value cannot be parsed adds
        no entry. If several rows map to the same name, the last one wins.
    """
    def css_points(style, prop):
        # Anchor on a declaration boundary so e.g. "margin-top" is not read as
        # "top"; accept a leading minus and leading-dot decimals.
        found = re.search(
            r'(?:^|;)\s*' + prop + r'\s*:\s*(-?(?:\d+\.?\d*|\.\d+))pt', style
        )
        return float(found.group(1)) if found else None

    def first_whole_number(texts):
        # isdecimal() rather than isdigit(): the latter also accepts characters
        # such as superscript digits, which int() refuses to parse
        for candidate in texts:
            if candidate.isdecimal():
                return int(candidate)
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Collect every piece of text together with its page position
    elements = []
    for p_tag in soup.find_all('p'):
        span = p_tag.find('span')
        if span is None:
            continue
        text = span.get_text().strip()
        if not text:
            continue
        style = p_tag.get('style', '')
        top = css_points(style, 'top')
        left = css_points(style, 'left')
        if top is None or left is None:
            continue
        elements.append({'text': text, 'top': top, 'left': left})

    # Group into rows: a row is anchored at the `top` of its first element
    elements.sort(key=lambda item: (item['top'], item['left']))
    rows = []
    row_top = None
    for elem in elements:
        if row_top is not None and abs(elem['top'] - row_top) <= tolerance:
            rows[-1].append(elem)
        else:
            rows.append([elem])
            row_top = elem['top']

    # Elements of one row may differ slightly in `top`, so re-order each row
    # by `left`; the positional logic below relies on left-to-right order
    for row in rows:
        row.sort(key=lambda item: item['left'])

    ecg_data = {}

    for row in rows:
        row_texts = [item['text'] for item in row]
        row_string = ' '.join(row_texts)

        # Ventricular rate (bpm): first bare integer in the row
        if any('Vent' in text or 'rate' in text for text in row_texts):
            value = first_whole_number(row_texts)
            if value is not None:
                ecg_data['Ventricular_Rate'] = {'value': value, 'unit': 'bpm', 'raw': row_string}

        # PR interval (ms)
        elif any('PR' in text and 'interval' in text for text in row_texts):
            value = first_whole_number(row_texts)
            if value is not None:
                ecg_data['PR_Interval'] = {'value': value, 'unit': 'ms', 'raw': row_string}

        # QRS duration (ms)
        elif any('QRS' in text for text in row_texts):
            value = first_whole_number(row_texts)
            if value is not None:
                ecg_data['QRS_Duration'] = {'value': value, 'unit': 'ms', 'raw': row_string}

        # QT/QTc (ms): first text starting with a pair like "492/425"
        elif any('QT' in text for text in row_texts):
            for text in row_texts:
                match = re.match(r'(\d+)/(\d+)', text)
                if match:
                    ecg_data['QT_QTc'] = {
                        'QT': int(match.group(1)),
                        'QTc': int(match.group(2)),
                        'unit': 'ms',
                        'raw': row_string
                    }
                    break

        # P-R-T axes (degrees): all integers in the row, possibly negative
        elif any('P-R-T' in text or 'axes' in text for text in row_texts):
            values = []
            for text in row_texts:
                values.extend(int(number) for number in re.findall(r'-?\d+', text))

            if len(values) >= 3:
                ecg_data['P_R_T_Axes'] = {
                    'P_axis': values[0],
                    'R_axis': values[1],
                    'T_axis': values[2],
                    'unit': 'degrees',
                    'raw': row_string
                }
            elif values:
                ecg_data['P_R_T_Axes'] = {'values': values, 'unit': 'degrees', 'raw': row_string}

        # RV5 + SV1 sum (mV). Must be tested before the plain RV5/SV1 row:
        # every row mentioning "RV5" would otherwise be claimed by that branch
        elif 'RV5+SV1' in row_string or ('RV5' in row_string and '+' in row_string):
            for text in row_texts:
                # Drop the label tokens first so their own digits ("5", "1")
                # are not mistaken for the measured value
                numbers = re.findall(r'\d+\.?\d*', re.sub(r'RV5|SV1', '', text))
                if numbers:
                    ecg_data['RV5_plus_SV1'] = {'value': float(numbers[0]), 'unit': 'mV', 'raw': row_string}
                    break

        # RV5 and SV1 (mV): each value is read from the text right after its label
        elif any('RV5' in text or 'SV1' in text for text in row_texts):
            values = {}
            for label, following in zip(row_texts, row_texts[1:]):
                numbers = re.findall(r'\d+\.?\d*', following)
                if not numbers:
                    continue
                if 'RV5' in label:
                    values['RV5'] = float(numbers[0])
                if 'SV1' in label:
                    values['SV1'] = float(numbers[0])

            if values:
                ecg_data['RV5_SV1'] = {**values, 'unit': 'mV', 'raw': row_string}

    return ecg_data

# Run the grid parser on the sample markup and dump every recognised row
ecg_data = parse_ecg_grid(html_content)

print("Parsed ECG Grid Data:", "=" * 60, sep="\n")
for row_name, data in ecg_data.items():
    print(f"\n{row_name}:\n  {data}")

In [None]:
# Convert to DataFrame with separate columns for better organization
def ecg_data_to_dataframe(ecg_data):
    """
    Flatten parsed ECG data into a long-format pandas DataFrame.

    Each measurement contributes one row per value it holds:

    * an entry with ``'value'`` -> one row named after the measurement;
    * an entry with ``'QT'`` and ``'QTc'`` -> rows ``<name>_QT`` and ``<name>_QTc``;
    * an entry with ``'P_axis'`` (which must then also hold ``'R_axis'`` and
      ``'T_axis'``) -> rows ``<name>_P``, ``<name>_R`` and ``<name>_T``;
    * any other entry -> one row ``<name>_<key>`` per key other than
      ``'unit'`` and ``'raw'``.

    Args:
        ecg_data: dict mapping measurement names to dicts of parsed values,
            optionally carrying ``'unit'`` and ``'raw'`` entries.

    Returns:
        pandas.DataFrame with the columns ``Measurement``, ``Value``, ``Unit``
        and ``Raw_Text``. The columns are present even when ``ecg_data`` is
        empty, so callers can always index them.

    Raises:
        KeyError: if an entry holds ``'P_axis'`` without ``'R_axis'`` or ``'T_axis'``.
    """
    columns = ['Measurement', 'Value', 'Unit', 'Raw_Text']
    rows = []

    for measurement_name, data in ecg_data.items():
        if 'value' in data:
            # Single value measurement
            named_values = [(measurement_name, data['value'])]
        elif 'QT' in data and 'QTc' in data:
            # QT/QTc has two values
            named_values = [
                (f'{measurement_name}_QT', data['QT']),
                (f'{measurement_name}_QTc', data['QTc']),
            ]
        elif 'P_axis' in data:
            # P-R-T axes have three values
            named_values = [
                (f'{measurement_name}_P', data['P_axis']),
                (f'{measurement_name}_R', data['R_axis']),
                (f'{measurement_name}_T', data['T_axis']),
            ]
        else:
            # Any other structure: one row per payload key
            named_values = [
                (f'{measurement_name}_{key}', value)
                for key, value in data.items()
                if key not in ('unit', 'raw')
            ]

        unit = data.get('unit', '')
        raw_text = data.get('raw', '')
        rows.extend(
            {'Measurement': name, 'Value': value, 'Unit': unit, 'Raw_Text': raw_text}
            for name, value in named_values
        )

    # Naming the columns explicitly keeps the schema stable when `rows` is
    # empty; without it the result would have no columns at all
    return pd.DataFrame(rows, columns=columns)

# Flatten the parsed grid data into a DataFrame and print it without the index
df_ecg = ecg_data_to_dataframe(ecg_data)

print(
    "\nECG Measurements DataFrame:",
    "=" * 80,
    df_ecg.to_string(index=False),
    sep="\n",
)

# Print usage hints for reading values straight from the structured dictionary
print(
    "\n\nStructured Dictionary for Easy Access:",
    "=" * 80,
    "You can access measurements like:",
    "  ecg_data['Ventricular_Rate']['value']",
    "  ecg_data['QT_QTc']['QT']",
    "  ecg_data['P_R_T_Axes']['P_axis']",
    sep="\n",
)

In [None]:
# Example: read individual measurements back out of the parsed results
print("Examples of accessing specific measurements:")
print("\n1. Using the structured dictionary:")

if 'Ventricular_Rate' in ecg_data:
    print(f"   Ventricular Rate: {ecg_data['Ventricular_Rate']['value']} {ecg_data['Ventricular_Rate']['unit']}")

if 'PR_Interval' in ecg_data:
    print(f"   PR Interval: {ecg_data['PR_Interval']['value']} {ecg_data['PR_Interval']['unit']}")

if 'QT_QTc' in ecg_data:
    print(f"   QT: {ecg_data['QT_QTc']['QT']} {ecg_data['QT_QTc']['unit']}")
    print(f"   QTc: {ecg_data['QT_QTc']['QTc']} {ecg_data['QT_QTc']['unit']}")

print("\n2. Using the DataFrame:")
# A DataFrame built from zero measurements may have no columns at all, in which
# case df_ecg['Measurement'] would raise KeyError; only query it when present
if 'Measurement' in df_ecg.columns:
    # Filter by exact measurement name
    pr_row = df_ecg[df_ecg['Measurement'] == 'PR_Interval']
    if not pr_row.empty:
        print(f"   PR Interval: {pr_row.iloc[0]['Value']} {pr_row.iloc[0]['Unit']}")

    # Select every measurement whose name contains "QT"
    qt_measurements = df_ecg[df_ecg['Measurement'].str.contains('QT', na=False)]
    print("\n   QT-related measurements:")
    print(qt_measurements[['Measurement', 'Value', 'Unit']].to_string(index=False))
else:
    print("   No measurements were extracted, so there is nothing to query.")