<a id='advanced'></a>
## 4. Advanced Implementation with Libraries

Now let's explore more sophisticated NER approaches using modern libraries like spaCy and NLTK. We'll compare different methods and analyze their strengths and weaknesses for financial text analysis.

In [6]:
# Import required libraries
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from spacy import displacy
import nltk
import scipy
from nltk import ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, Layout,widgets
from IPython.display import display, HTML, Markdown,clear_output
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import string
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('maxent_ne_chunker_tab')
  

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Set style for visualizations
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Custom CSS for highlighting entities
def apply_custom_styles():
    display(HTML("""
    <style>
    .entity-PERSON { background: #ffccd5; border-radius: 3px; padding: 0 3px; }
    .entity-ORG { background: #c3eeff; border-radius: 3px; padding: 0 3px; }
    .entity-GPE { background: #c1ffba; border-radius: 3px; padding: 0 3px; }
    .entity-LOC { background: #d6ffba; border-radius: 3px; padding: 0 3px; }
    .entity-MONEY { background: #ffe5a8; border-radius: 3px; padding: 0 3px; }
    .entity-TIME { background: #e5ceff; border-radius: 3px; padding: 0 3px; }
    .entity-DATE { background: #cecdff; border-radius: 3px; padding: 0 3px; }
    .entity-PERCENT { background: #bbffee; border-radius: 3px; padding: 0 3px; }
    .entity-CARDINAL { background: #eeedff; border-radius: 3px; padding: 0 3px; }
    .entity-TICKER { background: #ffbadd; border-radius: 3px; padding: 0 3px; }
    .entity-COMPANY { background: #c3eeff; border-radius: 3px; padding: 0 3px; }
    .entity-PRODUCT { background: #bbcefb; border-radius: 3px; padding: 0 3px; }
    .entity-QUANTITY { background: #e5ffbb; border-radius: 3px; padding: 0 3px; }
    </style>
    """))

apply_custom_styles()

# Sample financial texts to use throughout the notebook
sample_texts = {
    "earnings_report": "Apple Inc. (AAPL) reported Q2 earnings of $1.52 per share, beating estimates by $0.15. Revenue was $97.3 billion, up 9% year-over-year. CEO Tim Cook mentioned strong iPhone sales in emerging markets.",
    "financial_news": "Microsoft (MSFT) stock rose 3.2% to $245.67 after announcing plans to invest $10 billion in OpenAI on January 15, 2023. The tech giant expects the partnership to generate significant revenue by 2025.",
    "sec_filing": "According to the 10-K filing, Tesla Inc. (TSLA) increased R&D spending to $3.1 billion in 2022, representing 5.7% of the total revenue. The company plans to launch new products in Q3 2023.",
    "market_analysis": "The S&P 500 fell 1.2% yesterday, with energy stocks like Exxon Mobil (XOM) and Chevron (CVX) dropping over 3%. The Federal Reserve's decision to maintain interest rates at 5.25% influenced market sentiment.",
    "complex_example": "In Q2 2023, Amazon.com Inc. (AMZN) acquired Zoox for $1.2 billion while reporting earnings of $2.63 per share. The deal closed on August 12, 2023, when AMZN was trading at $135.28."
}

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


In [8]:
def extract_entities_spacy(text):
    """Extract named entities using spaCy"""
    doc = nlp(text)
    
    # Standard spaCy entities
    entities = {ent.text: ent.label_ for ent in doc.ents}
    
    # Additional financial entity extraction
    for token in doc:
        # Find potential tickers (uppercase words in parentheses)
        if token.text == "(" and token.i + 1 < len(doc) and token.i + 2 < len(doc):
            next_token = doc[token.i + 1]
            next_next_token = doc[token.i + 2]
            
            if next_token.is_upper and next_next_token.text == ")":
                if next_token.text not in entities:
                    entities[f"({next_token.text})"] = "TICKER"
    
    return entities

def extract_entities_nltk(text):
    """Extract named entities using NLTK"""
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    named_entities = nltk.ne_chunk(pos_tags)
    
    entities = {}
    
    # Process named entities
    for chunk in named_entities:
        if hasattr(chunk, 'label'):
            entity = " ".join([token for token, pos in chunk.leaves()])
            entities[entity] = chunk.label()
    
    # Apply additional rules for financial entities
    for i, (token, pos) in enumerate(pos_tags):
        # Find monetary values
        if token.startswith('$') and i + 1 < len(pos_tags):
            next_token = pos_tags[i + 1][0]
            if next_token.lower() in ['million', 'billion', 'trillion', 'm', 'b', 't']:
                entities[f"{token} {next_token}"] = "MONEY"
            else:
                entities[token] = "MONEY"
        
        # Find percentages
        if token.endswith('%') and pos == 'CD':
            entities[token] = "PERCENT"
        
        # Find potential tickers
        if token == "(" and i + 1 < len(pos_tags) and i + 2 < len(pos_tags):
            next_token = pos_tags[i + 1][0]
            next_pos = pos_tags[i + 1][1]
            next_next_token = pos_tags[i + 2][0]
            
            if next_token.isupper() and next_next_token == ")":
                entities[f"({next_token})"] = "TICKER"
    
    return entities

def extract_entities_regex(text):
    """Extract entities using our custom regex approach"""
    entities = {}
    
    # Extract ticker symbols
    ticker_pattern = r'\(([A-Z]{1,5})\)'
    for match in re.finditer(ticker_pattern, text):
        ticker = match.group(0)  # Get the full match with parentheses
        entities[ticker] = "TICKER"
    
    # Extract monetary values
    money_pattern = r'\$\d+(?:\.\d+)?(?:\s?(?:billion|million|thousand|B|M|K))?'
    for match in re.finditer(money_pattern, text):
        entities[match.group(0)] = "MONEY"
    
    # Extract percentages
    percent_pattern = r'\d+(?:\.\d+)?%'
    for match in re.finditer(percent_pattern, text):
        entities[match.group(0)] = "PERCENT"
    
    # Extract dates
    date_pattern = r'(?:Q[1-4]\s?(?:20)?\d{2})|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},?\s\d{4}'
    for match in re.finditer(date_pattern, text):
        entities[match.group(0)] = "DATE"
    
    # Extract company names
    company_pattern = r'([A-Z][a-zA-Z\.\s]+(?:Inc\.|Corp\.|Ltd\.|LLC|Group|Company))'
    for match in re.finditer(company_pattern, text):
        entities[match.group(0).strip()] = "COMPANY"
    
    # Extract person names
    person_pattern = r'(?:Mr\.|Mrs\.|Ms\.|Dr\.|CEO|CFO|CTO)\s([A-Z][a-z]+(?:\s[A-Z][a-z]+){1,2})'
    for match in re.finditer(person_pattern, text):
        # Get the person's name including the title
        entities[match.group(0).strip()] = "PERSON"
    
    return entities

def compare_ner_methods(text):
    """Compare different NER methods on the same text"""
    # Get entities from each method
    spacy_entities = extract_entities_spacy(text)
    nltk_entities = extract_entities_nltk(text)
    regex_entities = extract_entities_regex(text)
    
    # Count entities by type for each method
    spacy_counts = {}
    for entity, label in spacy_entities.items():
        spacy_counts[label] = spacy_counts.get(label, 0) + 1
    
    nltk_counts = {}
    for entity, label in nltk_entities.items():
        nltk_counts[label] = nltk_counts.get(label, 0) + 1
    
    regex_counts = {}
    for entity, label in regex_entities.items():
        regex_counts[label] = regex_counts.get(label, 0) + 1
    
    # Create DataFrames for comparison
    spacy_df = pd.DataFrame([(entity, label) for entity, label in spacy_entities.items()], 
                           columns=["Entity", "Type"])
    nltk_df = pd.DataFrame([(entity, label) for entity, label in nltk_entities.items()], 
                          columns=["Entity", "Type"])
    regex_df = pd.DataFrame([(entity, label) for entity, label in regex_entities.items()], 
                           columns=["Entity", "Type"])
    
    # Display original text
    display(HTML(f"<h3>Original Text:</h3><p>{text}</p>"))
    
    # Display entity counts
    display(HTML("<h3>Entity Counts by Method:</h3>"))
    
    # Create comparison table
    all_labels = sorted(list(set(list(spacy_counts.keys()) + list(nltk_counts.keys()) + list(regex_counts.keys()))))
    comparison_data = []
    
    for label in all_labels:
        comparison_data.append({
            "Entity Type": label,
            "spaCy": spacy_counts.get(label, 0),
            "NLTK": nltk_counts.get(label, 0),
            "Regex": regex_counts.get(label, 0)
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    display(comparison_df)
    
    # Create bar chart comparison
    plt.figure(figsize=(12, 6))
    
    bar_width = 0.25
    x = np.arange(len(all_labels))
    
    plt.bar(x - bar_width, [spacy_counts.get(label, 0) for label in all_labels], 
            width=bar_width, label='spaCy', color='skyblue')
    plt.bar(x, [nltk_counts.get(label, 0) for label in all_labels], 
            width=bar_width, label='NLTK', color='lightgreen')
    plt.bar(x + bar_width, [regex_counts.get(label, 0) for label in all_labels], 
            width=bar_width, label='Regex', color='salmon')
    
    plt.xlabel('Entity Types')
    plt.ylabel('Count')
    plt.title('Entity Count Comparison by Method')
    plt.xticks(x, all_labels, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # Display entities found by each method
    display(HTML("<h3>Entities Found by Method:</h3>"))
    
    # Highlight entities in text for each method
    spacy_highlighted = text
    for entity, label in sorted(spacy_entities.items(), key=lambda x: len(x[0]), reverse=True):
        spacy_highlighted = spacy_highlighted.replace(entity, f'<span class="entity-{label}">{entity}</span>')
    
    nltk_highlighted = text
    for entity, label in sorted(nltk_entities.items(), key=lambda x: len(x[0]), reverse=True):
        nltk_highlighted = nltk_highlighted.replace(entity, f'<span class="entity-{label}">{entity}</span>')
    
    regex_highlighted = text
    for entity, label in sorted(regex_entities.items(), key=lambda x: len(x[0]), reverse=True):
        regex_highlighted = regex_highlighted.replace(entity, f'<span class="entity-{label}">{entity}</span>')
    
    # Display highlighted text
    display(HTML("<h4>spaCy Entities:</h4>"))
    display(HTML(f"<p>{spacy_highlighted}</p>"))
    
    display(HTML("<h4>NLTK Entities:</h4>"))
    display(HTML(f"<p>{nltk_highlighted}</p>"))
    
    display(HTML("<h4>Regex Entities:</h4>"))
    display(HTML(f"<p>{regex_highlighted}</p>"))
    
    # Create tables of entities
    display(HTML("<h4>spaCy Entities:</h4>"))
    display(spacy_df)
    
    display(HTML("<h4>NLTK Entities:</h4>"))
    display(nltk_df)
    
    display(HTML("<h4>Regex Entities:</h4>"))
    display(regex_df)
    
    # Calculate performance metrics
    display(HTML("<h3>Performance Analysis:</h3>"))
    
    # Measure time performance
    import time
    
    start_time = time.time()
    extract_entities_spacy(text)
    spacy_time = time.time() - start_time
    
    start_time = time.time()
    extract_entities_nltk(text)
    nltk_time = time.time() - start_time
    
    start_time = time.time()
    extract_entities_regex(text)
    regex_time = time.time() - start_time
    
    # Display timing results
    timing_data = {
        "Method": ["spaCy", "NLTK", "Regex"],
        "Processing Time (s)": [spacy_time, nltk_time, regex_time]
    }
    timing_df = pd.DataFrame(timing_data)
    display(timing_df)
    
    # Plot timing comparison
    plt.figure(figsize=(8, 5))
    plt.bar(timing_df["Method"], timing_df["Processing Time (s)"], color=['skyblue', 'lightgreen', 'salmon'])
    plt.xlabel("Method")
    plt.ylabel("Processing Time (s)")
    plt.title("Processing Time Comparison")
    
    # Add timing labels on bars
    for i, time_val in enumerate(timing_df["Processing Time (s)"]):
        plt.text(i, time_val + 0.0001, f"{time_val:.6f}s", ha='center')
    
    plt.tight_layout()
    plt.show()
    
    # Display method comparison and recommendations
    display(HTML("""
    <h3>Method Comparison:</h3>
    <table style="width:100%; border-collapse: collapse;">
        <tr>
            <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Method</th>
            <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Strengths</th>
            <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Weaknesses</th>
            <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Best For</th>
        </tr>
        <tr>
            <td style="border: 1px solid #ddd; padding: 8px;"><b>spaCy</b></td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Pre-trained on diverse data<br>
                • Contextual understanding<br>
                • Handles complex language<br>
                • Supports custom training
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Not specialized for financial text<br>
                • May miss domain-specific entities<br>
                • More resource-intensive
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • General-purpose NER<br>
                • Projects requiring context<br>
                • Production environments
            </td>
        </tr>
        <tr>
            <td style="border: 1px solid #ddd; padding: 8px;"><b>NLTK</b></td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Well-established library<br>
                • Good for academic use<br>
                • Flexible rule creation
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Less contextual understanding<br>
                • Limited entity types<br>
                • Requires more custom rules
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Academic research<br>
                • Basic NER tasks<br>
                • Educational purposes
            </td>
        </tr>
        <tr>
            <td style="border: 1px solid #ddd; padding: 8px;"><b>Regex</b></td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Highly customizable<br>
                • Domain-specific patterns<br>
                • Fastest performance<br>
                • No external dependencies
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • No contextual understanding<br>
                • Prone to false positives<br>
                • Requires manual pattern creation<br>
                • Hard to maintain as patterns grow
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                • Financial-specific entities<br>
                • Simple document processing<br>
                • Pattern-based extraction
            </td>
        </tr>
    </table>
    
    <h3>Recommendations for Financial NER:</h3>
    <ol>
        <li><b>Hybrid Approach:</b> Combine spaCy for general entities with custom regex patterns for financial-specific entities</li>
        <li><b>Fine-tuned Models:</b> Consider fine-tuning spaCy on financial documents for better domain-specific performance</li>
        <li><b>Entity Verification:</b> Implement post-processing to validate extracted entities against financial knowledge bases</li>
        <li><b>Context Analysis:</b> Use surrounding text to disambiguate entities with multiple meanings</li>
    </ol>
    """))

# Create interactive comparison
def interactive_ner_comparison(text):
    compare_ner_methods(text)

# Create interactive widget
text_input = widgets.Textarea(
    value=sample_texts["earnings_report"],
    placeholder='Enter financial text...',
    description='Text:',
    layout=Layout(width='90%', height='100px')
)

examples = widgets.Dropdown(
    options=list(sample_texts.keys()),
    value='earnings_report',
    description='Example:',
    layout=Layout(width='50%')
)

# Function to update text when dropdown changes
def update_text(change):
    text_input.value = sample_texts[change['new']]

# Register callback for dropdown
examples.observe(update_text, names='value')

# Display widgets
display(examples)
display(text_input)

# Use interact for the comparison
interact(interactive_ner_comparison, text=text_input);

Dropdown(description='Example:', layout=Layout(width='50%'), options=('earnings_report', 'financial_news', 'se…

Textarea(value='Apple Inc. (AAPL) reported Q2 earnings of $1.52 per share, beating estimates by $0.15. Revenue…

interactive(children=(Textarea(value='Apple Inc. (AAPL) reported Q2 earnings of $1.52 per share, beating estim…

<a id='visualization'></a>
## 5. Data Flow Visualization

Let's visualize how text flows through the NER pipeline, from raw input to structured entities. This will help us understand each processing stage and how different components interact.

In [9]:
def visualize_ner_pipeline(text):
    """
    Visualize the NER pipeline and data flow for financial text analysis
    """
    # Step 1: Initialize pipeline stages
    stages = [
        "Raw Text Input",
        "Text Preprocessing",
        "Tokenization",
        "Entity Recognition",
        "Entity Classification",
        "Structured Output"
    ]
    
    # Step 2: Process text through each stage
    # Raw text is the input
    raw_text = text
    
    # Preprocessing (lowercase everything except potential entities, remove extra spaces)
    def preprocess_text(text):
        # Keep uppercase for potential entities
        processed = []
        for token in text.split():
            # Don't lowercase potential tickers, companies, etc.
            if re.match(r'\([A-Z]+\)', token) or re.match(r'^[A-Z][a-z]+$', token) or token.startswith('$'):
                processed.append(token)
            else:
                processed.append(token)
        return ' '.join(processed)
    
    preprocessed_text = preprocess_text(raw_text)
    
    # Tokenization
    tokens = nltk.word_tokenize(preprocessed_text)
    
    # Entity Recognition (using spaCy)
    doc = nlp(preprocessed_text)
    
    # Get entities from spaCy
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    # Add custom financial entities
    custom_entities = []
    
    # Extract tickers
    ticker_pattern = r'\(([A-Z]{1,5})\)'
    for match in re.finditer(ticker_pattern, preprocessed_text):
        ticker = match.group(0)  # Get the full match with parentheses
        if not any(ticker == e[0] for e in entities + custom_entities):
            custom_entities.append((ticker, "TICKER"))
    
    # Extract monetary values
    money_pattern = r'\$\d+(?:\.\d+)?(?:\s?(?:billion|million|thousand|B|M|K))?'
    for match in re.finditer(money_pattern, preprocessed_text):
        money = match.group(0)
        if not any(money == e[0] for e in entities + custom_entities):
            custom_entities.append((money, "MONEY"))
    
    # Extract percentages
    percent_pattern = r'\d+(?:\.\d+)?%'
    for match in re.finditer(percent_pattern, preprocessed_text):
        percent = match.group(0)
        if not any(percent == e[0] for e in entities + custom_entities):
            custom_entities.append((percent, "PERCENT"))
    
    # Combine spaCy and custom entities
    all_entities = entities + custom_entities
    
    # Entity Classification (organize by type)
    entity_by_type = {}
    for entity, label in all_entities:
        if label not in entity_by_type:
            entity_by_type[label] = []
        entity_by_type[label].append(entity)
    
    # Create structured output (JSON-like)
    structured_output = {
        "text": raw_text,
        "entities": entity_by_type
    }
    
    # Step 3: Visualize the pipeline
    display(HTML("<h3>NER Pipeline Visualization</h3>"))
    
    # Create pipeline diagram
    import networkx as nx
    
    G = nx.DiGraph()
    
    # Add nodes for stages
    for i, stage in enumerate(stages):
        G.add_node(stage, pos=(i, 0))
    
    # Add edges between stages
    for i in range(len(stages) - 1):
        G.add_edge(stages[i], stages[i + 1])
    
    plt.figure(figsize=(14, 3))
    pos = nx.get_node_attributes(G, 'pos')
    nx.draw(G, pos, with_labels=True, node_color='lightblue', 
            node_size=3000, font_size=10, font_weight='bold', 
            arrows=True, arrowsize=20)
    plt.title("NER Pipeline Flow")
    plt.tight_layout()
    plt.show()
    
    # Visualize content at each stage
    display(HTML("<h3>Data Flow Through Pipeline Stages</h3>"))
    
    # Stage 1: Raw Text
    display(HTML(f"<h4>Stage 1: {stages[0]}</h4>"))
    display(HTML(f"<p>{raw_text}</p>"))
    
    # Stage 2: Preprocessing
    display(HTML(f"<h4>Stage 2: {stages[1]}</h4>"))
    display(HTML(f"<p>{preprocessed_text}</p>"))
    
    # Stage 3: Tokenization
    display(HTML(f"<h4>Stage 3: {stages[2]}</h4>"))
    tokenized_html = "<div style='line-height: 2.5;'>"
    for token in tokens:
        tokenized_html += f'<span style="border: 1px solid #ccc; border-radius: 3px; padding: 3px; margin: 2px;">{token}</span> '
    tokenized_html += "</div>"
    display(HTML(tokenized_html))
    
    # Stage 4: Entity Recognition
    display(HTML(f"<h4>Stage 4: {stages[3]}</h4>"))
    # Create color mapping for entity types
    entity_colors = {
        "PERSON": "#ffccd5",
        "ORG": "#c3eeff",
        "GPE": "#c1ffba",
        "LOC": "#d6ffba",
        "MONEY": "#ffe5a8",
        "TIME": "#e5ceff",
        "DATE": "#cecdff",
        "PERCENT": "#bbffee",
        "CARDINAL": "#eeedff",
        "TICKER": "#ffbadd",
        "PRODUCT": "#bbcefb",
        "QUANTITY": "#e5ffbb"
    }
    
    # Highlight entities in text
    entity_recognition_html = raw_text
    for entity, label in sorted(all_entities, key=lambda x: len(x[0]), reverse=True):
        color = entity_colors.get(label, "#cccccc")
        entity_recognition_html = entity_recognition_html.replace(
            entity, 
            f'<span style="background-color: {color}; border-radius: 3px; padding: 2px;">{entity}</span>'
        )
    
    display(HTML(f"<p>{entity_recognition_html}</p>"))
    
    # Stage 5: Entity Classification
    display(HTML(f"<h4>Stage 5: {stages[4]}</h4>"))
    
    # Create a table to show entity classification
    classification_html = """
    <table style="width: 80%; border-collapse: collapse;">
      <tr>
        <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Entity Type</th>
        <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Entities</th>
      </tr>
    """
    
    for label, entities in entity_by_type.items():
        color = entity_colors.get(label, "#cccccc")
        entities_html = ", ".join([
            f'<span style="background-color: {color}; border-radius: 3px; padding: 2px;">{entity}</span>'
            for entity in entities
        ])
        
        classification_html += f"""
        <tr>
          <td style="border: 1px solid #ddd; padding: 8px;">{label}</td>
          <td style="border: 1px solid #ddd; padding: 8px;">{entities_html}</td>
        </tr>
        """
    
    classification_html += "</table>"
    display(HTML(classification_html))
    
    # Stage 6: Structured Output
    display(HTML(f"<h4>Stage 6: {stages[5]}</h4>"))
    
    # Display structured output
    from IPython.display import JSON
    display(JSON(structured_output))
    
    # Visualize entity distribution
    display(HTML("<h3>Entity Distribution</h3>"))
    
    # Create a bar chart of entity counts by type
    entity_counts = {label: len(entities) for label, entities in entity_by_type.items()}
    
    plt.figure(figsize=(10, 5))
    bars = plt.bar(entity_counts.keys(), entity_counts.values(), color=[entity_colors.get(label, "#cccccc") for label in entity_counts.keys()])
    plt.title("Entity Counts by Type")
    plt.xlabel("Entity Type")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    
    # Add count labels on bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.1, str(int(height)), 
                ha='center', va='bottom')
    
    plt.tight_layout()
    plt.show()
    
    # Create a Sankey diagram to show entity flow
    try:
        import plotly.graph_objects as go
        from plotly.subplots import make_subplots
        
        # Set up node labels and values for Sankey diagram
        label = ["Raw Text"]
        for stage in stages[1:]:
            label.append(stage)
        
        for entity_type, entities in entity_by_type.items():
            for entity in entities:
                label.append(f"{entity_type}: {entity}")
        
        # Create source, target pairs
        source = []
        target = []
        value = []
        
        # Connect stages
        for i in range(len(stages) - 1):
            source.append(i)
            target.append(i + 1)
            value.append(10)  # Constant width for main pipeline flow
        
        # Connect entities to Entity Classification
        entity_start_idx = len(stages)
        for entity_type, entities in entity_by_type.items():
            for entity in entities:
                # Connect Entity Recognition to this entity
                source.append(3)  # Entity Recognition stage
                target.append(entity_start_idx)
                value.append(1)  # Entity width
                
                # Connect this entity to Entity Classification
                source.append(entity_start_idx)
                target.append(4)  # Entity Classification stage
                value.append(1)  # Entity width
                
                entity_start_idx += 1
        
        # Create Sankey diagram
        fig = go.Figure(data=[go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0.5),
                label=label
            ),
            link=dict(
                source=source,
                target=target,
                value=value
            )
        )])
        
        fig.update_layout(
            title_text="NER Data Flow",
            font_size=10,
            height=600
        )
        
        fig.show()
    except Exception as e:
        display(HTML(f"<p>Sankey diagram could not be displayed: {e}</p>"))
        # Fallback visualization
        display(HTML("<p>Simple data flow visualization:</p>"))
        # Add a simple alternative visualization here

# Create interactive widgets
examples = widgets.Dropdown(
    options=list(sample_texts.keys()),
    value='earnings_report',
    description='Example:',
    layout=Layout(width='50%')
)

text_input = widgets.Textarea(
    value=sample_texts['earnings_report'],
    placeholder='Enter financial text...',
    description='Text:',
    layout=Layout(width='90%', height='100px')
)

# Function to update text when dropdown changes
def update_text(change):
    text_input.value = sample_texts[change['new']]

# Register callback for dropdown
examples.observe(update_text, names='value')

# Display widgets
display(examples)
display(text_input)

# Use interact for the visualization
interact(visualize_ner_pipeline, text=text_input);

Dropdown(description='Example:', layout=Layout(width='50%'), options=('earnings_report', 'financial_news', 'se…

Textarea(value='Apple Inc. (AAPL) reported Q2 earnings of $1.52 per share, beating estimates by $0.15. Revenue…

interactive(children=(Textarea(value='Apple Inc. (AAPL) reported Q2 earnings of $1.52 per share, beating estim…

<a id='user-interaction'></a>
## 6. User Interaction & Visualization

This section focuses on creating interactive visualizations that allow users to explore named entities in financial documents. We'll implement real-time entity highlighting, entity relationship exploration, and parameter adjustments to enhance the analysis experience.

In [10]:
def create_entity_highlighter(text):
    """
    Create an interactive entity highlighter for financial text
    """
    # Process text with spaCy
    doc = nlp(text)
    
    # Extract entities using multiple methods
    spacy_entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    # Add custom financial entities
    financial_entities = []
    
    # Extract ticker symbols
    ticker_pattern = r'\(([A-Z]{1,5})\)'
    for match in re.finditer(ticker_pattern, text):
        ticker = match.group(0)  # Full match with parentheses
        financial_entities.append((ticker, 'TICKER'))
    
    # Extract monetary values
    money_pattern = r'\$\d+(?:\.\d+)?(?:\s?(?:billion|million|thousand|B|M|K))?'
    for match in re.finditer(money_pattern, text):
        financial_entities.append((match.group(0), 'MONEY'))
    
    # Extract percentages
    percent_pattern = r'\d+(?:\.\d+)?%'
    for match in re.finditer(percent_pattern, text):
        financial_entities.append((match.group(0), 'PERCENT'))
    
    # Combine all entities and remove duplicates
    all_entities = []
    seen_entities = set()
    
    for entity, label in spacy_entities + financial_entities:
        if entity not in seen_entities:
            all_entities.append((entity, label))
            seen_entities.add(entity)
    
    # Group entities by type
    entity_types = sorted(list(set(label for _, label in all_entities)))
    
    # Create checkboxes for each entity type
    type_checkboxes = [widgets.Checkbox(
        value=True,
        description=f"{entity_type}",
        disabled=False
    ) for entity_type in entity_types]
    
    # Create a checkbox for showing all entities
    all_checkbox = widgets.Checkbox(
        value=True,
        description='Show All',
        disabled=False,
        indent=False
    )
    
    # Create a layout for the checkboxes
    checkbox_layout = widgets.Layout(
        display='flex',
        flex_flow='row wrap',
        align_items='stretch',
        width='90%'
    )
    
    checkbox_container = widgets.Box(
        children=[all_checkbox] + type_checkboxes,
        layout=checkbox_layout
    )
    
    # Create a function to update the highlighted text
    def update_highlighted_text(*args):
        # If "Show All" is checked, check all entity types
        if all_checkbox.value:
            for checkbox in type_checkboxes:
                checkbox.value = True
        
        # Get selected entity types
        selected_types = [entity_types[i] for i, checkbox in enumerate(type_checkboxes) if checkbox.value]
        
        # Filter entities by selected types
        filtered_entities = [entity for entity, label in all_entities if label in selected_types]
        
        # Create highlighted HTML
        highlighted_html = text
        
        # Sort entities by length in descending order to avoid replacement issues
        for entity, label in sorted(all_entities, key=lambda x: len(x[0]), reverse=True):
            if label in selected_types:
                highlighted_html = highlighted_html.replace(
                    entity, 
                    f'<span class="entity-{label}">{entity}</span>'
                )
        
        # Display highlighted text
        display(HTML("<h3>Highlighted Entities:</h3>"))
        display(HTML(f"<p>{highlighted_html}</p>"))
        
        # Create entity counts table
        entity_counts = {}
        for entity, label in all_entities:
            if label in selected_types:
                entity_counts[label] = entity_counts.get(label, 0) + 1
        
        display(HTML("<h3>Entity Counts:</h3>"))
        if entity_counts:
            counts_html = """
            <table style="width:50%; border-collapse: collapse;">
              <tr>
                <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Entity Type</th>
                <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Count</th>
              </tr>
            """
            
            for label, count in entity_counts.items():
                counts_html += f"""
                <tr>
                  <td style="border: 1px solid #ddd; padding: 8px;">{label}</td>
                  <td style="border: 1px solid #ddd; padding: 8px;">{count}</td>
                </tr>
                """
            
            counts_html += "</table>"
            display(HTML(counts_html))
            
            # Create bar chart of entity counts
            plt.figure(figsize=(10, 5))
            plt.bar(entity_counts.keys(), entity_counts.values())
            plt.title("Entity Counts by Type")
            plt.xlabel("Entity Type")
            plt.ylabel("Count")
            plt.xticks(rotation=45)
            for i, (label, count) in enumerate(entity_counts.items()):
                plt.text(i, count + 0.1, str(count), ha='center')
            plt.tight_layout()
            plt.show()
        else:
            display(HTML("<p>No entities selected.</p>"))
    
    # Register callback for all checkboxes
    all_checkbox.observe(update_highlighted_text, names='value')
    for checkbox in type_checkboxes:
        checkbox.observe(update_highlighted_text, names='value')
    
    # Display widgets
    display(HTML("<h3>Select Entity Types to Highlight:</h3>"))
    display(checkbox_container)
    
    # Initialize display
    update_highlighted_text()
    
    # Return the widgets for further use
    return {
        'all_checkbox': all_checkbox,
        'type_checkboxes': type_checkboxes,
        'entity_types': entity_types
    }

def visualize_entity_relationships(text):
    """
    Visualize relationships between entities in financial text
    """
    # Process text with spaCy
    doc = nlp(text)
    
    # Extract entities and their positions
    entities = []
    for ent in doc.ents:
        entities.append({
            'text': ent.text,
            'label': ent.label_,
            'start': ent.start_char,
            'end': ent.end_char
        })
    
    # Add custom financial entities
    # Extract ticker symbols
    ticker_pattern = r'\(([A-Z]{1,5})\)'
    for match in re.finditer(ticker_pattern, text):
        entities.append({
            'text': match.group(0),
            'label': 'TICKER',
            'start': match.start(),
            'end': match.end()
        })
    
    # Extract monetary values
    money_pattern = r'\$\d+(?:\.\d+)?(?:\s?(?:billion|million|thousand|B|M|K))?'
    for match in re.finditer(money_pattern, text):
        entities.append({
            'text': match.group(0),
            'label': 'MONEY',
            'start': match.start(),
            'end': match.end()
        })
    
    # Extract percentages
    percent_pattern = r'\d+(?:\.\d+)?%'
    for match in re.finditer(percent_pattern, text):
        entities.append({
            'text': match.group(0),
            'label': 'PERCENT',
            'start': match.start(),
            'end': match.end()
        })
    
    # Remove duplicate entities
    unique_entities = []
    seen_spans = set()
    
    for entity in entities:
        span = (entity['start'], entity['end'])
        if span not in seen_spans:
            unique_entities.append(entity)
            seen_spans.add(span)
    
    # Sort entities by position
    unique_entities.sort(key=lambda x: x['start'])
    
    # Create dependency visualization
    display(HTML("<h3>Entity Relationships:</h3>"))
    
    # Calculate proximity between entities
    relationships = []
    
    for i, entity1 in enumerate(unique_entities):
        for j, entity2 in enumerate(unique_entities):
            if i < j:  # Only consider each pair once
                # Calculate token distance
                distance = abs(entity1['start'] - entity2['start'])
                
                # Entities within certain character distance are considered related
                if distance < 50:  # Adjustable threshold
                    relationships.append({
                        'source': entity1['text'],
                        'source_type': entity1['label'],
                        'target': entity2['text'],
                        'target_type': entity2['label'],
                        'weight': 1 / (distance + 1)  # Weight inversely proportional to distance
                    })
    
    # Create a network graph of entity relationships
    import networkx as nx
    
    G = nx.Graph()
    
    # Add nodes for each entity
    for entity in unique_entities:
        G.add_node(entity['text'], type=entity['label'])
    
    # Add edges for relationships
    for relation in relationships:
        G.add_edge(
            relation['source'], 
            relation['target'], 
            weight=relation['weight']
        )
    
    # Display the network graph
    plt.figure(figsize=(12, 8))
    
    # Create a color map for entity types
    entity_types = list(set(entity['label'] for entity in unique_entities))
    color_map = plt.cm.get_cmap('tab10', len(entity_types))
    type_to_color = {entity_type: color_map(i) for i, entity_type in enumerate(entity_types)}
    
    node_colors = [type_to_color[G.nodes[node]['type']] for node in G.nodes()]
    
    # Create layout
    pos = nx.spring_layout(G, k=0.3)
    
    # Draw nodes
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=700, alpha=0.8)
    
    # Draw edges with width proportional to weight
    edge_weights = [G[u][v]['weight'] * 3 for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.3)
    
    # Draw node labels
    nx.draw_networkx_labels(G, pos, font_size=8, font_family='sans-serif')
    
    # Add legend
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, 
                                 label=entity_type, markersize=10) 
                      for entity_type, color in type_to_color.items()]
    plt.legend(handles=legend_elements, title="Entity Types")
    
    plt.title("Entity Relationship Network")
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    
    # Create an adjacency matrix visualization
    if len(unique_entities) > 1:
        # Get the adjacency matrix
        A = nx.adjacency_matrix(G).todense()
        
        # Create labels for the matrix
        labels = [entity['text'] for entity in unique_entities]
        
        # Plot the adjacency matrix
        plt.figure(figsize=(10, 8))
        plt.imshow(A, cmap='Blues')
        plt.colorbar(label='Connection Strength')
        plt.title("Entity Adjacency Matrix")
        
        # Add labels
        plt.xticks(range(len(labels)), labels, rotation=90)
        plt.yticks(range(len(labels)), labels)
        
        plt.tight_layout()
        plt.show()
    
    # Create a co-occurrence table
    display(HTML("<h3>Entity Co-occurrence:</h3>"))
    
    if relationships:
        # Create a DataFrame of relationships
        relationship_df = pd.DataFrame(relationships)
        
        # Display the table
        display(relationship_df)
        
        # Create a heatmap of entity co-occurrence
        pivot_table = pd.crosstab(
            relationship_df['source_type'], 
            relationship_df['target_type']
        )
        
        plt.figure(figsize=(8, 6))
        sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='d')
        plt.title("Entity Type Co-occurrence")
        plt.tight_layout()
        plt.show()
    else:
        display(HTML("<p>No significant entity relationships detected.</p>"))

def create_interactive_document_analyzer():
    """
    Create an interactive document analyzer with real-time entity extraction and visualization
    """
    # Create tabs for different visualizations
    tab1 = widgets.Output()  # Entity Highlighting
    tab2 = widgets.Output()  # Entity Relationships
    
    # Create tabs widget
    tabs = widgets.Tab(children=[tab1, tab2])
    tabs.set_title(0, 'Entity Highlighting')
    tabs.set_title(1, 'Entity Relationships')
    
    # Create text input widget
    text_input = widgets.Textarea(
        value=sample_texts['earnings_report'],
        placeholder='Enter financial text...',
        description='Text:',
        layout=Layout(width='90%', height='100px')
    )
    
    # Create example dropdown
    examples = widgets.Dropdown(
        options=list(sample_texts.keys()),
        value='earnings_report',
        description='Example:',
        layout=Layout(width='50%')
    )
    
    # Function to update text when dropdown changes
    def update_text(change):
        text_input.value = sample_texts[change['new']]
        update_tabs()
    
    # Register callback for dropdown
    examples.observe(update_text, names='value')
    
    # Function to update tab content
    def update_tabs():
        with tab1:
            clear_output()
            create_entity_highlighter(text_input.value)
        
        with tab2:
            clear_output()
            visualize_entity_relationships(text_input.value)
    
    # Function to handle text changes
    def on_text_change(change):
        update_tabs()
    
    # Register callback for text changes
    text_input.observe(on_text_change, names='value')
    
    # Display widgets
    display(examples)
    display(text_input)
    display(tabs)
    
    # Initialize tabs
    update_tabs()

# Create the interactive document analyzer
create_interactive_document_analyzer()

Dropdown(description='Example:', layout=Layout(width='50%'), options=('earnings_report', 'financial_news', 'se…

Textarea(value='Apple Inc. (AAPL) reported Q2 earnings of $1.52 per share, beating estimates by $0.15. Revenue…

Tab(children=(Output(), Output()), selected_index=0, titles=('Entity Highlighting', 'Entity Relationships'))