# Named Entity Recognition Table Extraction

* Train a named entity recognition (NER) model (requires creating labeled training data first)
* Apply trained NER to label entities in docs
* Extract the Ex. 21 entities
* Structure the extracted entities into a pandas DataFrame

In [None]:
!pip install spacy

In [None]:
# Download spacy model
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import pandas as pd
from bs4 import BeautifulSoup as bs

In [None]:
# Load the spaCy pipeline downloaded above; extract_entities() calls this
# module-level `nlp` object and reads the entities it reports in doc.ents.
nlp = spacy.load('en_core_web_sm')

In [None]:
def extract_entities(text):
    """Run the loaded spaCy pipeline over ``text`` and collect its entities.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per span in
    ``doc.ents``, in the order spaCy reports them.
    """
    parsed = nlp(text)
    return [(span.text, span.label_) for span in parsed.ents]

def extract_data_into_dataframe(text):
    """Tabulate the named entities found in ``text``.

    Returns a DataFrame with one row per entity and two columns: ``Text``
    (the entity string) and ``Entity`` (its label).
    """
    return pd.DataFrame(extract_entities(text), columns=['Text', 'Entity'])


In [None]:
# Smoke-test the extraction on a hand-written sentence that mentions a person,
# a place, a company name and a percentage, then print the resulting table.
example_text = "Katie lives in Cupertino, California. Her subsidiaries are Pet Dog Company and she owns 100%."
df = extract_data_into_dataframe(example_text)
print(df)

In [None]:
def get_entities_from_html(filename, keywords, tag_type="body"):
    """Extract named entities from the HTML elements that mention ``keywords``.

    Args:
        filename: Path of the HTML file to parse.
        keywords: Substring that must occur (case-sensitively) in an
            element's text for that element to be processed.
        tag_type: Name of the tag to inspect, e.g. ``"body"`` or ``"table"``.

    Returns:
        A list of DataFrames, one per matching element in document order,
        each produced by ``extract_data_into_dataframe`` from the element's
        text. The list is empty when no element matches.
    """
    # Read raw bytes rather than text: decoding with the platform's default
    # encoding can fail or garble files saved in another encoding, whereas
    # BeautifulSoup detects the document's encoding itself when given bytes.
    with open(filename, 'rb') as file:
        html_content = file.read()
    soup = bs(html_content, 'html.parser')

    dfs = []
    for element in soup.find_all(lambda tag: tag.name == tag_type):
        # Compute the text once and reuse it for both the keyword check and
        # the entity extraction.
        text = element.get_text()
        if keywords in text:
            dfs.append(extract_data_into_dataframe(text))
    return dfs

In [None]:
# torotel.html: tag_type is left at its default ("body"), so the text of the
# <body> element is checked for this exact, case-sensitive heading.
get_entities_from_html(filename="torotel.html", keywords="SUBSIDIARIES OF THE REGISTRANT")

In [None]:
# wisconsin_electric.html: only <table> elements whose text contains
# "Subsidiary" (case-sensitive) are processed.
get_entities_from_html(filename="wisconsin_electric.html", keywords="Subsidiary", tag_type="table")

In [None]:
# trans_lux_corp.html: only <table> elements whose text contains
# "Percentage" (case-sensitive) are processed.
get_entities_from_html(filename="trans_lux_corp.html", keywords="Percentage", tag_type="table")

In [None]:
# isle_of_capri.html: only <table> elements whose text contains
# "SUBSIDIARIES" (case-sensitive) are processed.
get_entities_from_html(filename="isle_of_capri.html", keywords="SUBSIDIARIES", tag_type="table")