# Named Entity Recognition Table Extraction

* Train a named entity recognition (NER) model (this requires creating labeled training data first)
* Apply the trained NER model to label entities in the documents
* Extract the Exhibit 21 (Ex. 21) entities
* Structure the extracted entities into a DataFrame

In [None]:
!pip install spacy

In [None]:
# Download spacy model
!python -m spacy download en_core_web_sm

In [12]:
import spacy
import pandas as pd
from bs4 import BeautifulSoup as bs

In [5]:
# Load the en_core_web_sm pipeline once; extract_entities() reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [6]:
def extract_entities(text):
    """Run the shared ``nlp`` pipeline over ``text`` and collect its named entities.

    Args:
        text: Raw string to analyze.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in
        ``doc.ents``, in the order the pipeline reports them.
    """
    parsed = nlp(text)
    return [(span.text, span.label_) for span in parsed.ents]

def extract_data_into_dataframe(text):
    """Tabulate the named entities found in ``text``.

    Args:
        text: Raw string handed to ``extract_entities``.

    Returns:
        A ``pandas.DataFrame`` with one row per entity and two columns:
        ``Text`` (the entity string) and ``Entity`` (its label).
    """
    return pd.DataFrame(extract_entities(text), columns=['Text', 'Entity'])


In [9]:
# Smoke-test the helpers on a hand-written sentence before pointing them at real filings.
example_text = (
    "Katie lives in Cupertino, California. "
    "Her subsidiaries are Pet Dog Company and she owns 100%."
)
df = extract_data_into_dataframe(example_text)
print(df)

              Text   Entity
0            Katie   PERSON
1        Cupertino      GPE
2       California      GPE
3  Pet Dog Company      ORG
4             100%  PERCENT


In [24]:
def get_entities_from_html(filename, keywords, tag_type="body"):
    """Extract named entities from every ``tag_type`` element that mentions a keyword.

    Args:
        filename: Path to the HTML file to parse.
        keywords: Either a single string or an iterable of strings. An element
            matches when at least one keyword occurs (case-sensitively) as a
            substring of the element's text.
        tag_type: Name of the HTML tag to search, e.g. ``"body"`` or ``"table"``.

    Returns:
        A list with one entity DataFrame (as built by
        ``extract_data_into_dataframe``) per matching element, in the order
        ``find_all`` returns them. The list is empty when nothing matches.

    Note:
        ``find_all`` searches recursively, so when matching elements are nested
        (for example a table inside a table) each of them is returned and the
        inner text is processed more than once.
    """
    # A bare string is one keyword; do not iterate it character by character.
    if isinstance(keywords, str):
        needles = (keywords,)
    else:
        needles = tuple(keywords)

    # Read raw bytes so BeautifulSoup detects the document's encoding itself
    # instead of relying on the platform's default text encoding.
    with open(filename, 'rb') as file:
        soup = bs(file.read(), 'html.parser')

    dfs = []
    for element in soup.find_all(tag_type):
        # Fetch the text once and reuse it for both the keyword test and NER.
        text = element.get_text()
        if any(needle in text for needle in needles):
            dfs.append(extract_data_into_dataframe(text))
    return dfs

In [18]:
# No tag_type is passed, so the default "body" element is searched for the heading text.
get_entities_from_html(
    filename="torotel.html",
    keywords="SUBSIDIARIES OF THE REGISTRANT",
)

[          Text    Entity
 0           21  CARDINAL
 1   Subsidiary    PERSON
 2     Missouri       GPE
 3       Nevada       GPE
 4   Subsidiary    PERSON
 5  Electronika       GPE
 6       Kansas       GPE
 7       Kansas       GPE
 8       Kansas       GPE]

In [25]:
# Restrict the search to <table> elements whose text contains "Subsidiary".
get_entities_from_html(
    filename="wisconsin_electric.html",
    keywords="Subsidiary",
    tag_type="table",
)

[                                                Text   Entity
 0  Organization\n\nOwnership\n\n \n \n \n\n\nATC ...      ORG
 1                                          Wisconsin      GPE
 2                                            42.482%  PERCENT
 3                      American Transmission Company      ORG
 4                                          Wisconsin      GPE
 5                                            37.295%  PERCENT]

In [27]:
# Restrict the search to <table> elements whose text contains "Percentage".
get_entities_from_html(
    filename="trans_lux_corp.html",
    keywords="Percentage",
    tag_type="table",
)

[                                 Text    Entity
 0                              Canada       GPE
 1                                100%   PERCENT
 2    Trans-Lux Commercial Corporation       ORG
 3                                Utah       GPE
 4                                 100  CARDINAL
 5       Trans-Lux Display Corporation       ORG
 6                            Delaware       GPE
 7                                 100  CARDINAL
 8    Trans-Lux Experience Corporation       ORG
 9                            New York       GPE
 10                                100  CARDINAL
 11       Trans-Lux Energy Corporation       ORG
 12                        Connecticut       GPE
 13                                100  CARDINAL
 14      Trans-Lux Midwest Corporation       ORG
 15                               Iowa       GPE
 16                                100  CARDINAL
 17      Trans-Lux Seaport Corporation       ORG
 18                           New York       GPE
 19                 

In [29]:
# Restrict the search to <table> elements whose text contains "SUBSIDIARIES".
get_entities_from_html(
    filename="isle_of_capri.html",
    keywords="SUBSIDIARIES",
    tag_type="table",
)

[                                    Text  Entity
 0                                 WHOLLY     GPE
 1                    Black Hawk Holdings     ORG
 2                                 L.L.C.     GPE
 3                               Colorado     GPE
 4                Capri Insurance Company     ORG
 5                                 Hawaii     GPE
 6                               Colorado     GPE
 7                               Colorado     GPE
 8                               Colorado     GPE
 9                                    LLC     ORG
 10                              Colorado     GPE
 11                                Nevada     GPE
 12                    IOC-Caruthersville     ORG
 13                                 L.L.C     ORG
 14                              Missouri     GPE
 15                       IOC-Kansas City     GPE
 16                              Missouri     GPE
 17                              IOC-Lula     ORG
 18                           IOC-Natchez     GPE
