In [1]:
# HTTP
import requests

# HTML/XML parsing
from bs4 import BeautifulSoup, Tag, NavigableString

# Text normalization
import unicodedata

# Parsing Notes and Text from 10-Q's and 10-K's
10-Q's generally follow structure given at: https://www.sec.gov/files/form10-q.pdf

```
PART I - FINANCIAL INFORMATION
  Item 1. Financial Statements
  Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations
  Item 3. Quantitative and Qualitative Disclosures about Market Risk
  Item 4. Controls and Procedures


PART II - OTHER INFORMATION
  Item 1. Legal Proceedings
  Item 1A. Risk Factors
...
```

Similarly, 10-K's follow the structure given at: https://www.sec.gov/files/form10-k.pdf

```
PART I
  Item 1. Business.
  Item 1A. Risk Factors.
  Item 1B. Unresolved Staff Comments.
  Item 2. Properties.
  Item 3. Legal Proceedings.
  Item 4. Mine Safety Disclosures.

PART II
  Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases
  of Equity Securities.
  Item 6. [Reserved]
  Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations.
  Item 7A. Quantitative and Qualitative Disclosures About Market Risk.
  Item 8. Financial Statements and Supplementary Data.
  Item 9. Changes in and Disagreements With Accountants on Accounting and Financial Disclosure.
  Item 9A. Controls and Procedures.
  Item 9B. Other Information.
  Item 9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections.
...
```
Both types of filings can contain what are referred to in the XBRL view of the filing as "notes to the financial statements". These are hinted at by the exact name of Item 8 of the 10-K format: "Financial Statements **and Supplementary Data**". Despite these different formats, the same methods can be used to pull notes to the financial statements from both modern (XBRL enabled) 10-Q and 10-K filings, similarly to how the same methods can be used to extract financial statement information. In FilingSummary.xml, each note to the financial statements will have its own report with a MenuCategory of "Notes" (whereas it was "Statements" for financial statements). Pre-XBRL filings are handled using methods with the keyword "legacy" in their name and are explained in further detail below; for those filings, the full text submission has to be parsed instead.

Also keep in mind that there are many other text sections in the filings: for example one included in both 10-Q's and K's is "risk factors". We will explore methods to extract text from these sections further on in the notebook. See "Grabbing Other Text".




# Notes to Financial Statements for Modern Filings (XBRL enabled)
We use the same concepts as when looking for financial reports, except we are looking for reports of MenuCategory "Notes". 

In [2]:
def confirm_modern_filing_summary(root_filing_dir):
  """
  Confirm that a filing directory contains a FilingSummary.xml.

  Copied from 10q_k_financial_parsing. Requests <root_filing_dir>index.json and looks
  through its directory listing for an entry named "FilingSummary.xml".

  Args:
    root_filing_dir: URL of the filing's root directory. It is concatenated directly
      with "index.json", so it is expected to end with a trailing slash.

  Returns:
    root_filing_dir + "FilingSummary.xml" if the file is listed, otherwise "".

  Raises:
    requests.HTTPError: if the index.json request returns an error status.
  """

  # Need to set user agent for sec.gov
  request_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }

  # Request root_filing_dir/index.json
  index_path = r"{}index.json".format(root_filing_dir)
  response = requests.get(url = index_path, headers = request_headers)
  response.raise_for_status()

  index_json = response.json()

  # directory->item is a list of dictionaries, one per each file in the directory
  # Make sure that the "name" key of one of these dictionaries is "FilingSummary.xml"
  try:
    for current_item in index_json["directory"]["item"]:
      if current_item["name"] == "FilingSummary.xml":
        return root_filing_dir + "FilingSummary.xml"

  # An index.json with an unexpected shape is treated as "no filing summary"
  # Only lookup errors are swallowed so that e.g. KeyboardInterrupt still propagates
  except (KeyError, TypeError):
    pass

  return ""

This method is similar to find_modern_financial_reports of 10q_k_financial_parsing, but will return a dict of Notes reports found in the filing summary, in the following format:

```
{
  "REPORT_SHORTNAME" : "URL",
  ...
}
```



In [143]:
def find_modern_notes_reports(filing_summary_path):
  """
  Find all reports of MenuCategory "Notes" listed in a FilingSummary.xml.

  Args:
    filing_summary_path: URL of the relevant FilingSummary.xml. Report URLs are built
      by replacing "FilingSummary.xml" in this path with each report's HtmlFileName.

  Returns:
    Dict mapping each Notes report's ShortName to its URL. Empty if the summary has
    no MyReports element or no Notes reports.

  Raises:
    requests.HTTPError: if the FilingSummary.xml request returns an error status.
  """

  # Returned dictionary
  notes_dict = {}

  # Base URL to build report paths from
  base_filing_path = filing_summary_path.replace("FilingSummary.xml", "")

  # GET FilingSummary.xml
  request_headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"}
  response = requests.get(url = filing_summary_path, headers = request_headers)
  response.raise_for_status()

  soup_summary = BeautifulSoup(response.content, "lxml")

  # Find MyReports (looked up in lowercase, matching the names used throughout this parse)
  reports = soup_summary.find("myreports")
  if reports is None:
    # Nothing to loop over; previously this raised AttributeError on None
    return notes_dict

  for current_report in reports.find_all("report"):

    # Not all reports will have MenuCategory / ShortName / HtmlFileName. A missing child
    # is returned as None by bs4, so reading .text raises AttributeError -> skip that report
    try:
      if current_report.menucategory.text.lower() != "notes":
        continue

      # Insert into dictionary if MenuCategory is Notes
      notes_dict[current_report.shortname.text] = base_filing_path + current_report.htmlfilename.text.strip()

    except AttributeError:
      continue

  return notes_dict

In [111]:
# Example: print the {ShortName: URL} dict of Notes reports for a sample filing (performs a live request to sec.gov)
print(find_modern_notes_reports("https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/FilingSummary.xml"))

{'ORGANIZATION AND BUSINESS BACKGROUND': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R7.htm', 'SUMMARY OF SIGNIFICANT ACCOUNTING POLICIES': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R8.htm', 'PREPAYMENT': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R9.htm', 'PLANT AND EQUIPMENT': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R10.htm', 'ACCRUED LIABILITIES': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R11.htm', 'AMOUNT DUE TO A DIRECTOR': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R12.htm', 'SHAREHOLDERS??? EQUITY': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R13.htm', 'CONCENTRATION OF RISK': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R14.htm', 'INCOME TAX': 'https://www.sec.gov/Archives/edgar/data/1873213/000149315222028024/R15.htm', 'SEGMENT REPORTING': 'https://www.sec.gov/Archives/edgar/data

The structure of the Notes reports is very similar to that of the Statements reports, in that information is contained in an HTML table of the following structure:


```
<table class="report" border="0" cellspacing="2" id="idmXXXXXXXXXXXXXXX">
  <tbody>
    <tr>... table headers row (basically defines table columns) ...</tr>
      <th class=...> one "th" object per column </th>
      <th class=...> there seems to be two header columns in most Notes reports </th>
    <tr class=rXX>...</tr>
    <tr class="rXX">... row containing text ...</tr>
      <td class=...>...</td>
      <td class="text"> 
        <p ...>TARGET TEXT HERE</p>
        <table ...>TARGET TABLE HERE</table>
      </td>
  </tbody>
</table>
```
From the reports I have examined, there are two columns in these Notes tables. The left columns are mostly redundant/useless (in terms of the headings/text they hold), apart from the left column of the first header row, which holds what I would call the essential "name" of the table. When parsing through the rows of the table, we will record table headers (`<th>` elements) and paragraphs of the text columns, relying on the structure above. 


In [3]:
def remove_control_characters(s):
    """
    Return `s` without any character whose Unicode general category starts with "C".

    unicodedata.normalize leaves behind a couple of control characters that are not
    technically whitespace. See
    https://www.geeksforgeeks.org/python-program-to-remove-all-control-characters/ and
    http://www.unicode.org/reports/tr44/#GC_Values_Table
    """
    kept_chars = []
    for character in s:
        # Every "C*" general category is dropped, everything else is kept untouched
        if not unicodedata.category(character).startswith("C"):
            kept_chars.append(character)

    return "".join(kept_chars)

In [4]:
def clean_column_text(text_to_clean):
  """
  Clean the given text: collapse newlines/whitespace into single spaces, apply NFKD
  unicode normalization, then strip leftover control characters.

  Returns the cleaned string.
  """

  # Newlines are turned into spaces first, then every run of whitespace is collapsed to one space
  single_spaced = " ".join(text_to_clean.replace('\n', ' ').split())

  # Normalize, then drop the control characters that normalization leaves behind
  return remove_control_characters(unicodedata.normalize('NFKD', single_spaced))

In [5]:
"""
Pulls the headers and text data from a specific Notes report.
Returns a dictionary of the structure:
{
  "header_vals" : [], # list of strings pulled from header rows
  "text_vals" : [] # list of strings pulled from columns of class "text". Each paragraph will be its own string in the list
}
"""
def read_modern_notes_report_tables(report_document_url):
  """
  Pull the headers and text data from a specific Notes report.

  Args:
    report_document_url: URL of the report document to request and parse.

  Returns:
    Dict with two lists of strings:
      "header_vals": stripped text of every <th> found in the rows of the report tables
      "text_vals": cleaned text of each non-empty <p> / <table> that is a direct child
                   of a <td class="text"> column (each paragraph is its own string)

  Raises:
    requests.HTTPError: if the request for the report returns an error status.
  """

  # Returned structure
  table_data = {
      "header_vals" : [],
      "text_vals" : []
  }

  # Request the document contents
  request_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }
  response = requests.get(url = report_document_url, headers = request_headers)
  response.raise_for_status()

  report_soup = BeautifulSoup(response.content, 'lxml')

  # In case there are multiple tables in the document, loop through all of those labeled with the "report" class. Tables of other classes (i.e. of type "authRefData") are ignored.
  for current_table in report_soup.find_all('table', class_ = "report"):

    for current_row in current_table.find_all('tr'):

      # Header row if any <th> element is found: record the stripped text of each column
      header_columns = current_row.find_all('th')
      if len(header_columns) != 0:
        table_data["header_vals"].extend(hdr_column.text.strip() for hdr_column in header_columns)
        continue

      # Not a header row, look for columns of class "text"
      # TODO: OPTIMIZE FOR SUB-TABLES WITHIN THE NOTE. Formatting is a bit janky / unpreserved right now
      for txt_column in current_row.find_all('td', class_ = "text"):

        # Only direct children of the text column are inspected
        for child in txt_column.children:

          # Grab paragraphs and tables only
          if child.name not in ('p', 'table'):
            continue

          # Ignore empty paragraphs/spacers
          child_text = clean_column_text(child.text.strip())
          if len(child_text):
            table_data["text_vals"].append(child_text)

  return table_data

In [None]:
# POC: walk every Notes report of a sample filing and print its extracted text (performs live requests to sec.gov)
# NOTE(review): confirm_modern_filing_summary returns "" when no FilingSummary.xml is listed; that case is not handled here
filing_summary = confirm_modern_filing_summary("https://www.sec.gov/Archives/edgar/data/1463972/000155837022012864/")
notes_reports = find_modern_notes_reports(filing_summary)
for key, val in notes_reports.items(): # key = report short name, val = report URL
  table_d = read_modern_notes_report_tables(val)
  print("Note to financial statements: {}".format(key))
  print(table_d["text_vals"])
  input() # Pause until Enter is pressed before moving on to the next note (blocks a non-interactive "Run All")

# Grabbing Other Text: Risk Factors, Legal Proceedings, etc (HTML enabled filings)
Not all the text sections of the filing are contained in the notes to the financial statements. Thus it is best to also try to parse the full text submission of the filing according to the structure outlined above (sample 10-Q and 10-K). The basic idea is to first look for a table of contents, which most newer filings seem to include. If found, use it to locate key sections we want to pull the text from. If there is no table of contents, resort to other methods such as looking for centered or bolded section headers. 

In [6]:
def linked_toc_exists(document_soup):
  """
  Attempt to locate a table of contents: the first <table> element that contains
  one or more <a href=...> elements.

  Returns the bs4 Tag of that table if it exists, or None.
  """
  candidate_tables = document_soup.find_all('table')

  # First table holding at least one hyperlink wins; None when no table qualifies
  return next(
    (table for table in candidate_tables if len(table.find_all('a', attrs = { 'href' : True }))),
    None
  )

In [7]:
def text_between_tags(start, end):
  """
  Helper method to find_section_with_toc: extract the text found in between two bs4 elements.

  Walks the parse tree from `start` via next_element, collecting every non-empty
  (after stripping) NavigableString, and stops when `end` is reached or the tree runs out.

  Args:
    start: element to begin walking from (the walk starts with `start` itself).
    end: element at which to stop; nothing from `end` onwards is collected.

  Returns:
    The collected strings joined by single spaces and passed through clean_column_text.
  """

  cur = start
  found_pieces = []

  # Identity checks on purpose:
  #  - an empty NavigableString (e.g. an empty comment) is falsy, so a truthiness test would end the walk early
  #  - bs4 Tags compare equal when their markup matches, so `!=` could stop on a look-alike tag instead of `end`
  while cur is not None and cur is not end:
    if isinstance(cur, NavigableString):

      text = cur.strip()
      if len(text):
        found_pieces.append(text)

    cur = cur.next_element

  return clean_column_text(" ".join(found_pieces))

In [8]:
def text_starting_at_tag(start):
  """
  Helper method to find_section_with_toc: extract the text found starting at a given
  element through the end of the soup.

  Walks the parse tree from `start` via next_element until it runs out, collecting
  every non-empty (after stripping) NavigableString.

  Args:
    start: element to begin walking from (the walk starts with `start` itself).

  Returns:
    The collected strings joined by single spaces and passed through clean_column_text.
  """

  cur = start
  found_pieces = []

  # Test against None rather than truthiness: an empty NavigableString (e.g. an empty comment)
  # is falsy and would otherwise cut the walk short
  while cur is not None:
    if isinstance(cur, NavigableString):

      text = cur.strip()
      if len(text):
        found_pieces.append(text)

    cur = cur.next_element

  return clean_column_text(" ".join(found_pieces))

In [23]:
def is_text_page_number(question_text):
  """
  Support method for find_section_with_toc: attempt to determine if the given text is
  simply a page number (a duplicate link, in my observations).

  Returns True when the stripped text is numeric, or consists only of the letters
  M, D, C, L, X, V, I (any case) and parentheses. Note that an empty string also yields True.
  Anything that is not exactly a str prints a notice and returns True, which results
  in the href being skipped by the caller.
  """

  # Check argument (exact type check, same as comparing type() against str)
  if type(question_text) is not str:
    print("Non-string passed to is_text_page_number. Returning True (will result in href being skipped)")
    return True

  # Strip just to be sure
  candidate = question_text.strip()

  # Only digits
  if candidate.isnumeric():
    return True

  # Only roman numeral letters / parentheses
  roman_symbols = frozenset("MDCLXVI()")
  return all(symbol in roman_symbols for symbol in candidate.upper())

In [16]:
"""
Use the hyperlinked TOC to find the given text section. Provide a bs4 Tag object for the located TOC. Returns a dictionary the same as its calling function, find_section_in_fulltext:
{
  "MATCHING_SECTION_NAME_FOUND" : "SECTION_TEXT",
  ...
}
"""
def find_section_with_toc(document_soup, toc_soup, target_sections = ()):
  """
  Use the hyperlinked TOC to find the given text section(s).

  Args:
    document_soup: bs4 element of the document in which to look for the link destinations.
    toc_soup: bs4 Tag of the located table of contents.
    target_sections: iterable of section names; each is matched case-insensitively
      as a substring of the TOC link text.

  Returns:
    Dict of { "MATCHING_SECTION_NAME_FOUND" : "SECTION_TEXT", ... } keyed by the matching
    entry of target_sections. Sections that are not found or have no text are omitted.
  """

  # Returned dictionary
  text_dict = {}

  # First, loop through the <a> tags of the TOC and build a dictionary of href anchor values and text (sections) values
  link_dict = {}
  for link_tag in toc_soup.find_all('a', attrs = { 'href' : True }):
    link_text = link_tag.text.strip()

    # From some TOC's I have examined, there may be a second <a href...> for each section, labeled instead by the page number. This page number may be a digit or a roman numeral
    # If I come across a filing with a different TOC structure, I will find a more nuanced way to handle it. For now simply check if the text is only digits or roman numerals
    if is_text_page_number(link_text):
      continue

    link_dict[link_tag.get('href').replace('#', '')] = clean_column_text(link_text)

  # Destination anchors: <a> tags whose "id" or "name" is actually linked to from the TOC.
  # Anchors that are never linked to are filtered out: such "phantom" anchors can prematurely signal the end of a section,
  # as we rely on the next anchor to be the beginning of the next section
  # (i.e: https://www.sec.gov/Archives/edgar/data/1331451/000133145118000076/0001331451-18-000076.txt)
  def _is_linked_destination(tag):
    return tag.name == 'a' and (tag.get('id') in link_dict or tag.get('name') in link_dict)

  # A single find_all keeps the anchors in document order and lists each one once. Concatenating separate
  # "id" and "name" searches put every id-anchor ahead of every name-anchor (and duplicated anchors having both),
  # which made "the next destination" unreliable
  link_dests = document_soup.find_all(_is_linked_destination)
  num_destinations = len(link_dests)

  # Loop through the dictionary of hrefs we built and look for our target sections, storing any found in a new dict
  target_section_links = {}
  for href_val, section_name in link_dict.items():
    lowered_section_name = section_name.lower()

    for indiv_target in target_sections:
      if indiv_target.lower() in lowered_section_name:

        # Add the target section and its href value to target_section_links
        target_section_links[href_val] = indiv_target

  # Now loop through the target sections that we just found links to. We will try to locate the destination of each
  for target_href, target_name in target_section_links.items():

    # The href values are used at their destination in <a> tags with an id/name attribute of the same href value (minus the leading #, why we got rid of it)
    for dest_index, link_dest in enumerate(link_dests):

      # Can be either id or name according to HTML spec (see https://stackoverflow.com/questions/484719/should-i-make-html-anchors-with-name-or-id)
      if target_href not in (link_dest.get('id'), link_dest.get('name')):
        continue

      # Grab the text in between the current destination tag and the next occurring destination in link_dests
      # If we are on the last destination, grab all the text left
      if dest_index + 1 < num_destinations:
        section_text = text_between_tags(link_dest, link_dests[dest_index + 1])
      else:
        section_text = text_starting_at_tag(link_dest)

      # Add to master dict
      if len(section_text):
        text_dict[target_name] = section_text

  return text_dict

In [17]:
def find_section_no_toc(document_soup, target_sections = ()):
  """
  Placeholder for locating a section and extracting its text from an HTML document
  that has no hyperlinked table of contents.

  Forthcoming... I am thinking to look for centered text as the first candidate for section
  headers, if not found then certain bolded text, etc. Have to look at the structure of more
  filings before writing.

  Currently ignores both arguments and returns an empty dict.
  """
  no_results = dict()
  return no_results

In [18]:
""" 
Locate and extract custom text section(s). Takes the path to a filing's full text submission and a list of target sections. Returned structure:
{
  "MATCHING_SECTION_NAME_FOUND" : "SECTION_TEXT",
  ...
}
"""
def find_section_in_fulltext(fulltext_sub, target_sections = ()):
  """
  Locate and extract custom text section(s) from a filing's full text submission.

  Args:
    fulltext_sub: URL of the filing's full text submission.
    target_sections: list of section names to look for.

  Returns:
    Dict of { "MATCHING_SECTION_NAME_FOUND" : "SECTION_TEXT", ... }. When the same section is
    found in more than one document, later hits are stored under "xxx_1", "xxx_2", ...
    Empty if no target sections were given or nothing was found.

  Raises:
    requests.HTTPError: if the request for the submission returns an error status.
  """

  # Returned dict
  master_text_dict = {}

  # Check that sections were specified
  if len(target_sections) == 0:
    print("No target sections were entered. Provide in a list")
    return master_text_dict

  # Get the file contents
  request_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }
  response = requests.get(url = fulltext_sub, headers = request_headers)
  response.raise_for_status()

  filing_soup = BeautifulSoup(response.content, 'lxml')

  # First task is to break the full text submission into documents by <DOCUMENT> tag
  for document in filing_soup.find_all('document'):

    # Skip documents without a <FILENAME> or without a direct text value in it (previously an AttributeError)
    filename_tag = document.filename
    if filename_tag is None:
      continue

    doc_name = filename_tag.find(text = True, recursive = False)
    if doc_name is None:
      continue

    # Only parse HTM/HTML files
    if ".htm" not in doc_name.strip().lower():
      continue

    # Jump into HTML document's contents inside <TEXT>; nothing to parse if it is missing
    doc_html = document.find('text')
    if doc_html is None:
      continue

    # Parse using TOC if it exists
    # TODO: BETTER PARSING / STORING OF TABLES FOUND IN TEXT SECTIONS
    toc_tag = linked_toc_exists(doc_html)
    if toc_tag is not None:
      doc_results = find_section_with_toc(doc_html, toc_tag, target_sections)
    else:
      doc_results = find_section_no_toc(doc_html, target_sections)

    # Loop through results, add to the master dict
    for result_section_name, result_section_text in doc_results.items():

      # If we already have an entry for that target_section, create another ("xxx", "xxx_1", "xxx_2", ...)
      master_key_name = result_section_name
      i = 1
      while master_key_name in master_text_dict:
        master_key_name = "{}_{}".format(result_section_name, i)
        i += 1

      # Add to dict after finding unused key
      master_text_dict[master_key_name] = result_section_text

  return master_text_dict

In [None]:
# Testing different variations of filings with TOC's
#print(find_section_in_fulltext("https://www.sec.gov/Archives/edgar/data/900075/000090007518000048/0000900075-18-000048.txt", ['risk factors', 'unresolved staff comments'])) # <a name=XXX>
#print(find_section_in_fulltext("https://www.sec.gov/Archives/edgar/data/1331451/000133145118000076/0001331451-18-000076.txt", ['management\'s discussion'])) # <a id=XXX>
#print(find_section_in_fulltext("https://www.sec.gov/Archives/edgar/data/1015780/000101578018000033/0001015780-18-000033.txt", ['risk factors'])) # Has phantom anchors
#print(find_section_in_fulltext("https://www.sec.gov/Archives/edgar/data/1722482/000172248220000090/0001722482-20-000090.txt", ['risk factors'])) # Has linked page numbers including roman numerals in TOC

# Pulling Text From Non-HTML (Legacy) Filings
Forthcoming... More research/inspection of structure to do 