# SEC SECTION EXTRACTOR and VISUALIZATION FUNCTIONS

Helper functions for SEC filing section extraction, text summarization, and visualization.

## Code for extracting the "Item" sections from the forms

In [None]:
# %%capture
!pip install transformers==4.11.3 --ignore-installed ruamel-yaml
!pip install torch==1.9.1 --no-cache-dir
!pip install altair==4.1.0
!pip install vega==3.5.0

In [None]:
from transformers import pipeline
import pandas as pd
import re
import nltk

nltk.download("punkt")
from nltk.tokenize import sent_tokenize

In [None]:
def extract_items(part_header, part_text, form_type):
    """Extract each item header and its text from the plain text of a form "part".

    :type part_header: str
    :param part_header: The header of a "part" of a form (e.g. Part III)

    :type part_text: str
    :param part_text: The plain text of a "part" of a form (e.g. Part III).
        In the case of 10-K and 8-K forms, the "part" is the whole form.

    :type form_type: str
    :param form_type: The form type. Supported values are 10-K, 10-Q and 8-K.

    :rtype: Iterator[(str, str, str)]
    :returns: An iterator over tuples of the form (part_header, item_header, text)
        where "item_header" is the item header and "text" is the corresponding text
        for each item in the "part". part_header is passed through unchanged so that
        items with the same number in different parts can be told apart.

    :raises ValueError: If form_type is not one of the supported values.
    """
    # Raw strings keep the regex escapes (\d, \.) out of Python's own string
    # escape processing; the compiled patterns are unchanged.
    if form_type in ("10-K", "10-Q"):
        # Header: a blank-line-delimited "Item <number>[letters] ..." line.
        # Text: everything up to the next such header or the end of the input.
        pattern = r"(?P<header>(\n\n(ITEM|Item) \d+[A-Z]*.*?)\n\n)(?P<text>.*?)(?=(\n\n(ITEM|Item) \d+[A-Z]*.*?)\n\n|$)"
    elif form_type == "8-K":
        # Header: "Item <number>.<number>" plus any trailing periods; the title
        # that follows on the same line is captured as part of the text.
        pattern = r"(?P<header>\n\n(ITEM|Item) \d+\.\d+\.*)(?P<text>.*?)(?=((\n\n(ITEM|Item) \d+\.\d+.*?)\n\n|$))"
    else:
        raise ValueError(
            f"Unsupported form type {form_type!r}; expected '10-K', '10-Q' or '8-K'."
        )
    return (
        (part_header, match.group("header").strip(), match.group("text").strip())
        for match in re.finditer(pattern, part_text, re.DOTALL)
    )

In [None]:
def extract_parts(form_text, form_type):
    """Split form plain text into its "PART <roman numeral>" portions.

    :type form_text: str
    :param form_text: The form plain text.

    :type form_type: str
    :param form_type: The form type (e.g. 10-K, 10-Q, 8-K). Accepted for
        signature symmetry with the other extractors; it is not used here.

    :rtype: Iterator[(str, str)]
    :returns: An iterator over (part_header, part_text) pairs, one per part found.
        The header is whitespace-stripped; the text is returned as matched,
        starting with the blank line that follows the header line.
    """
    part_pattern = "((^PART|^Part|\n\nPART|\n\nPart) [IVXLCDM]+).*?(\n\n.*?)(?=\n\n(PART|Part) [IVXLCDM]+.*?\n\n|$)"
    found_parts = re.finditer(part_pattern, form_text, re.DOTALL)
    # Group 1 is the "PART <numeral>" header, group 3 the body that follows it.
    return ((found.group(1).strip(), found.group(3)) for found in found_parts)

In [None]:
def get_form_items(form_text, form_type):
    """Yield every item found in a form's plain text.

    :type form_text: str
    :param form_text: The form plain text.

    :type form_type: str
    :param form_type: The form type (e.g. 10-K, 10-Q, 8-K)

    :rtype: Iterator[(str, str, str)]
    :returns: An iterator over (part_header, item_header, text) tuples as produced
        by extract_items. For 10-K and 8-K forms part_header is the empty string;
        for 10-Q forms it is the header of the part the item was found in.
        Any other form type yields nothing.
    """
    if form_type in ("8-K", "10-K"):
        # These forms are scanned as a single part with no part header.
        yield from extract_items("", form_text, form_type)
        return
    if form_type == "10-Q":
        # Items are extracted part by part so the part header can be attached.
        for header, body in extract_parts(form_text, form_type):
            yield from extract_items(header, body, form_type)

## Code for building a dataframe whose columns are the different "Item" sections

In [None]:
def items_to_df_row(item_iter, columns, form_type):
    """Build one dataframe row from extracted items, with one cell per item column.

    :type item_iter: Iterator[(str, str, str)]
    :param item_iter: An iterator over tuples of the form (part_header, item_header, item_text).

    :type columns: List[str]
    :param columns: A list of column names for the dataframe we wish to generate a row for.

    :type form_type: str
    :param form_type: The form type. Currently supported types include 10-K, 10-Q, 8-K.

    :rtype: List[str]
    :returns: A row for the dataframe: a list aligned with ``columns`` holding the
        item text for each matched column and "" for columns with no item. Items
        whose header cannot be parsed or does not match any column are ignored.
        When several items map to the same column, the last one wins.
    """
    # Lower-cased column name -> position of that column in the row.
    column_index = {col_name.lower(): idx for idx, col_name in enumerate(columns)}

    returned_row = [""] * len(columns)
    for part_header, item_header, text in item_iter:
        processed_header = (part_header.lower() + " " + item_header.lower()).strip()
        if form_type == "10-Q":
            found = re.search(r"part [ivxlcdm]+ item \d+[a-z]*", processed_header)
            if found is None:
                # Header is not of the form "part <numeral> item <number>"; skip it
                # the same way headers with no matching column are skipped.
                continue
            processed_header = found.group(0)
        elif form_type == "10-K":
            found = re.search(r"item \d+[a-z]*", processed_header)
            if found is None:
                continue
            processed_header = found.group(0)
        elif form_type == "8-K":
            # Some companies will include a period at the end of the header while
            # others don't. endswith() is also safe for an empty header.
            if processed_header.endswith("."):
                processed_header = processed_header[:-1]

        row_index = column_index.get(processed_header)
        if row_index is not None:
            returned_row[row_index] = text

    return returned_row

## Required hard-coded values for the different Item section header names

In [None]:
# Column labels for 10-K filings, one per item, in filing order.
columns_10K = [
    f"Item {item_number}"
    for item_number in (
        "1", "1A", "1B", "2", "3", "4", "5", "6", "7", "7A",
        "8", "9", "9A", "9B", "10", "11", "12", "13", "14", "15",
    )
]

In [None]:
# Column labels for 10-Q filings: the items of Part I followed by those of Part II.
columns_10Q = [f"Part I Item {item_number}" for item_number in ("1", "2", "3", "4")] + [
    f"Part II Item {item_number}" for item_number in ("1", "1A", "2", "3", "4", "5", "6")
]

In [None]:
# Column labels for 8-K filings, "Item <section>.<two-digit sub-item>".
# Each pair below is (section number, number of sub-items in that section).
columns_8K = [
    f"Item {section}.{sub_item:02d}"
    for section, sub_item_count in (
        (1, 4),
        (2, 6),
        (3, 3),
        (4, 2),
        (5, 8),
        (6, 5),
        (7, 1),
        (8, 1),
        (9, 1),
    )
    for sub_item in range(1, sub_item_count + 1)
]

In [None]:
# Descriptive section title for each 10-K item.
# Keys use the same "Item <number>" labels as columns_10K.
header_mappings_10K = {
    "Item 1": "Business",
    "Item 1A": "Risk Factors",
    "Item 1B": "Unresolved Staff Comments",
    "Item 2": "Properties",
    "Item 3": "Legal Proceedings",
    "Item 4": "Mine Safety Disclosures",
    "Item 5": "Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "Item 6": "Selected Financial Data",
    "Item 7": "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
    "Item 7A": "Quantitative and Qualitative Disclosures about Market Risk",
    "Item 8": "Financial Statements and Supplementary Data",
    "Item 9": "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
    "Item 9A": "Controls and Procedures",
    "Item 9B": "Other Information",
    "Item 10": "Directors, Executive Officers and Corporate Governance",
    "Item 11": "Executive Compensation",
    "Item 12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "Item 13": "Certain Relationships and Related Transactions, and Director Independence",
    "Item 14": "Principal Accountant Fees and Services",
    "Item 15": "Exhibits, Financial Statement Schedules",
}

In [None]:
# Descriptive section title for each 10-Q item.
# Keys use the same "Part <numeral> Item <number>" labels as columns_10Q.
header_mappings_10Q = {
    "Part I Item 1": "Financial Statements",
    "Part I Item 2": "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
    "Part I Item 3": "Quantitative and Qualitative Disclosures About Market Risk",
    "Part I Item 4": "Controls and Procedures",
    "Part II Item 1": "Legal Proceedings",
    "Part II Item 1A": "Risk Factors",
    "Part II Item 2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "Part II Item 3": "Defaults Upon Senior Securities",
    "Part II Item 4": "Mine Safety Disclosures",
    "Part II Item 5": "Other Information",
    "Part II Item 6": "Exhibits",
}

In [None]:
# Descriptive section title for each 8-K item.
# Keys use the same "Item <section>.<sub-item>" labels as columns_8K.
header_mappings_8K = {
    "Item 1.01": "Entry into a Material Definitive Agreement",
    "Item 1.02": "Termination of a Material Definitive Agreement",
    "Item 1.03": "Bankruptcy or Receivership",
    "Item 1.04": "Mine Safety - Reporting of Shutdowns and Patterns of Violations",
    "Item 2.01": "Completion of Acquisition or Disposition of Assets",
    "Item 2.02": "Results of Operations and Financial Condition",
    "Item 2.03": "Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement of a Registrant",
    "Item 2.04": "Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement",
    "Item 2.05": "Costs Associated with Exit or Disposal Activities",
    "Item 2.06": "Material Impairments",
    "Item 3.01": "Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; Transfer of Listing",
    "Item 3.02": "Unregistered Sales of Equity Securities",
    "Item 3.03": "Material Modification to Rights of Security Holders",
    "Item 4.01": "Changes in Registrant's Certifying Accountant",
    "Item 4.02": "Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or Completed Interim Review",
    "Item 5.01": "Changes in Control of Registrant",
    "Item 5.02": "Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers",
    "Item 5.03": "Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year",
    "Item 5.04": "Temporary Suspension of Trading Under Registrant's Employee Benefit Plans",
    "Item 5.05": "Amendment to Registrant's Code of Ethics, or Waiver of a Provision of the Code of Ethics",
    "Item 5.06": "Change in Shell Company Status",
    "Item 5.07": "Submission of Matters to a Vote of Security Holders",
    "Item 5.08": "Shareholder Director Nominations",
    "Item 6.01": "ABS Informational and Computational Material",
    "Item 6.02": "Change of Servicer or Trustee",
    "Item 6.03": "Change in Credit Enhancement or Other External Support",
    "Item 6.04": "Failure to Make a Required Distribution",
    "Item 6.05": "Securities Act Updating Disclosure",
    "Item 7.01": "Regulation FD Disclosure",
    "Item 8.01": "Other Events",
    "Item 9.01": "Financial Statements and Exhibits",
}

In [None]:
## Summarizer code
# Build a Hugging Face summarization pipeline. No model name is passed, so the
# library's default summarization checkpoint is used.
summarizer = pipeline("summarization")

# Token budget for each chunk of text handed to the summarizer. Subword
# tokenization splits words into several tokens, so the budget is kept well
# below 1000 to be safe. https://paperswithcode.com/method/wordpiece
# NOTE(review): the original remarks mention BERT/WordPiece and a 1024-token
# limit; confirm both against the checkpoint the pipeline actually loads.
max_seq_len = 500
# Summary length as a fraction of the input's token count: used as max_length,
# with half of it used as min_length, when calling the summarizer.
summary_pct = 0.10

# Count the number of tokens produced by the summarizer's own tokenizer
def numtokens(txt):
    """Return the tokenizer's ``length`` field for txt.

    Callers in this file index the result with ``[0]`` to get the token count
    of a single string.
    """
    encoded = summarizer.tokenizer(txt, return_length=True)
    return encoded.length


# Summarizer for long docs. Broken into two functions
# partSummary takes the first set of paragraphs that are within max_seq_len and gives back summary and remaining paragraphs
# fullSummary keeps calling partSummary until there are no paragraphs left. It joins all the part summaries to give a final summary
def _first_index_over(token_counts, limit):
    """Return the first index at which the running total of token_counts exceeds limit.

    Returns len(token_counts) when the running total never exceeds limit, so
    that slicing with the result keeps every element instead of failing.
    """
    # Imported here so the helper does not depend on a pylab-style namespace
    # providing bare `where` / `cumsum`.
    import numpy as np

    over_limit = np.where(np.cumsum(token_counts) > limit)[0]
    if over_limit.size == 0:
        return len(token_counts)
    return int(over_limit.min())


def partSummary(txt):
    """Summarize the leading portion of txt that fits within max_seq_len tokens.

    :type txt: str
    :param txt: Text whose paragraphs are separated by blank lines ("\\n\\n").

    :rtype: (object, str)
    :returns: A pair (summary, remaining_text). ``summary`` is whatever the
        summarizer pipeline returns for the chunk (fullSummary reads
        ``summary[0]["summary_text"]`` from it); ``remaining_text`` is the part
        of txt that still has to be summarized, or "" when nothing is left.
    """
    numt = numtokens(txt)[0]  # number of tokens of the passed text
    if numt < max_seq_len:
        # The whole text fits in one chunk: summarize it and report nothing left.
        summary = summarizer(
            txt, min_length=int(numt * summary_pct / 2), max_length=int(numt * summary_pct)
        )
        return summary, ""

    paragraph_list = txt.split("\n\n")  # make separate paragraphs
    if numtokens(paragraph_list[0])[0] > max_seq_len:
        # Special handling: the first paragraph alone exceeds the budget, so it
        # is broken up at sentence boundaries.
        sent_list = sent_tokenize(paragraph_list[0])
        lens = [numtokens(sent)[0] for sent in sent_list]  # tokens per sentence
        if lens[0] > max_seq_len:
            # Even the first "sentence" is too long: drop the paragraph and
            # summarize an empty string in its place.
            paragraph_list.pop(0)
            summary = summarizer("", min_length=0, max_length=2)
            return summary, "\n\n".join(paragraph_list)
        # Split the oversized paragraph in two: the sentences that fit within
        # the budget, followed by the rest of the original paragraph.
        idx = _first_index_over(lens, max_seq_len)
        paragraph_list[0:1] = [" ".join(sent_list[:idx]), " ".join(sent_list[idx:])]

    lenp = [numtokens(para)[0] for para in paragraph_list]  # tokens per paragraph
    idx = _first_index_over(lenp, max_seq_len)  # first paragraph that crosses the budget
    to_be_summ = "\n\n".join(paragraph_list[:idx])
    numt = numtokens(to_be_summ)[0]
    summary = summarizer(
        to_be_summ, min_length=int(numt * summary_pct / 2), max_length=int(numt * summary_pct)
    )
    # Hand back the paragraphs that were not part of this chunk.
    return summary, "\n\n".join(paragraph_list[idx:])


def fullSummary(txt):
    """Summarize txt chunk by chunk and return the concatenated chunk summaries.

    partSummary is called repeatedly on the text it reports as still remaining;
    the ``summary_text`` of each chunk is appended with no separator.
    """
    pieces = []
    while len(txt):
        chunk_summary, txt = partSummary(txt)
        pieces.append(chunk_summary[0]["summary_text"])
    return "".join(pieces)

In [None]:
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from vega import Vega
from sklearn.preprocessing import minmax_scale
import numpy as np

## Correlogram

In [None]:
def corrfunc(x, y, ax=None, **kws):
    """Annotate an axes with the Pearson correlation coefficient of x and y.

    :param x: First sample.
    :param y: Second sample, same length as x.
    :param ax: Axes to annotate; defaults to the current matplotlib axes.
    :param kws: Extra keyword arguments (accepted so the function can be used
        as a plotting callback); they are ignored.
    """
    # pearsonr returns (coefficient, p-value); only the coefficient is shown,
    # so it is labelled "r" rather than "p" to avoid reading it as a p-value.
    r, _ = pearsonr(x, y)
    ax = ax or plt.gca()
    ax.annotate(f"r = {r:.2f}", xy=(0.1, 0.9), xycoords=ax.transAxes)

In [None]:
def createCorrelogram(df):
    """Show a lower-triangle pair plot of df with a correlation note on each panel.

    :param df: Data passed to seaborn's pairplot; corrfunc is applied to every
        lower-triangle panel.
    """
    grid = sns.pairplot(df, corner=True)
    grid.map_lower(corrfunc)
    # Use matplotlib's show explicitly; a bare show() is not defined in this file.
    plt.show()

In [None]:
def createHeatmap(df):
    """Show an annotated heatmap of the pairwise correlations of df's columns."""
    correlations = df.corr()
    sns.heatmap(correlations, annot=True)
    plt.show()

## Vega RadarChart

In [None]:
def Vega(spec):
    """Display spec as a raw "application/vnd.vega.v5+json" output bundle.

    Note: this definition replaces the ``Vega`` name imported from the ``vega``
    package earlier in the file. ``display`` is not imported here; presumably
    it is the notebook-provided display function.
    """
    display({"application/vnd.vega.v5+json": spec}, raw=True)

In [None]:
# min-max scaling of the score columns
def scaleScores(scores):
    """Return the min-max scaled score columns of ``scores``.

    The first and last columns are left out; the remaining columns are ordered
    alphabetically by name before scaling, so each output column is scaled
    against the other values in that same column.
    """
    score_columns = sorted(scores.columns[1:-1])
    selected = scores.copy()[score_columns]
    return minmax_scale(selected)

In [None]:
# truncates a number to a fixed number of decimal places
def truncate(n, decimal):
    """Cut n off after ``decimal`` decimal places, truncating toward zero."""
    factor = pow(10, decimal)
    shifted = int(n * factor)
    return shifted / factor

In [None]:
def createRadarChart(scores, tenk1_index, tenk2_index):
    """Display a Vega radar chart comparing the scaled scores of two rows.

    :param scores: Score table passed to scaleScores.
    :param tenk1_index: Row position (in the scaled scores) plotted as category 1.
    :param tenk2_index: Row position (in the scaled scores) plotted as category 2.

    The chart is displayed through the Vega helper defined in this file;
    nothing is returned.
    """
    scores_scaled = scaleScores(scores)

    # Axis labels, listed in the column position used to read each scaled row.
    # NOTE(review): assumes the scaled columns come out in exactly this order -- confirm
    # against the columns of the score table.
    score_keys = (
        "certainty",
        "fraud",
        "litigious",
        "negative",
        "polarity",
        "positive",
        "readability",
        "risk",
        "safe",
        "sentiment",
        "uncertainty",
    )

    # One record per (row, key): all keys for category 1, then all keys for category 2.
    table_values = []
    for category, row_index in ((1, tenk1_index), (2, tenk2_index)):
        for position, key in enumerate(score_keys):
            table_values.append(
                {
                    "key": key,
                    "value": truncate(scores_scaled[row_index][position], 2),
                    "category": category,
                }
            )

    data = [
        {"name": "table", "values": table_values},
        {
            "name": "keys",
            "source": "table",
            "transform": [{"type": "aggregate", "groupby": ["key"]}],
        },
    ]

    scales = [
        {
            "name": "angular",
            "type": "point",
            "range": {"signal": "[-PI, PI]"},
            "padding": 0.5,
            "domain": {"data": "table", "field": "key"},
        },
        {
            "name": "radial",
            "type": "linear",
            "range": {"signal": "[0, radius]"},
            "zero": True,
            "nice": False,
            "domain": {"data": "table", "field": "value"},
            "domainMin": 0,
        },
        {
            "name": "color",
            "type": "ordinal",
            "domain": {"data": "table", "field": "category"},
            "range": {"scheme": "category10"},
        },
    ]

    # Filled outline for each category plus a value label on every vertex.
    category_group = {
        "type": "group",
        "name": "categories",
        "zindex": 1,
        "from": {"facet": {"data": "table", "name": "facet", "groupby": ["category"]}},
        "marks": [
            {
                "type": "line",
                "name": "category-line",
                "from": {"data": "facet"},
                "encode": {
                    "enter": {
                        "interpolate": {"value": "linear-closed"},
                        "x": {
                            "signal": "scale('radial', datum.value) * cos(scale('angular', datum.key))"
                        },
                        "y": {
                            "signal": "scale('radial', datum.value) * sin(scale('angular', datum.key))"
                        },
                        "stroke": {"scale": "color", "field": "category"},
                        "strokeWidth": {"value": 1},
                        "fill": {"scale": "color", "field": "category"},
                        "fillOpacity": {"value": 0.1},
                    }
                },
            },
            {
                "type": "text",
                "name": "value-text",
                "from": {"data": "category-line"},
                "encode": {
                    "enter": {
                        "x": {"signal": "datum.x"},
                        "y": {"signal": "datum.y"},
                        "text": {"signal": "datum.datum.value"},
                        "align": {"value": "center"},
                        "baseline": {"value": "middle"},
                        "fill": {"value": "black"},
                    }
                },
            },
        ],
    }

    # Spokes from the centre to the rim, one per key.
    radial_grid = {
        "type": "rule",
        "name": "radial-grid",
        "from": {"data": "keys"},
        "zindex": 0,
        "encode": {
            "enter": {
                "x": {"value": 0},
                "y": {"value": 0},
                "x2": {"signal": "radius * cos(scale('angular', datum.key))"},
                "y2": {"signal": "radius * sin(scale('angular', datum.key))"},
                "stroke": {"value": "lightgray"},
                "strokeWidth": {"value": 1},
            }
        },
    }

    # Key names placed just outside the rim.
    key_labels = {
        "type": "text",
        "name": "key-label",
        "from": {"data": "keys"},
        "zindex": 1,
        "encode": {
            "enter": {
                "x": {"signal": "(radius + 5) * cos(scale('angular', datum.key))"},
                "y": {"signal": "(radius + 5) * sin(scale('angular', datum.key))"},
                "text": {"field": "key"},
                "align": [
                    {
                        "test": "abs(scale('angular', datum.key)) > PI / 2",
                        "value": "right",
                    },
                    {"value": "left"},
                ],
                "baseline": [
                    {"test": "scale('angular', datum.key) > 0", "value": "top"},
                    {"test": "scale('angular', datum.key) == 0", "value": "middle"},
                    {"value": "bottom"},
                ],
                "fill": {"value": "black"},
                "fontWeight": {"value": "bold"},
            }
        },
    }

    # Closed outline joining the outer ends of the spokes.
    outer_line = {
        "type": "line",
        "name": "outer-line",
        "from": {"data": "radial-grid"},
        "encode": {
            "enter": {
                "interpolate": {"value": "linear-closed"},
                "x": {"field": "x2"},
                "y": {"field": "y2"},
                "stroke": {"value": "lightgray"},
                "strokeWidth": {"value": 1},
            }
        },
    }

    spec = {
        "$schema": "https://vega.github.io/schema/vega/v5.json",
        "description": "A radar chart example, showing multiple dimensions in a radial layout.",
        "width": 400,
        "height": 400,
        "padding": 80,
        "autosize": {"type": "none", "contains": "padding"},
        "signals": [{"name": "radius", "update": "width / 2"}],
        "data": data,
        "scales": scales,
        "encode": {"enter": {"x": {"signal": "radius"}, "y": {"signal": "radius"}}},
        "marks": [category_group, radial_grid, key_labels, outer_line],
    }
    Vega(spec)

In [None]:
# Install R for creating an interactive screening table
!apt-get update
!apt-get -y install r-base r-base-dev

In [None]:
# Print a completion message once execution reaches the end of this file.
print("FINISHED FUNCTION INSTALL.")