### SEC FILINGS ANALYSIS
#### Ground truth financial health and event triggers
Filing Types:
* 10-K: Annual report, comprehensive overview (financials, business operations, risks). Filed within 60–90 days after fiscal year-end.
* 10-Q: Quarterly report, less detailed but includes financials and updates. Filed within 40–45 days after quarter-end.
* 8-K: Current report for material events (e.g., acquisitions, executive changes, earnings releases). Filed within 4 business days of the event.

In [1]:
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime
from transformers import pipeline

import nltk
from nltk.tokenize import sent_tokenize

### 8-K Analysis

In [11]:
def parse_sec_header(file_content):
    """Extract metadata from SEC-HEADER section."""
    header_data = {
        'accession_number': None,
        'company_name': None,
        'filing_date': None,
        'report_date': None,
        'items': [],
        'cik': None,
        'sic': None
    }
    
    # Extract header section
    header_match = re.search(r'<SEC-HEADER>(.*?)</SEC-HEADER>', file_content, re.DOTALL)
    if not header_match:
        return header_data
    
    header_text = header_match.group(1)
    
    # Extract key fields
    header_data['accession_number'] = re.search(r'ACCESSION NUMBER:\s*(\S+)', header_text).group(1) if re.search(r'ACCESSION NUMBER:\s*(\S+)', header_text) else None
    header_data['company_name'] = re.search(r'COMPANY CONFORMED NAME:\s*(.+)', header_text).group(1) if re.search(r'COMPANY CONFORMED NAME:\s*(.+)', header_text) else None
    header_data['filing_date'] = re.search(r'FILED AS OF DATE:\s*(\d{8})', header_text).group(1) if re.search(r'FILED AS OF DATE:\s*(\d{8})', header_text) else None
    header_data['report_date'] = re.search(r'CONFORMED PERIOD OF REPORT:\s*(\d{8})', header_text).group(1) if re.search(r'CONFORMED PERIOD OF REPORT:\s*(\d{8})', header_text) else None
    header_data['cik'] = re.search(r'CENTRAL INDEX KEY:\s*(\S+)', header_text).group(1) if re.search(r'CENTRAL INDEX KEY:\s*(\S+)', header_text) else None
    header_data['sic'] = re.search(r'STANDARD INDUSTRIAL CLASSIFICATION:\s*.+\[(\d+)\]', header_text).group(1) if re.search(r'STANDARD INDUSTRIAL CLASSIFICATION:\s*.+\[(\d+)\]', header_text) else None
    
    # Extract item information
    items = re.findall(r'ITEM INFORMATION:\s*(.+)', header_text)
    header_data['items'] = items if items else []
    
    return header_data

def parse_body_content(file_content):
    """Extract narrative content from the body, handling HTML/XBRL."""
    # Extract document section
    doc_match = re.search(r'<DOCUMENT>.*?<TEXT>(.*?)</TEXT>', file_content, re.DOTALL)
    if not doc_match:
        return ""
    
    body_text = doc_match.group(1)
    
    # Parse HTML content
    soup = BeautifulSoup(body_text, 'html.parser')
    
    # Remove scripts, styles, and other non-text elements like tables (often contain financial data or boilerplate)
    for script in soup(["script", "style", "table"]):
        script.decompose()
    
    # Extract text
    text = soup.get_text(separator=" ", strip=True)
    
    # Remove boilerplate phrases (common SEC filing patterns)
    boilerplate_phrases = [
        r'UNITED STATES SECURITIES AND EXCHANGE COMMISSION',
        r'Washington, D.C. 20549',
        r'FORM 8-K',
        r'Pursuant to Section 13 or 15\(d\)',
        r'Exact name of Registrant as Specified in Its Charter',
        r'State or Other Jurisdiction of Incorporation',
        r'Commission File Number',
        r'IRS Employer Identification No',
        r'Check the appropriate box below'
    ]
    for phrase in boilerplate_phrases:
        text = re.sub(phrase, '', text, flags=re.IGNORECASE)

    # Clean up excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Split into sentences for sentiment analysis
    sentences = sent_tokenize(text)
    sentences = [s for s in sentences if len(s) > 10]

    return sentences

def process_8k_filings(root_dir):
    """Traverse directory and process all 8-K filings."""
    filings_data = []
    
    for stock_name in os.listdir(root_dir):
        stock_path = os.path.join(root_dir, stock_name, '8-K')
        if not os.path.exists(stock_path):
            continue
            
        for accession_number in os.listdir(stock_path):
            file_path = os.path.join(stock_path, accession_number, 'full-submission.txt')
            if not os.path.exists(file_path):
                continue
                
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                
            # Parse header and body
            header = parse_sec_header(content)
            sentences = parse_body_content(content)
            
            for sentence in sentences:
                filings_data.append({
                    'stock': stock_name,
                    'accession_number': header['accession_number'],
                    'company_name': header['company_name'],
                    'filing_date': header['filing_date'],
                    'report_date': header['report_date'],
                    'cik': header['cik'],
                    'sic': header['sic'],
                    'items': header['items'],
                    'text': sentence
                })
    
    return pd.DataFrame(filings_data)

root_dir = 'sec_finra_data/sec_filings/sec-edgar-filings'
filings_df = process_8k_filings(root_dir)

In [4]:
# Unique Item types of 8-k filings
list(set(item for sublist in filings_df['items'] for item in sublist))

['Regulation FD Disclosure',
 'Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year',
 'Material Modifications to Rights of Security Holders',
 'Material Impairments',
 'Entry into a Material Definitive Agreement',
 'Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; Transfer of Listing',
 'Results of Operations and Financial Condition',
 'Termination of a Material Definitive Agreement',
 'Other Events',
 'Changes in Control of Registrant',
 'Financial Statements and Exhibits',
 'Unregistered Sales of Equity Securities',
 'Completion of Acquisition or Disposition of Assets',
 'Cost Associated with Exit or Disposal Activities',
 'Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or Completed Interim Review',
 'Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement of a Registrant',
 'Submission of Matters to a Vote of Security Holders',
 'Triggering Events That Acce

* **Risk Score:** A numerical score based on the potential impact of event types on stock prices.
* **Sentiment Analysis:** FinBERT is used to analyze the sentiment of the narrative text, which can indicate the tone.
* **Date Lag:** Measures delays in reporting, which could suggest intentional timing to influence markets.
* **Event Frequency:** High filing frequency may indicate attempts to manipulate market perception through frequent news.

In [12]:
# Load FinBERT for sentiment analysis
sentiment_analyzer = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", truncation=True, max_length=512)

def assign_risk_score(items):
    """Assign risk score based on event types."""
    high_risk = ['Changes in Control of Registrant', 'Entry into a Material Definitive Agreement', 'Material Impairments', 'Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers: Compensatory Arrangements of Certain Officers', 'Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; Transfer of Listing', 'Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement', 'Termination of a Material Definitive Agreement', 'Unregistered Sales of Equity Securities', 'Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or Completed Interim Review']
    medium_risk = ["Results of Operations and Financial Condition", 'Completion of Acquisition or Disposition of Assets', 'Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year', 'Regulation FD Disclosure', 'Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement of a Registrant', 'Cost Associated with Exit or Disposal Activities']
    
    for item in items:
        if item in high_risk:
            return 3
        if item in medium_risk:
            return 2
    return 1

def get_finbert_sentiment(text):
    """Get sentiment score using FinBERT."""
    try:
        result = sentiment_analyzer(text)[0]  # FinBERT has token limits
        return result['label'], result['score']
    except:
        return "neutral", 0.0

def keyword_sentiment(text):
    """keyword-based sentiment analysis."""
    positive_keywords = ['growth', 'profit', 'increase', 'successful', 'expansion', 'agreement', 'strong']
    negative_keywords = ['loss', 'impairment', 'decline', 'lawsuit', 'restructuring', 'termination', 'weak']
    
    positive_count = sum(1 for word in positive_keywords if word in text.lower())
    negative_count = sum(1 for word in negative_keywords if word in text.lower())
    
    if positive_count > negative_count:
        return "positive", 0.7
    elif negative_count > positive_count:
        return "negative", 0.7
    return "neutral", 0.5

# Feature engineering
filings_df['filing_date'] = pd.to_datetime(filings_df['filing_date'], format='%Y%m%d')
filings_df['report_date'] = pd.to_datetime(filings_df['report_date'], format='%Y%m%d')
filings_df['date_lag'] = (filings_df['filing_date'] - filings_df['report_date']).dt.days
filings_df['risk_score'] = filings_df['items'].apply(assign_risk_score)

# Sentiment analysis on body text
filings_df['finbert_label'], filings_df['finbert_score'] = zip(*filings_df['text'].apply(get_finbert_sentiment))
filings_df['keyword_label'], filings_df['keyword_score'] = zip(*filings_df['text'].apply(keyword_sentiment))



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [15]:
filings_df

Unnamed: 0,stock,accession_number,company_name,filing_date,report_date,cik,sic,items,text,date_lag,risk_score,finbert_label,finbert_score,keyword_label,keyword_score
0,TLRY,0001193125-21-046627,"Tilray, Inc.",2021-02-17,2021-02-17,0001731348,2833,[Results of Operations and Financial Condition...,8-K false 0001731348 0001731348 2021-02-17 202...,0,2,Neutral,0.999959,neutral,0.5
1,TLRY,0001193125-21-046627,"Tilray, Inc.",2021-02-17,2021-02-17,0001731348,2833,[Results of Operations and Financial Condition...,below): Securities registered pursuant to Sect...,0,2,Neutral,0.999814,positive,0.7
2,TLRY,0001193125-21-046627,"Tilray, Inc.",2021-02-17,2021-02-17,0001731348,2833,[Results of Operations and Financial Condition...,Emerging growth company ☐ If an emerging growt...,0,2,Neutral,0.999106,positive,0.7
3,TLRY,0001193125-21-046627,"Tilray, Inc.",2021-02-17,2021-02-17,0001731348,2833,[Results of Operations and Financial Condition...,"☐ On February 17, 2021, Tilray, Inc. (“Tilray”...",0,2,Neutral,0.999969,neutral,0.5
4,TLRY,0001193125-21-046627,"Tilray, Inc.",2021-02-17,2021-02-17,0001731348,2833,[Results of Operations and Financial Condition...,A copy of the press release is furnished herew...,0,2,Neutral,0.999685,neutral,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,PLUG,0001104659-22-110446,PLUG POWER INC,2022-10-21,2022-10-19,0001093691,3620,"[Regulation FD Disclosure, Financial Statement...",In the event the Company is not able to accura...,2,2,Negative,0.964708,neutral,0.5
7471,PLUG,0001104659-22-110446,PLUG POWER INC,2022-10-21,2022-10-19,0001093691,3620,"[Regulation FD Disclosure, Financial Statement...",Investors are cautioned not to unduly rely on ...,2,2,Neutral,0.694875,neutral,0.5
7472,PLUG,0001104659-22-110446,PLUG POWER INC,2022-10-21,2022-10-19,0001093691,3620,"[Regulation FD Disclosure, Financial Statement...",For a further description of the risks and unc...,2,2,Neutral,0.772528,neutral,0.5
7473,PLUG,0001104659-22-110446,PLUG POWER INC,2022-10-21,2022-10-19,0001093691,3620,"[Regulation FD Disclosure, Financial Statement...",The forward-looking statements are made as of ...,2,2,Neutral,0.999123,neutral,0.5


In [30]:
# Aggregate sentiment per filing
def aggregate_sentiment(group):
    """Aggregate sentence-level sentiments to filing-level."""
    finbert_scores = group['finbert_score'].values
    finbert_labels = group['finbert_label'].values
    
    # Weighted average of sentiment scores (weighted by confidence)
    positive_score = sum(score for score, label in zip(finbert_scores, finbert_labels) if label == 'positive') / max(1, sum(1 for label in finbert_labels if label == 'positive'))
    negative_score = sum(score for score, label in zip(finbert_scores, finbert_labels) if label == 'negative') / max(1, sum(1 for label in finbert_labels if label == 'negative'))
    
    # Combine FinBERT and keyword sentiments
    keyword_label = group['keyword_label'].mode()[0]
    
    if positive_score > negative_score and positive_score > 0.5:
        return pd.Series({'sentiment_label': 'positive', 'sentiment_score': positive_score})
    elif negative_score > positive_score and negative_score > 0.5:
        return pd.Series({'sentiment_label': 'negative', 'sentiment_score': negative_score})
    elif keyword_label != 'neutral':
        return pd.Series({'sentiment_label': keyword_label, 'sentiment_score': group['keyword_score'].mean()})
    return pd.Series({'sentiment_label': 'neutral', 'sentiment_score': 0.5})

# Aggregate to filing level
filing_agg_df = filings_df.groupby(['stock', 'accession_number']).apply(aggregate_sentiment).reset_index()

# Convert items list to string to make it hashable
filings_df['items_str'] = filings_df['items'].apply(lambda x: '|'.join(x) if isinstance(x, list) else '')

# Merge back metadata
final_filing_df = filing_agg_df.merge(
    filings_df[['stock', 'accession_number', 'company_name', 'filing_date', 'report_date', 'cik', 'sic', 'items', 'items_str', 'date_lag', 'risk_score']].drop_duplicates(subset=['stock', 'accession_number', 'company_name', 'filing_date', 'report_date', 'cik', 'sic', 'items_str', 'date_lag', 'risk_score']),
    on=['stock', 'accession_number']
)

# Drop the temporary items_str column
final_filing_df = final_filing_df.drop(columns=['items_str'])

final_filing_df

  filing_agg_df = filings_df.groupby(['stock', 'accession_number']).apply(aggregate_sentiment).reset_index()


Unnamed: 0,stock,accession_number,sentiment_label,sentiment_score,company_name,filing_date,report_date,cik,sic,items,date_lag,risk_score
0,AAPL,0000320193-18-000005,neutral,0.5,APPLE INC,2018-02-01,2018-02-01,0000320193,3571,[Results of Operations and Financial Condition...,0,2
1,AAPL,0000320193-18-000067,neutral,0.5,APPLE INC,2018-05-01,2018-05-01,0000320193,3571,[Results of Operations and Financial Condition...,0,2
2,AAPL,0000320193-18-000098,neutral,0.5,APPLE INC,2018-07-31,2018-07-31,0000320193,3571,[Results of Operations and Financial Condition...,0,2
3,AAPL,0000320193-18-000142,neutral,0.5,APPLE INC,2018-11-01,2018-11-01,0000320193,3571,[Results of Operations and Financial Condition...,0,2
4,AAPL,0000320193-19-000002,neutral,0.5,APPLE INC,2019-01-02,2019-01-02,0000320193,3571,[Results of Operations and Financial Condition...,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
447,TSLA,0001564590-23-005126,neutral,0.5,"Tesla, Inc.",2023-04-03,2023-04-02,0001318605,3711,[Results of Operations and Financial Condition...,1,2
448,TSLA,0001564590-23-005462,neutral,0.5,"Tesla, Inc.",2023-04-05,2023-03-30,0001318605,3711,[Amendments to Articles of Incorporation or By...,6,2
449,TSLA,0001564590-23-005959,neutral,0.5,"Tesla, Inc.",2023-04-19,2023-04-19,0001318605,3711,[Results of Operations and Financial Condition...,0,2
450,TSLA,0001564590-23-007379,neutral,0.5,"Tesla, Inc.",2023-05-22,2023-05-16,0001318605,3711,[Submission of Matters to a Vote of Security H...,6,1


In [33]:
final_filing_df['sentiment_label'].value_counts()

sentiment_label
neutral     420
positive     31
negative      1
Name: count, dtype: int64

In [31]:
# Event frequency per stock per month
final_filing_df['month'] = final_filing_df['filing_date'].dt.to_period('M')
event_freq = final_filing_df.groupby(['stock', 'month']).size().reset_index(name='event_count')
final_filing_df = final_filing_df.merge(event_freq, on=['stock', 'month'], how='left')

final_filing_df.to_csv('sec_finra_data/sec_filings/engineered_8k_filings.csv', index=False)