Importing needed packages

In [None]:
import csv
import os
import re

import pandas as pd
from sec_api import ExtractorApi, QueryApi

Setting the API key. This key is generated on the sec-api.io website.
The API_KEY is a string that serves as an authentication token, granting the user access to the API's services.

In [None]:
# Authentication token passed to the sec-api client classes.
# It is read from the SEC_API_KEY environment variable so the real key does not
# have to be saved inside the notebook; when the variable is not set, the
# placeholder below is used and can still be replaced by hand.
API_KEY = os.environ.get('SEC_API_KEY', 'your own API key')

Specify the location of your data file and the destination folder for the extracted text.

In [None]:
# Excel workbook read with pd.read_excel; its 'ticker_symbol' column supplies the tickers.
DATA_FILE = "./data.xlsx"

# Prefix used to build each "<ticker>.txt" output path (the trailing slash is part of the prefix).
OUTPUT_FOLDER = "/output_folder/"

ExtractorApi is designed to extract specific information from SEC filings, such as 10-Ks.

QueryApi is used to perform queries or searches in the SEC database.

In [None]:
# Build both sec-api clients with the same key:
#   extractorApi -> used later through get_section(...)
#   queryApi     -> used later through get_filings(...)
extractorApi, queryApi = ExtractorApi(API_KEY), QueryApi(API_KEY)

Reading the data file into a dataframe and saving the list of tickers of the firms of interest, to be used in a for loop for text extraction.

In [None]:
# Load the spreadsheet that lists the firms of interest.
df = pd.read_excel(DATA_FILE)

# Build the ticker list consumed by the extraction loop:
#   - drop missing cells and cast the rest to str,
#   - trim surrounding whitespace so padded cells do not produce bad queries,
#   - drop blank entries and repeated symbols (first-seen order is kept) so each
#     firm is queried, extracted and written only once.
ticker_series = df['ticker_symbol'].dropna().astype('str').str.strip()
list_of_tickers = ticker_series[ticker_series != ''].drop_duplicates().tolist()

Code for extraction

In [None]:

# Precompiled pattern that matches newline characters and numeric HTML
# entities (e.g. "&#160;"); every match is deleted from the extracted text.
# NOTE(review): newlines are removed rather than replaced by a space, so words
# on adjacent lines end up joined - confirm this is intended.
clean_pattern = re.compile(r"\n|&#[0-9]+;")

# Create the destination folder up front so the first write cannot fail merely
# because the folder does not exist yet.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Process the tickers: one query and one output file per ticker
for ticker in list_of_tickers:
    # Query payload: 10-K filings for this ticker, most recently filed first
    query = {
        "query": {"query_string": {
            "query": f"formType:\"10-K\" AND ticker:\"{ticker}\"",
        }},
        "from": "0",
        "sort": [{"filedAt": {"order": "desc"}}]
    }

    response = queryApi.get_filings(query)
    filings = response.get('filings', [])

    # No filings returned for this ticker: skip it without writing a file
    if not filings:
        continue

    # Section texts are collected in a list and joined once at the end
    section_texts = []
    # (item, exception) pairs for every section that could not be extracted
    failures = []

    # Extract the sections for each filing
    for filing in filings:
        for item in ['1', '7', '9', '9A', '9B']:
            try:
                section_text = extractorApi.get_section(filing['linkToFilingDetails'], item, 'text')
            except Exception as exc:
                # Best effort: a section that cannot be fetched is skipped,
                # but the failure is recorded instead of being silently lost.
                failures.append((item, exc))
                continue
            # Only string results can be concatenated; anything else is ignored
            if isinstance(section_text, str):
                section_texts.append(section_text)

    # One summary line per ticker keeps the output short while still exposing
    # problems such as an invalid key or a rate limit.
    if failures:
        first_item, first_error = failures[0]
        print(
            f"{ticker}: {len(failures)} section(s) could not be extracted; "
            f"first failure on item {first_item}: {first_error!r}"
        )

    concatenated_texts = "".join(section_texts)

    # Clean the concatenated text
    cleaned_text = clean_pattern.sub("", concatenated_texts)

    # Write the cleaned text to a file. os.path.join tolerates an OUTPUT_FOLDER
    # with or without a trailing slash, and the explicit UTF-8 encoding keeps
    # non-ASCII characters in filings from failing under the platform default.
    file_path = os.path.join(OUTPUT_FOLDER, f"{ticker}.txt")
    with open(file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)
