In [1]:
# HTTP lib
import requests

# Used to decide what quarter we are in
from datetime import date


The following guidance is given by the SEC website on the daily and full index files (from https://www.sec.gov/os/accessing-edgar-data):
```
Using the EDGAR index files

Indexes to all public filings are available from 1994Q3 through the present and located in the following browsable directories:

    /Archives/edgar/daily-index — daily index files through the current year;
    /Archives/edgar/full-index — full indexes offer a "bridge" between quarterly and daily indexes, compiling filings from the beginning of the current quarter through the previous business day. At the end of the quarter, the full index is rolled into a static quarterly index.

Each directory and all child subdirectories contain three files to assist in automated crawling of these directories. Note that these are not visible through directory browsing.

    index.html (the web browser would normally receive these)
    index.xml (an XML structured version of the same content)
    index.json (a JSON structured vision of the same content)

The EDGAR indexes list the following information for each filing:

    company name
    form type
    central index key (CIK)
    date filed
    file name (including folder path)

Four types of indexes are available:

    company — sorted by company name
    form — sorted by form type
    master — sorted by CIK number
    XBRL — list of submissions containing XBRL financial files, sorted by CIK number; these include Voluntary Filer Program submissions

The company, form, and master indexes contain the same information sorted differently
```
Additionally:



```
Paths and directory structure

The index paths link to the raw text version of the complete disseminated filing content, for example:

    /Archives/edgar/data/1122304/0001193125-15-118890.txt

Post-EDGAR 7.0 filings (after May 26, 2000) are also accessible via an alternative symbolic path, incorporating an intermediate accession-number directory without dashes. All the documents submitted for a given filing will be in this directory:

    /Archives/edgar/data/1122304/000119312515118890/0001193125-15-118890.txt

Other content that may be of interest using the root path:

    /Archives/edgar/data/1122304/0001193125-15-118890-index.html — an HTML version including hyperlinked table of submitted documents.
    /Archives/edgar/data/1122304/000119312515118890/0001193125-15-118890.hdr.sgml —
    the SGML header contents. Note the additional "accession-number-without-dashes" directory in the path.

```



We primarily want to be able to search these indexes by CIK, form type, or date range. We will focus on the "master" index format as it is the most easily parsed. The XBRL indexes (only offered under full-index) may also be of interest: to use them, replace "master.idx" with "xbrl.idx" in the methods below. Their file structure and delimiters are the same as the master.idx format shown below:


# Full-index master.idx format:


```
Description:           Master Index of EDGAR Dissemination Feed
Last Data Received:    June 30, 2022
Comments:              webmaster@sec.gov
Anonymous FTP:         ftp://ftp.sec.gov/edgar/
Cloud HTTP:            https://www.sec.gov/Archives/

 
 
 
CIK|Company Name|Form Type|Date Filed|Filename
--------------------------------------------------------------------------------
1000045|NICHOLAS FINANCIAL INC|10-K|2022-06-24|edgar/data/1000045/0000950170-22-012061.txt
1000045|NICHOLAS FINANCIAL INC|3|2022-05-19|edgar/data/1000045/0001929257-22-000001.txt
1000045|NICHOLAS FINANCIAL INC|4|2022-05-11|edgar/data/1000045/0001000045-22-000003.txt
...
```
The lines of interest begin after the separator line, which consists of many consecutive "-" characters followed by a newline. The columns of each line are separated by pipe characters ("|").



# Daily-index master.idx format:


```
Description:           Daily Index of EDGAR Dissemination Feed
Last Data Received:    Sep 26, 2022
Comments:              webmaster@sec.gov
Anonymous FTP:         ftp://ftp.sec.gov/edgar/
 
CIK|Company Name|Form Type|Date Filed|File Name
--------------------------------------------------------------------------------
1000228|HENRY SCHEIN INC|4|20220926|edgar/data/1000228/0001209191-22-051256.txt
1000275|ROYAL BANK OF CANADA|424B2|20220926|edgar/data/1000275/0001140361-22-034664.txt
```
The only difference from the full-index format is the date column: it contains no dashes and is in the format YYYYMMDD.


In [2]:
# Support method used below. Given a date object, determines the quarter number. 
# Quarters are literally just divided into four 3-month groups: Jan-Mar, Apr-Jun, Jul-Sep, Oct-Dec (month 1-3, 4-6, 7-9, 10-12)
def get_quarter_from_date(date_obj):
  """
  Return the calendar quarter (1-4) that date_obj.month falls in.

  Quarters are fixed three-month groups: months 1-3, 4-6, 7-9 and 10-12.
  If the month lies outside 1-12, a message is printed and 0 is returned,
  so callers can treat the result as falsy on failure.
  """
  month = date_obj.month

  # (first month, last month) of each quarter, in quarter order
  quarter_bounds = ((1, 3), (4, 6), (7, 9), (10, 12))

  for quarter_number, (first_month, last_month) in enumerate(quarter_bounds, start=1):
    if first_month <= month <= last_month:
      return quarter_number

  print("Failed to determine quarter from given date.")
  return 0

# Filtering filings by CIK, company name, form type, or date

This first method filters the given (year and) quarter's master index to return a list of filings according to a given filter type and list of target values. The next method achieves the same thing but using the daily-index instead.

In [None]:
def filter_quarter_by_param(target_year, target_quarter, filter_type, unique_vals = ()):
  """
  Download one quarter's full-index master.idx and return the filings matching a filter.

  Parameters:
    target_year    -- year of the quarterly index; substituted into the URL as-is
    target_quarter -- quarter number; substituted into the URL as "QTR{target_quarter}"
    filter_type    -- "cik", "company", "type", or "date"
    unique_vals    -- iterable of values to match (CIKs, company names, form types, or dates).
                      NOTE: dates should be passed as date objects.
                      Company names and form types are compared case-insensitively;
                      leading zeros on CIKs are ignored.

  Returns a list of dictionaries of the following structure:
  {
    "cik" : "CIK_NUM",            (zero-padded to 10 characters)
    "company" : "COMPANY_NAME",
    "type" : "FILING_TYPE",
    "date" : "YYYY-MM-DD",
    "fulltext_path" : ".../edgar/data/CIK/ETC"
  }

  An empty list is returned, after printing a message, when filter_type is not recognised,
  when the date values cannot be formatted, or when the separator line is missing from the file.
  Errors raised by requests.get / raise_for_status are not caught here.
  """
  filings_list = []

  # Work out which column to compare and normalise the wanted values.
  # A set gives constant-time membership checks for every row of the index.
  if filter_type == "cik":
    filter_column = 0
    # Strip leading 0's for consistency; the IDX rows list CIKs without them.
    wanted_vals = {str(val).lstrip("0") for val in unique_vals}

  elif filter_type == "company":
    filter_column = 1
    wanted_vals = {val.lower() for val in unique_vals}

  elif filter_type == "type":
    filter_column = 2
    wanted_vals = {val.lower() for val in unique_vals}

  elif filter_type == "date":
    filter_column = 3
    # Full-index rows write dates as "YYYY-MM-DD"
    try:
      wanted_vals = {
        "{}-{}-{}".format(d.year, str(d.month).zfill(2), str(d.day).zfill(2))
        for d in unique_vals
      }
    except (AttributeError, TypeError):
      print("Invalid filter date objects passed.")
      return filings_list

  else:
    print("Invalid quarterly filter type. Must be cik, company, type, or date: {}".format(filter_type))
    return filings_list

  # URLs and UA for full-index
  qtr_index_url = r"https://www.sec.gov/Archives/edgar/full-index/{}/QTR{}/master.idx".format(target_year, target_quarter)
  base_archives = "https://www.sec.gov/Archives/"
  # NOTE(review): this is a browser-style User-Agent; confirm it satisfies SEC's access guidance for automated tools.
  req_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }

  # Get the .idx file. Without a timeout, requests would wait indefinitely on a stalled connection.
  resp = requests.get(url = qtr_index_url, headers = req_headers, timeout = 30)
  resp.raise_for_status()

  # Separate the header from the data rows: the header ends with a line of dashes.
  # Matching twenty dashes followed by a newline is enough to find the end of that line.
  _header, separator, idx_body = resp.text.partition("--------------------\n")
  if not separator:
    print("IDX file was in unexpected format, error parsing")
    return filings_list

  for line in idx_body.splitlines():

    # CIK|Company Name|Form Type|Date Filed|Filename
    # We expect 5 columns; anything else is skipped
    columns = line.split("|")
    if len(columns) != 5:
      continue

    if columns[filter_column].lower() in wanted_vals:
      filings_list.append({
        "cik" : columns[0].zfill(10),
        "company" : columns[1],
        "type" : columns[2],
        "date" : columns[3],
        "fulltext_path" : base_archives + columns[4],
      })

  return filings_list

In [None]:
# Demo: filter the current quarter's full index for filings by CIK 320193.
todays_date = date.today()
qtrnum = get_quarter_from_date(todays_date)

# get_quarter_from_date returns 0 when it cannot determine the quarter, so only query on a valid result
if (qtrnum):
  print(filter_quarter_by_param(todays_date.year, qtrnum, "cik", [320193]))

In [None]:
def filter_day_by_param(target_date, filter_type, unique_vals = ()):
  """
  Download the daily-index master file for target_date and return the filings matching a filter.

  Similar to filter_quarter_by_param, but targets the daily index
  (.../daily-index/YYYY/QTRn/master.YYYYMMDD.idx) of the given date object.

  Parameters:
    target_date -- date (or datetime) object selecting which daily index to download
    filter_type -- "cik", "company", "type", or "date"
    unique_vals -- iterable of values to match (CIKs, company names, form types, or dates).
                   NOTE: dates should be passed as date objects.
                   Company names and form types are compared case-insensitively;
                   leading zeros on CIKs are ignored.

  Returns a list of dictionaries of the following structure:
  {
    "cik" : "CIK_NUM",            (zero-padded to 10 characters)
    "company" : "COMPANY_NAME",
    "type" : "FILING_TYPE",
    "date" : "YYYYMMDD",          (daily-index rows carry dates without dashes)
    "fulltext_path" : ".../edgar/data/CIK/ETC"
  }

  An empty list is returned, after printing a message, when target_date is not a date,
  when filter_type is not recognised, when the date values cannot be formatted, or when
  the separator line is missing from the file.
  Errors raised by requests.get / raise_for_status are not caught here.
  """
  filings_list = []

  # Verify that target_date is of the right type.
  # isinstance (rather than an exact type comparison) also accepts datetime objects, which are dates.
  if not isinstance(target_date, date):
    print("Invalid target date format (not a date object).")
    return filings_list

  # Work out which column to compare and normalise the wanted values.
  # A set gives constant-time membership checks for every row of the index.
  if filter_type == "cik":
    filter_column = 0
    # Strip leading 0's for consistency; the IDX rows list CIKs without them.
    wanted_vals = {str(val).lstrip("0") for val in unique_vals}

  elif filter_type == "company":
    filter_column = 1
    wanted_vals = {val.lower() for val in unique_vals}

  elif filter_type == "type":
    filter_column = 2
    wanted_vals = {val.lower() for val in unique_vals}

  elif filter_type == "date":
    filter_column = 3
    # Daily-index rows write dates as "YYYYMMDD" (no dashes), unlike the quarterly full index,
    # so the wanted values must be built in that form or nothing would ever match.
    try:
      wanted_vals = {
        "{}{}{}".format(d.year, str(d.month).zfill(2), str(d.day).zfill(2))
        for d in unique_vals
      }
    except (AttributeError, TypeError):
      print("Invalid filter date objects passed.")
      return filings_list

  else:
    # Reject unknown filter types up front instead of downloading the file and matching nothing
    print("Invalid daily filter type. Must be cik, company, type, or date: {}".format(filter_type))
    return filings_list

  # Build path: master.YYYYMMDD.idx under the year/quarter directory
  daily_index_url = r"https://www.sec.gov/Archives/edgar/daily-index/{}/QTR{}/master.{}{}{}.idx".format(
    target_date.year,
    get_quarter_from_date(target_date),
    target_date.year,
    str(target_date.month).zfill(2),
    str(target_date.day).zfill(2),
  )
  base_archives = "https://www.sec.gov/Archives/"
  # NOTE(review): this is a browser-style User-Agent; confirm it satisfies SEC's access guidance for automated tools.
  req_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }

  # Get the file. Without a timeout, requests would wait indefinitely on a stalled connection.
  resp = requests.get(url = daily_index_url, headers = req_headers, timeout = 30)
  resp.raise_for_status()

  # Separate the header from the data rows: the header ends with a line of dashes.
  # Matching twenty dashes followed by a newline is enough to find the end of that line.
  _header, separator, idx_body = resp.text.partition("--------------------\n")
  if not separator:
    print("IDX file was in unexpected format, error parsing")
    return filings_list

  for line in idx_body.splitlines():

    # CIK|Company Name|Form Type|Date Filed|File Name
    # We expect 5 columns; anything else is skipped
    columns = line.split("|")
    if len(columns) != 5:
      continue

    if columns[filter_column].lower() in wanted_vals:
      filings_list.append({
        "cik" : columns[0].zfill(10),
        "company" : columns[1],
        "type" : columns[2],
        "date" : columns[3],
        "fulltext_path" : base_archives + columns[4],
      })

  return filings_list

In [None]:
# Demo: list the filings of form type "4" from the daily index for 2022-09-26.
test_date = date(2022, 9, 26)
print(filter_day_by_param(test_date, "type", ["4"]))

# Sorting/grouping filings by CIK, company name, form type, or date
These methods return a dictionary. At the first level, the key `sort_type` tells what column the results have been grouped by. 



```
{
  "sort_type" : "", # Either "cik", "name", "type", or "date"
  "results" : []
}
```
The `results` key contains a list of dictionaries, one formed for each unique value of the "sorting" parameter (CIK, company name, form type, or date) which is found in the IDX file. 
```
{
  "UNIQUE_VAL" : [] # List of filings under that unique value. Dictionaries of the following format:
}
```
The structure of these dictionaries is as follows:
```
{
  "cik" : "CIK_NUM",
  "company" : "COMPANY_NAME",
  "type" : "FILING_TYPE",
  "date" : "YYYY-MM-DD", # For full-index IDX; daily-index IDX gives "YYYYMMDD"
  "fulltext_path" : ".../edgar/data/CIK/ETC"
}
```



In [None]:
def sort_quarter_by_param(target_year, target_quarter, sort_type):
  """
  Download one quarter's full-index master.idx and group its filings by one column.

  Parameters:
    target_year    -- year of the quarterly index; substituted into the URL as-is
    target_quarter -- quarter number; substituted into the URL as "QTR{target_quarter}"
    sort_type      -- "cik", "name", "type", or "date": the column to group by

  Returns a dictionary:
  {
    "sort_type" : sort_type,
    "results" : [ { "UNIQUE_VAL" : [filing, ...] }, ... ]
  }
  "results" holds one single-key dictionary per distinct value of the chosen column, in the
  order the values first appear in the file. The key is the raw column text. Each filing is:
  {
    "cik" : "CIK_NUM",            (zero-padded to 10 characters)
    "company" : "COMPANY_NAME",
    "type" : "FILING_TYPE",
    "date" : "YYYY-MM-DD",
    "fulltext_path" : ".../edgar/data/CIK/ETC"
  }

  An empty dictionary is returned, after printing a message, when sort_type is not recognised.
  If the separator line is missing from the file, a message is printed and "results" stays empty.
  Errors raised by requests.get / raise_for_status are not caught here.
  """
  master_dict = {}

  # Position in this tuple == column index in a "CIK|Company Name|Form Type|Date Filed|Filename" row
  sort_columns = ("cik", "name", "type", "date")
  if sort_type not in sort_columns:
    print("Invalid sort type: {}".format(sort_type))
    return master_dict
  sorting_column = sort_columns.index(sort_type)

  master_dict["sort_type"] = sort_type
  master_dict["results"] = []

  # Download the IDX content
  qtr_index_url = r"https://www.sec.gov/Archives/edgar/full-index/{}/QTR{}/master.idx".format(target_year, target_quarter)
  base_archives = "https://www.sec.gov/Archives/"
  # NOTE(review): this is a browser-style User-Agent; confirm it satisfies SEC's access guidance for automated tools.
  req_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }

  # Without a timeout, requests would wait indefinitely on a stalled connection.
  resp = requests.get(url = qtr_index_url, headers = req_headers, timeout = 30)
  resp.raise_for_status()

  # Separate the header from the data rows: the header ends with a line of dashes.
  # Matching twenty dashes followed by a newline is enough to find the end of that line.
  _header, separator, idx_body = resp.text.partition("--------------------\n")
  if not separator:
    print("IDX file was in unexpected format, error parsing")
    return master_dict

  # Maps each distinct sort value to the filing list already stored in master_dict["results"].
  # Looking the list up here avoids rescanning every earlier group for each row of the index.
  filings_by_value = {}

  for line in idx_body.splitlines():

    # CIK|Company Name|Form Type|Date Filed|Filename
    # We expect 5 columns; anything else is skipped
    columns = line.split("|")
    if len(columns) != 5:
      continue

    filing_dict = {
      "cik" : columns[0].zfill(10),
      "company" : columns[1],
      "type" : columns[2],
      "date" : columns[3],
      "fulltext_path" : base_archives + columns[4],
    }

    sort_value = columns[sorting_column]
    group = filings_by_value.get(sort_value)
    if group is None:
      # First time this value is seen: start its group and register it in the results list
      group = []
      filings_by_value[sort_value] = group
      master_dict["results"].append({ sort_value : group })
    group.append(filing_dict)

  return master_dict

In [None]:
# Demo: group the 2022 Q3 full index by CIK. The year and quarter are only substituted into the URL, so strings work here.
sort_quarter_by_param("2022", "3", "cik")

In [3]:
# Similar method but uses the daily-index. Takes a date object as a parameter. NOTE: filing_dict objects of this method return date in YYYYMMDD format rather than YYYY-MM-DD
def sort_day_by_param(target_date, sort_type):
  """
  Download the daily-index master file for target_date and group its filings by one column.

  Parameters:
    target_date -- date object selecting which daily index to download
                   (.../daily-index/YYYY/QTRn/master.YYYYMMDD.idx)
    sort_type   -- "cik", "name", "type", or "date": the column to group by

  Returns a dictionary:
  {
    "sort_type" : sort_type,
    "results" : [ { "UNIQUE_VAL" : [filing, ...] }, ... ]
  }
  "results" holds one single-key dictionary per distinct value of the chosen column, in the
  order the values first appear in the file. The key is the raw column text. Each filing is:
  {
    "cik" : "CIK_NUM",            (zero-padded to 10 characters)
    "company" : "COMPANY_NAME",
    "type" : "FILING_TYPE",
    "date" : "YYYYMMDD",          (daily-index rows carry dates without dashes)
    "fulltext_path" : ".../edgar/data/CIK/ETC"
  }

  An empty dictionary is returned, after printing a message, when sort_type is not recognised.
  If the separator line is missing from the file, a message is printed and "results" stays empty.
  Errors raised by requests.get / raise_for_status are not caught here.
  """
  master_dict = {}

  # Position in this tuple == column index in a "CIK|Company Name|Form Type|Date Filed|File Name" row
  sort_columns = ("cik", "name", "type", "date")
  if sort_type not in sort_columns:
    print("Invalid sort type: {}".format(sort_type))
    return master_dict
  sorting_column = sort_columns.index(sort_type)

  master_dict["sort_type"] = sort_type
  master_dict["results"] = []

  # Build path: master.YYYYMMDD.idx under the year/quarter directory
  daily_index_url = r"https://www.sec.gov/Archives/edgar/daily-index/{}/QTR{}/master.{}{}{}.idx".format(
    target_date.year,
    get_quarter_from_date(target_date),
    target_date.year,
    str(target_date.month).zfill(2),
    str(target_date.day).zfill(2),
  )
  base_archives = "https://www.sec.gov/Archives/"
  # NOTE(review): this is a browser-style User-Agent; confirm it satisfies SEC's access guidance for automated tools.
  req_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }

  # Get contents. Without a timeout, requests would wait indefinitely on a stalled connection.
  resp = requests.get(url = daily_index_url, headers = req_headers, timeout = 30)
  resp.raise_for_status()

  # Separate the header from the data rows: the header ends with a line of dashes.
  # Matching twenty dashes followed by a newline is enough to find the end of that line.
  _header, separator, idx_body = resp.text.partition("--------------------\n")
  if not separator:
    print("IDX file was in unexpected format, error parsing")
    return master_dict

  # Maps each distinct sort value to the filing list already stored in master_dict["results"].
  # Looking the list up here avoids rescanning every earlier group for each row of the index.
  filings_by_value = {}

  for line in idx_body.splitlines():

    # CIK|Company Name|Form Type|Date Filed|File Name
    # We expect 5 columns; anything else is skipped
    columns = line.split("|")
    if len(columns) != 5:
      continue

    filing_dict = {
      "cik" : columns[0].zfill(10),
      "company" : columns[1],
      "type" : columns[2],
      "date" : columns[3],
      "fulltext_path" : base_archives + columns[4],
    }

    sort_value = columns[sorting_column]
    group = filings_by_value.get(sort_value)
    if group is None:
      # First time this value is seen: start its group and register it in the results list
      group = []
      filings_by_value[sort_value] = group
      master_dict["results"].append({ sort_value : group })
    group.append(filing_dict)

  return master_dict

In [4]:
# Demo: group the filings in the 2022-09-26 daily index by company name.
test_date = date(2022, 9, 26)
print(sort_day_by_param(test_date, "name"))

{'sort_type': 'name', 'results': [{'HENRY SCHEIN INC': [{'cik': '0001000228', 'company': 'HENRY SCHEIN INC', 'type': '4', 'date': '20220926', 'fulltext_path': 'https://www.sec.gov/Archives/edgar/data/1000228/0001209191-22-051256.txt'}]}, {'ROYAL BANK OF CANADA': [{'cik': '0001000275', 'company': 'ROYAL BANK OF CANADA', 'type': '424B2', 'date': '20220926', 'fulltext_path': 'https://www.sec.gov/Archives/edgar/data/1000275/0001140361-22-034664.txt'}, {'cik': '0001000275', 'company': 'ROYAL BANK OF CANADA', 'type': '424B2', 'date': '20220926', 'fulltext_path': 'https://www.sec.gov/Archives/edgar/data/1000275/0001140361-22-034716.txt'}, {'cik': '0001000275', 'company': 'ROYAL BANK OF CANADA', 'type': '424B2', 'date': '20220926', 'fulltext_path': 'https://www.sec.gov/Archives/edgar/data/1000275/0001140361-22-034717.txt'}, {'cik': '0001000275', 'company': 'ROYAL BANK OF CANADA', 'type': '424B2', 'date': '20220926', 'fulltext_path': 'https://www.sec.gov/Archives/edgar/data/1000275/0001140361-2