In [None]:
import requests
import re # Can encounter XML namespaces... use regex
from bs4 import BeautifulSoup

# 13F-HR Parsing
The core of a 13F holdings report is its information table; see the SEC's [Form 13F reference](https://www.sec.gov/about/forms/form13f.pdf). Data about "other managers" (combination-style reports) may also be useful, but for now we focus only on the positions themselves. See the SEC's [13F FAQ](https://www.sec.gov/divisions/investment/13ffaq) for more on 13F filings.

In [None]:
"""
Retrieves positions from modern 13F-HRs (i.e. those with an XML information table).
Given the URL to the full text filing, returns a list of dictionaries of the following structure:

holding = {
  "issuer" : "APPLE INC",
  "class" : "COM", # Or "SHS CLASS A" etc
  "cusip" : "CUSIP",
  "amount" : NUM_OF_SECURITY_OWNED,
  "type" : "TYPE_OF_SEC" # "SH", "CALL", or "PUT"
}
"""
def info_table_from_fulltxt(fulltxt_url):
  """
  Download a 13F-HR full text filing and extract its holdings.

  The response text is lower-cased, split on "<document>", and every document
  containing "<type>information table" has its embedded XML parsed with
  BeautifulSoup. Tag names are matched by regex so that namespace-prefixed
  tags (e.g. "ns1:infotable") are found as well as unprefixed ones.

  Args:
    fulltxt_url: URL of the filing's full text (.txt) submission.

  Returns:
    A list with one dict per position. Every value is an upper-cased string:
      "issuer" : text of the nameOfIssuer element
      "class"  : text of the titleOfClass element
      "cusip"  : text of the cusip element
      "amount" : text of the sshPrnamt element
      "type"   : text of the sshPrnamtType element (e.g. "SH")
    Positions missing any of these elements are reported and skipped. If the
    filing cannot be split or parsed as expected, a message is printed and the
    positions collected so far (usually none) are returned.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.RequestException: on connection problems or timeout.
  """

  # List to be returned
  master_list = []

  # Get full text content
  request_headers = { "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" }
  # Without a timeout, requests may block forever on a stalled connection
  response = requests.get(url = fulltxt_url, headers = request_headers, timeout = 30)
  response.raise_for_status()

  # NOTE: Lowercase the text so tag matching below is case-insensitive
  fulltxt_content = response.text.lower()

  # Split by "<document>", find the one(s) with "<type>information table".
  # str.split always yields at least one piece, so a missing tag shows up as
  # a single piece rather than an empty list.
  split_docs = fulltxt_content.split("<document>")
  if len(split_docs) < 2:
    print("Failed to find a <document> tag in {}. Check file format".format(fulltxt_url))
    return master_list

  # Compile the tag patterns once instead of once per position.
  # "sshprnamt" is anchored at the end so it cannot match "sshprnamttype".
  field_tags = {
    "issuer" : re.compile("nameofissuer"),
    "class" : re.compile("titleofclass"),
    "cusip" : re.compile("cusip"),
    "amount" : re.compile("sshprnamt$"),
    "type" : re.compile("sshprnamttype")
  }

  for doc in split_docs:
    if "<type>information table" not in doc:
      continue

    # Jump into XML tag
    split_it = doc.split("<xml>")
    if len(split_it) < 2:
      print("Failed to split information table for {}. Check file format".format(fulltxt_url))
      return master_list

    # Clean it up: drop the SGML wrapper's closing tags that trail the XML
    stripped_xml = split_it[1].replace("</xml>", "").replace("</text>", "").replace("</document>", "").replace("</sec-document>", "").strip()

    # Get root informationtable, holds one infotable child per position/holding
    soup = BeautifulSoup(stripped_xml, features = "xml")
    parent_infotable = soup.find(re.compile("informationtable"))

    # find() returns None when nothing matches; len() on it would raise
    if parent_infotable is None:
      print("Failed to find <informationTable> of {}. Check file format".format(fulltxt_url))
      return master_list

    # Loop through the child infotables
    positions_list = parent_infotable.find_all(re.compile("infotable"))
    if not positions_list:
      print("No holdings found in info table of {}".format(fulltxt_url))
      return master_list

    for position_index, position in enumerate(positions_list, start = 1):

      # Build the dict for this position, all caps.
      # A missing element makes find() return None, so .text raises AttributeError.
      try:
        holding = {
          key : position.find(tag).text.strip().upper()
          for key, tag in field_tags.items()
        }
      except AttributeError:
        print("Failed reading values of position #{} in {}. Skipping".format(position_index, fulltxt_url))
        continue

      master_list.append(holding)

  return master_list

In [None]:
# Demo: fetch one filing and print its parsed holdings.
# Alternate sample whose information table HAS NAMESPACE-prefixed tags:
#   https://www.sec.gov/Archives/edgar/data/1532472/000153247222000004/0001532472-22-000004.txt
print(
  info_table_from_fulltxt(
    "https://www.sec.gov/Archives/edgar/data/1715593/000171559322000005/0001715593-22-000005.txt"  # NO NAMESPACE
  )
)

In [None]:
# TODO: CUSIP lookup via https://www.openfigi.com/api 

In [None]:
# TODO: Methods to compare two 13F-HRs against each other