In [64]:
from io import StringIO
from urllib.parse import urljoin

import bs4
import numpy as np
import pandas as pd
import requests

In [65]:
# Prepare the soup
def get_soup(url, timeout=30):
    """Download one or more pages and parse each into a BeautifulSoup tree.

    Parameters
    ----------
    url : str or iterable of str
        A single URL, or a collection of URLs that are fetched in order.
    timeout : float, optional
        Seconds to wait for each response; without it ``requests.get``
        can block indefinitely on a stalled connection.

    Returns
    -------
    list
        One ``bs4.BeautifulSoup`` object per URL, in input order. The
        result is a list even when a single URL string was given.
    """
    # A bare string has to be wrapped, not passed to list(): list('http://x')
    # splits it into single characters and each one would be requested.
    if isinstance(url, str):
        urls = [url]
    else:
        urls = list(url)

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
    headers = {'User-Agent': user_agent}

    soup = []
    for url_ in urls:
        res = requests.get(url_, headers=headers, timeout=timeout)
        soup.append(bs4.BeautifulSoup(res.text, 'html.parser'))

    return soup

In [66]:
# Each report entry takes you to a page with multiple links to the report
#   formatted in a couple different ways - we're going to use html
# NOTE: This only goes through the first page of 40 reports

# Get all links to HTML-formatted report overview page
def get_html_links(url):
    """Collect the absolute URL behind every '[html]' anchor on a listing page.

    Parameters
    ----------
    url : str
        Address of the listing page; it is downloaded through ``get_soup``.

    Returns
    -------
    list of str
        Absolute ``https://www.sec.gov/...`` URLs, in document order.
        Only the first page returned by ``get_soup`` is scanned.
    """
    soup = get_soup(url)
    html_links = []

    for link in soup[0].find_all('a'):
        # .get() avoids a KeyError on anchors that carry no href attribute
        href = link.get('href')
        if href and '[html]' in link.contents:
            # urljoin handles both '/Archives/...' and 'Archives/...' without
            # producing a double slash after the host name
            html_links.append(urljoin('https://www.sec.gov/', href))

    return html_links


# Entry point of the scrape: presumably EDGAR's "current filings" listing
#   (action=getcurrent) - confirm against the live page
page1 = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent'
# Downloads page1 through get_soup every time this cell is run
html_links = get_html_links(page1)

In [67]:
# Get all links to the targeted HTML-formatted reports
def get_target_links(html_links):
    """Find the '.html' report link(s) on each report overview page.

    Parameters
    ----------
    html_links : list of str
        URLs of the overview pages; each one is downloaded through
        ``get_soup``.

    Returns
    -------
    list of str
        Absolute ``https://www.sec.gov/...`` URLs of every anchor whose
        first child contains '.html', across all pages, in page order.
    """
    target_links = []

    for soup in get_soup(html_links):
        for link in soup.find_all('a'):
            contents = link.contents
            href = link.get('href')

            # Empty anchors (no children) and anchors without an href
            # cannot point at a report; skipping them avoids
            # IndexError / KeyError in the middle of a scrape.
            if not contents or not href:
                continue

            if '.html' in contents[0]:
                target_links.append(urljoin('https://www.sec.gov', href))

    return target_links

# Downloads every overview page in html_links (one request each) and keeps
#   the links to the HTML-formatted reports themselves
target_links = get_target_links(html_links)

In [68]:
# Turns html table into pandas dataframe
def pd_table(table):
    """Parse one HTML table element into pandas DataFrames.

    Parameters
    ----------
    table : object with a ``prettify()`` method
        The callers in this notebook pass elements returned by
        ``find_table``.

    Returns
    -------
    list of pandas.DataFrame
        Whatever ``pd.read_html`` finds in the markup; nested tables
        yield more than one frame.
    """
    # read_html treats a plain string as deprecated literal HTML in recent
    # pandas versions; a file-like wrapper is the supported way to pass markup.
    return pd.read_html(StringIO(table.prettify()))

# Gets all the elements with a table tag
def find_table(soup):
    """Return the result of ``soup.find_all('table')``, i.e. every table element."""
    table_tags = soup.find_all('table')
    return table_tags

In [69]:
# Functions for isolating & slicing parts of the table
# The slicing for 'ticker' and 'issuer' isn't the best
#   and doesn't always include everything, but it gets
#   the job done most of the time
def get_trader(table):
    """Return the entry at ``[1][0][0]`` of ``table``.

    ``get_content`` passes the list produced by ``pd_table`` and stores
    the result under 'Trader'; with DataFrames that is column 0, row 0
    of the second frame.
    """
    second_frame = table[1]
    first_column = second_frame[0]
    return first_column[0]

def get_ticker(table):
    """Cut the ticker symbol out of the cell at ``table[0][1][0]``.

    Takes the characters from position -7 up to (not including) -3 and
    strips surrounding whitespace. The offsets are fixed, so the result
    depends on how the cell text happens to end.
    """
    issuer_cell = table[0][1][0]
    symbol_window = issuer_cell[-7:-3]
    return symbol_window.strip()

def get_issuer(table):
    """Cut the issuer name out of the cell at ``table[0][1][0]``.

    Drops the first 47 characters and the last 12; both offsets are
    fixed, so the result depends on the exact layout of the cell text.
    """
    issuer_cell = table[0][1][0]
    without_prefix_and_suffix = issuer_cell[47:-12]
    return without_prefix_and_suffix

def get_txn_date(table):
    """Return the last 10 characters of the cell at ``table[0][1][1]``.

    ``get_content`` stores the result under 'Earlist_Transaction'; ten
    characters matches a date written as MM/DD/YYYY.
    """
    date_cell = table[0][1][1]
    trailing_ten = date_cell[-10:]
    return trailing_ten

In [70]:
# Gets the relationship to issuing entity - e.g. CEO, 10% Owner
# This is the reason the person has to file the report
def get_relationship(table):
    """Build the set of relationship labels found in ``table``.

    Every cell of ``table.values.tolist()`` is kept except the literal
    'X', the literal 'Other (specify below)' and anything whose type is
    exactly ``float`` (which is how missing cells show up). The label
    'Officer (give title below)' is shortened to 'Officer'.

    NOTE(review): labels are collected whether or not an 'X' accompanies
    them - confirm this is the intended reading of the checkbox table.
    """
    cells = (cell for row in table.values.tolist() for cell in row)

    return {
        'Officer' if cell == 'Officer (give title below)' else cell
        for cell in cells
        if not (
            cell == 'X'
            or cell == 'Other (specify below)'
            or type(cell) == float
        )
    }

In [71]:
# Main function that gets the parts of the report that we're interested in
#   and builds a data dict from them

# NOTE: Some of the reports look like they are duplicated, but I think
#         these might just be instances of the filer submitting multiple reports

# NOTE: So far this only handles the identifying information about the
#         filer and report from the first table.
#       The next step is to start breaking down the
#       'Table I - Non-Derivative Securities Acquired, Disposed of...' table
#         to get all of the good stuff
def get_content(target_links):
    """Download every report and extract its identifying information.

    Parameters
    ----------
    target_links : list of str
        URLs of the HTML-formatted reports; each one is downloaded
        through ``get_soup``.

    Returns
    -------
    dict
        Maps a running integer id (0, 1, 2, ...) to a dict with the keys
        'Trader', 'Issuer', 'Ticker', 'Earlist_Transaction' and
        'Relationship'. Reports whose layout is not supported are left
        out and do not consume an id.
    """
    data = {}
    report_id = 0

    for soup in get_soup(target_links):
        tables = find_table(soup)

        # The tables in some reports seem to be formatted in a slightly
        #   different way: type1 has 16 tables, type2 has 15 tables.
        # Only type1 is implemented so far; type2 will probably be similar
        #   but with different sub-tables to grab the data from.
        # Every other layout has to be skipped as well - otherwise the values
        #   of the previous report would be stored again under a new id
        #   (or the names would be undefined on the very first report).
        if len(tables) != 16:
            continue

        # Table 4 holds the identifying information of a type1 report
        table1 = pd_table(tables[4])

        data[report_id] = {
            'Trader': get_trader(table1),
            'Issuer': get_issuer(table1),
            'Ticker': get_ticker(table1),
            'Earlist_Transaction': get_txn_date(table1),
            'Relationship': get_relationship(table1[6]),
        }

        report_id += 1

    return data

# Runs the full scrape: one download per entry in target_links
data = get_content(target_links)
# Bare expression so the notebook shows the resulting dict as cell output
data

{0: {'Trader': 'Fujimoto Michael K',
  'Issuer': 'FIRST HAWAIIAN, INC',
  'Ticker': 'FHB',
  'Earlist_Transaction': '10/13/2022',
  'Relationship': {'10% Owner', 'Director', 'Officer'}},
 1: {'Trader': 'Fujimoto Michael K',
  'Issuer': 'FIRST HAWAIIAN, INC',
  'Ticker': 'FHB',
  'Earlist_Transaction': '10/13/2022',
  'Relationship': {'10% Owner', 'Director', 'Officer'}},
 2: {'Trader': 'Podbere Burt W.',
  'Issuer': 'CrowdStrike Holdings, Inc.',
  'Ticker': 'CRWD',
  'Earlist_Transaction': '10/13/2022',
  'Relationship': {'10% Owner',
   'CHIEF FINANCIAL OFFICER',
   'Director',
   'Officer'}},
 3: {'Trader': 'Podbere Burt W.',
  'Issuer': 'CrowdStrike Holdings, Inc.',
  'Ticker': 'CRWD',
  'Earlist_Transaction': '10/13/2022',
  'Relationship': {'10% Owner',
   'CHIEF FINANCIAL OFFICER',
   'Director',
   'Officer'}},
 4: {'Trader': 'Kurtz George',
  'Issuer': 'CrowdStrike Holdings, Inc.',
  'Ticker': 'CRWD',
  'Earlist_Transaction': '10/13/2022',
  'Relationship': {'10% Owner', 'Direc

In [53]:
# I'm using this block to see the tables one at a time
#   and to find out how to grab the data that we want

# Downloads every target report again (one request per link)
soups = get_soup(target_links)

# This is the identifying information table
# table1 = pd_table(find_table(soups[5])[4])
# table1

# This is the next table we're interested in
# soups[5] is just one sample report; table 12 is its 'Table I' section and
#   [0] picks the first DataFrame that pd_table returns for it
table2 = pd_table(find_table(soups[5])[12])[0]
table2

Unnamed: 0_level_0,"Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned","Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned"
Unnamed: 0_level_1,1. Title of Security (Instr. 3),2. Transaction Date (Month/Day/Year),"2A. Deemed Execution Date, if any (Month/Day/Year)",3. Transaction Code (Instr. 8),3. Transaction Code (Instr. 8),"4. Securities Acquired (A) or Disposed Of (D) (Instr. 3, 4 and 5)","4. Securities Acquired (A) or Disposed Of (D) (Instr. 3, 4 and 5)","4. Securities Acquired (A) or Disposed Of (D) (Instr. 3, 4 and 5)",5. Amount of Securities Beneficially Owned Following Reported Transaction(s) (Instr. 3 and 4),6. Ownership Form: Direct (D) or Indirect (I) (Instr. 4),7. Nature of Indirect Beneficial Ownership (Instr. 4)
Unnamed: 0_level_2,1. Title of Security (Instr. 3),2. Transaction Date (Month/Day/Year),"2A. Deemed Execution Date, if any (Month/Day/Year)",Code,V,Amount,(A) or (D),Price,5. Amount of Securities Beneficially Owned Following Reported Transaction(s) (Instr. 3 and 4),6. Ownership Form: Direct (D) or Indirect (I) (Instr. 4),7. Nature of Indirect Beneficial Ownership (Instr. 4)
0,Class A common stock,10/13/2022,,C,,250000.0,A,(1),250000,I,Allegra Kurtz Irrevocable Gift Trust (2)
1,Class A common stock,10/13/2022,,C,,250000.0,A,(1),250000,I,Alexander Kurtz Irrevocable Gift Trust (2)
2,Class A common stock,10/13/2022,,G (3),V,250000.0,D,$ 0,0,I,Allegra Kurtz Irrevocable Gift Trust (2)
3,Class A common stock,10/13/2022,,G (3),V,250000.0,D,$ 0,0,I,Alexander Kurtz Irrevocable Gift Trust (2)
4,Class A common stock,,,,,,,,"943,947 (3)",D,
