In [1]:
from IPython.display import display, HTML
from bs4 import BeautifulSoup
from urllib import parse
from time import sleep
from datetime import datetime
from multiprocessing.pool import ThreadPool
import pandas as pd
import requests

In [2]:
# Ticker of the company whose institutional holders are scraped.
SYMBOL = "AMZN"

# Address of the first holders page; the ticker goes into the path in lower case.
HOLDINGS_URL_TEMPLATE = "https://www.nasdaq.com/symbol/{}/institutional-holdings"
INITIAL_URL = HOLDINGS_URL_TEMPLATE.format(SYMBOL.lower())

In [3]:
def get_page_soup_object(url, key="id", value=None, sub_object_name="",
                         max_retries=4, timeout=30):
    """
    a helper function that loads and parses html pages
    sometimes we get blocked or other errors, so we will check if elements we need are on the page

    url: address of the page to download
    key, value: attribute filter handed to soup.find(), e.g. key="id", value="position-stats"
    sub_object_name: optional attribute of the matched element to return instead
                     of the element itself, e.g. "table"
    max_retries: how many times the page is requested before giving up
    timeout: seconds given to requests.get(); pass None to wait without limit

    An attempt fails when the request raises requests.RequestException, when no
    element matches key=value, or when sub_object_name is set and the matched
    element has no such sub-object. After the n-th failed attempt (except the
    last one) the function sleeps 2 * n seconds.

    returns: the matched element, or its sub-object when sub_object_name is set
    raises: Exception when every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        try:
            # without a timeout one stalled connection would block the caller forever
            page = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # network errors are retried the same way as pages with missing elements
            print("{} couldn't be loaded: {}".format(url, error))
        else:
            soup = BeautifulSoup(page.content, 'html.parser')

            # check elements on the page
            element = soup.find(**{key: value})
            if element is not None:
                target = getattr(element, sub_object_name) if sub_object_name else element
                if target is not None:
                    return target
            print("{}:{}.{} isn't on the page".format(key, value, sub_object_name))

        if attempt < max_retries:
            # back off a little longer after every failure; no point waiting before giving up
            sleep(attempt * 2)

    raise Exception("Cannot access data on the page. You must've been blocked.")

### 1. Let's start from the holders page

<img src="n18/1.png" width=800 />

In [4]:
# knowing the structure, we can implement a function that returns all the holders with their names, links and shares

def _to_int_or_none(text):
    """Convert a string such as "1,234" to int; None or blank text gives None."""
    if text is None or not text.strip():
        return None
    return int(text.replace(",", ""))


def get_holders(holders_url):
    """
    a generator function
    returns: holder name, holder link, shares number

    Each yielded item is a dict with the keys Name, DetailURL, SharesHeld and
    Value. SharesHeld and Value are ints, or None when the cell has no text.

    Rows that cannot describe a holder are skipped instead of raising: the
    header row, rows with fewer than four cells, and rows whose first cell
    contains no link.
    """
    holders_block_table = get_page_soup_object(holders_url, "id", "quotes_content_left_pnlInsider", "table")
    for tr in holders_block_table.find_all("tr"):  # iterate through the rows
        # there are non-tags (like \n) among the children, we need to filter them out
        cells = [el for el in tr.children if el.name is not None]

        # a holder row is: name, date, shares, ..., value -> at least four cells
        if len(cells) < 4:
            continue
        name_cell, shares_cell, value_cell = cells[0], cells[2], cells[-1]

        if name_cell.name == "th":
            continue  # it's the header, we don't need to proceed any further

        link = name_cell.a
        if link is None:
            continue  # no link means no detail page to follow for this row

        yield dict(
            Name=link.string,
            DetailURL=link["href"],
            SharesHeld=_to_int_or_none(shares_cell.string),
            Value=_to_int_or_none(value_cell.string),
        )

In [5]:
# Preview: print each holder record parsed from the first listing page.
for holder in get_holders(INITIAL_URL):
    print(holder)

{'Value': 53163944, 'Name': 'VANGUARD GROUP INC', 'SharesHeld': 28527398, 'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/vanguard-group-inc-61322'}
{'Value': 46976492, 'Name': 'BLACKROCK INC.', 'SharesHeld': 25207255, 'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/blackrock-inc-711679'}
{'Value': 31912132, 'Name': 'FMR LLC', 'SharesHeld': 17123825, 'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/fmr-llc-12407'}
{'Value': 28417520, 'Name': 'PRICE T ROWE ASSOCIATES INC /MD/', 'SharesHeld': 15248641, 'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/price-t-rowe-associates-inc-md-2145'}
{'Value': 28220031, 'Name': 'STATE STREET CORP', 'SharesHeld': 15142670, 'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/state-street-corp-6697'}
{'Value': 18312482, 'Name': 'CAPITAL WORLD INVESTORS', 'SharesHeld': 9826349, 'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/capital-world-inv

### 2. Details page

The stats can be read from the table with id = "position-stats" <img src="n18/3.png"  width=600 />

In [6]:
def _stat_row_to_int(row):
    """Read the right-aligned cell of a stats row as int; None when it is missing or blank."""
    cell = row.find(align="right")
    if cell is None or cell.string is None or not cell.string.strip():
        return None
    return int(cell.string.replace(",", ""))


def get_position_stats(url):
    """
    returns: total positions and total value

    Both numbers are taken from the table with id "position-stats": the
    positions from the right-aligned cell of its first row, the value from the
    right-aligned cell of its last row. A number that cannot be located comes
    back as None, and (None, None) is returned when the table has fewer than
    two rows, so one odd page does not abort a whole batch.
    """
    stats_table = get_page_soup_object(url, "id", "position-stats")

    # only direct <tr> children count; text nodes between them are ignored
    rows = [el for el in stats_table.children if el.name == "tr"]
    if len(rows) < 2:
        # with a single row, "first" and "last" would be the same line
        return None, None

    positions = _stat_row_to_int(rows[0])
    value = _stat_row_to_int(rows[-1])

    return positions, value
    

# Try the function on one known detail page.
get_position_stats(url="https://www.nasdaq.com/quotes/institutional-portfolio/vanguard-group-inc-61322")

(4080, 2444936)

### There are about 160 pages * (15 + 1 links) = 2560 requests, so we have to run them in parallel

In [7]:
# we need a single function to run
def get_holders_data(url):
    """
    Collect the holders of one listing page together with their portfolio totals.

    url: address of a holders listing page
    returns: list of the dicts produced by get_holders(url), each extended with
             TotalPositions and TotalMktValue taken from get_position_stats()
             for the holder's DetailURL
    """
    def with_totals(holder):
        # the detail page supplies the two portfolio-wide numbers
        holder["TotalPositions"], holder["TotalMktValue"] = get_position_stats(holder["DetailURL"])
        return holder

    return [with_totals(holder) for holder in get_holders(url)]

We can use the last link from the pager to compose the links to all the pages.

<img src="n18/2.png" width=700 >

In [8]:
# Fetch the pager's "last page" link and split its query string into a dict.
last_link = get_page_soup_object(INITIAL_URL, key="id", value="quotes_content_left_lb_LastPage")
parse_result = parse.urlparse(last_link["href"])
query_params = {name: val for name, val in parse.parse_qsl(parse_result.query)}
query_params

{'page': '161'}

In [9]:
# One task per listing page; each task also loads the detail pages linked from it.
# The page count comes from the "page" parameter of the last pager link
# (a single page is assumed when the parameter is absent).
last_page = int(query_params.get("page", 1))
all_pages_links = ["{}?page={}".format(INITIAL_URL, num) for num in range(1, last_page + 1)]

# The context manager shuts the 20 worker threads down once map() has returned,
# so they don't stay alive for the rest of the session.
with ThreadPool(20) as pool:
    results = pool.map(get_holders_data, all_pages_links)

results[0][0]

id:quotes_content_left_pnlInsider.table isn't on the page
id:quotes_content_left_pnlInsider.table isn't on the page
id:position-stats. isn't on the page
id:position-stats. isn't on the page
id:position-stats. isn't on the page
id:quotes_content_left_pnlInsider.table isn't on the page
id:position-stats. isn't on the page
id:position-stats. isn't on the page


{'DetailURL': 'https://www.nasdaq.com/quotes/institutional-portfolio/vanguard-group-inc-61322',
 'Name': 'VANGUARD GROUP INC',
 'SharesHeld': 28527398,
 'TotalMktValue': 2444936,
 'TotalPositions': 4080,
 'Value': 53163944}

In [10]:
# results holds one list of holder records per page; flatten it so that
# every holder becomes one row of the frame.
all_rows = []
for page in results:
    all_rows.extend(page)

df = pd.DataFrame.from_records(all_rows)
df

Unnamed: 0,DetailURL,Name,SharesHeld,TotalMktValue,TotalPositions,Value
0,https://www.nasdaq.com/quotes/institutional-po...,VANGUARD GROUP INC,28527398,2444936.0,4080,53163944.0
1,https://www.nasdaq.com/quotes/institutional-po...,BLACKROCK INC.,25207255,2225061.0,4814,46976492.0
2,https://www.nasdaq.com/quotes/institutional-po...,FMR LLC,17123825,916794.0,2659,31912132.0
3,https://www.nasdaq.com/quotes/institutional-po...,PRICE T ROWE ASSOCIATES INC /MD/,15248641,682091.0,2569,28417520.0
4,https://www.nasdaq.com/quotes/institutional-po...,STATE STREET CORP,15142670,1250094.0,3671,28220031.0
5,https://www.nasdaq.com/quotes/institutional-po...,CAPITAL WORLD INVESTORS,9826349,480927.0,538,18312482.0
6,https://www.nasdaq.com/quotes/institutional-po...,CAPITAL RESEARCH GLOBAL INVESTORS,8131258,380344.0,463,15153494.0
7,https://www.nasdaq.com/quotes/institutional-po...,BAILLIE GIFFORD & CO,5427762,98159.0,208,10115232.0
8,https://www.nasdaq.com/quotes/institutional-po...,INVESCO LTD.,5138869,285561.0,3649,9576848.0
9,https://www.nasdaq.com/quotes/institutional-po...,NORTHERN TRUST CORP,4625889,411716.0,4248,8620853.0


In [11]:
# Lift the row limit so the whole ranking below is displayed.
pd.set_option('display.max_rows', None)

# Value is given in 1,000s and TotalMktValue in millions, hence the extra "/ 1000":
# RelativePosition is the position's value relative to the holder's total market value.
df["RelativePosition"] = df["Value"].div(df["TotalMktValue"]).div(1000)
df = df.sort_values(by="RelativePosition", ascending=False)
df[["Name", "SharesHeld", "TotalPositions", "TotalMktValue", "RelativePosition"]]

Unnamed: 0,Name,SharesHeld,TotalPositions,TotalMktValue,RelativePosition
970,DGH INVESTMENTS INC.,4150,2,8.0,0.96675
74,"URSA FUND MANAGEMENT, LLC",557200,14,1304.0,0.7963213
392,"WORM CAPITAL, LLC",34110,4,133.0,0.4779549
278,ARROWGRASS CAPITAL PARTNERS LLP,65200,10,257.0,0.4727899
203,SUMWAY DEVELOPMENT LTD.,116191,12,585.0,0.3701453
237,HUNTINGTON STEELE LLC,83615,74,444.0,0.3509595
480,SOCIAL CAPITAL PEP MANAGEMENT LLC,22211,6,122.0,0.3392869
381,YONG RONG (HK) ASSET MANAGEMENT LTD,35500,15,200.0,0.33079
507,"JW ASSET MANAGEMENT, LLC",19627,24,127.0,0.2880079
578,ANTIPODEAN ADVISORS LLC,15100,19,106.0,0.2654811
