In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from pathlib import Path
import re
import pandas as pd
import requests
import json
import time


In [2]:
def get_ticker_information(ticker, timeout=30):
    """Return the entity name of the first EDGAR search hit for ``ticker``.

    Sends ``ticker`` as the ``keysTyped`` query parameter to the EDGAR
    ``search-index`` endpoint and reads ``hits.hits[0]._source.entity`` from
    the JSON response.

    Args:
        ticker: Value to search for (a ticker symbol or a CIK number).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``entity`` value of the first hit.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        IndexError: If the search returns no hits.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'}

    # Passing the query through ``params`` lets requests URL-encode it; the
    # timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(
        "https://efts.sec.gov/LATEST/search-index",
        params={"keysTyped": ticker},
        headers=headers,
        timeout=timeout,
    )
    # Fail on HTTP errors here instead of on a confusing JSON/KeyError below.
    response.raise_for_status()

    hits = response.json()["hits"]["hits"]
    if not hits:
        raise IndexError(f"EDGAR search returned no entity for {ticker!r}")

    return hits[0]["_source"]["entity"]

In [460]:
class Consolidated_Schedule_Investments():
    """Scrape a filing listed on an EDGAR search page and print its investment tables.

    Attributes:
        ticker: Identifier used as the directory component of archive URLs.
        entity_name: Entity name supplied by the caller (stored only; no
            method of this class reads it).
        url: Search-results page that is opened in the browser.
    """

    # Browser-like User-Agent sent with the plain HTTP download of a filing.
    _HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'}

    # Section headings used to choose between several candidate tables.
    _SECTION_HEADINGS = (
        "Co-Investments",
        "Primary Private Investment Funds",
        "Secondary Private Investment Funds",
    )

    def __init__(self, ticker, entity_name, url):
        self.ticker = ticker
        self.entity_name = entity_name
        self.url = url

    def save_File(self, data, filename):
        """Write ``data`` to ``filename`` as CSV, without the index column."""
        path = Path(filename)
        pd.DataFrame(data).to_csv(path, index=False)

    def get_table_of_url(self, driver):
        """Build archive URLs for the filings listed in the ``#hits`` table.

        For every link in the first cell of a body row, the ``data-adsh``
        attribute (with dashes removed) and the ``data-file-name`` attribute
        are combined with ``self.ticker`` into an archive URL.

        Args:
            driver: WebDriver whose current page shows the search results.

        Returns:
            List of URL strings in table order. Links that lack either
            attribute are skipped.
        """
        table = driver.find_element(By.XPATH, '//*[@id="hits"]/table')
        # One query relative to the table collects every row link, so the
        # result does not depend on how many header rows the table has.
        links = table.find_elements(By.XPATH, './tbody/tr/td[1]/a')

        urls = []
        for link in links:
            data_adsh = link.get_attribute('data-adsh')
            data_file_name = link.get_attribute('data-file-name')
            if not data_adsh or not data_file_name:
                continue
            data_adsh = data_adsh.replace('-', '')
            urls.append(
                f"https://www.sec.gov/Archives/edgar/data/{self.ticker}/{data_adsh}/{data_file_name}"
            )

        return urls

    def extract_table(self, soup):
        """Select the tables in ``soup`` that match the schedule heuristics.

        A table is a candidate when it has more than 10 rows and the text of
        its second row contains "Initial". If several candidates are found,
        only those whose markup mentions one of ``_SECTION_HEADINGS`` are kept;
        a single candidate is returned as-is.

        Args:
            soup: Parsed document exposing ``find_all('table')``.

        Returns:
            List of matching tables, or ``None`` when no table qualifies.
        """
        candidate_tables = []
        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if len(rows) > 10 and "Initial" in rows[1].text:
                candidate_tables.append(table)

        if not candidate_tables:
            return None
        if len(candidate_tables) == 1:
            return candidate_tables

        # Build a new list rather than removing items from the list being
        # iterated, which would skip the element after each removal.
        return [
            table for table in candidate_tables
            if any(heading in str(table) for heading in self._SECTION_HEADINGS)
        ]

    @staticmethod
    def _clean_table_text(texts):
        """Normalise whitespace and split header labels in a table's text."""
        texts = texts.replace('\xa0', '')
        texts = texts.replace('\u2003', '')
        texts = texts.strip('\n\t ')
        texts = texts.replace('\n \n', '\n\n')
        texts = texts.replace('Initial\nAcquisition', 'Initial Acquisition')
        texts = texts.replace('PercentofNet\nAssets', 'Percent of Net Assets')
        texts = texts.replace('\t Total', '\n Total')
        return texts

    def amg_pantheon_fund(self, filing_index=5, max_attempts=5, retry_delay=2.0):
        """Download one listed filing and print the text of its matching tables.

        Opens ``self.url`` in Chrome, collects the filing URLs with
        ``get_table_of_url``, downloads the filing at ``filing_index`` over
        plain HTTP, selects tables with ``extract_table`` and prints the
        ``repr`` of each table's cleaned text.

        Args:
            filing_index: Position of the filing in the collected URL list.
                The default of 5 is the index that used to be hard-coded.
            max_attempts: Maximum number of download attempts.
            retry_delay: Seconds to sleep between failed attempts.

        Raises:
            IndexError: If the search page lists too few filings for
                ``filing_index``.
            RuntimeError: If no attempt returned HTTP 200.
        """
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
        try:
            driver.implicitly_wait(5)
            driver.maximize_window()
            driver.get(self.url)
            # Fixed pause before scraping; presumably gives the results table
            # time to render - TODO confirm whether an explicit wait suffices.
            time.sleep(5)
            urls = self.get_table_of_url(driver)
        finally:
            # The browser is only needed for the search page; always release it.
            driver.quit()

        try:
            filing_url = urls[filing_index]
        except IndexError:
            raise IndexError(
                f"filing_index {filing_index} is out of range: "
                f"{len(urls)} filing(s) were listed"
            ) from None

        soup = None
        last_status = None
        for attempt in range(1, max_attempts + 1):
            r = requests.get(filing_url, headers=self._HEADERS, timeout=30)
            last_status = r.status_code
            if last_status == 200:
                soup = BeautifulSoup(r.text, 'html.parser')
                break
            if attempt < max_attempts:
                # Back off instead of re-requesting in a tight loop.
                time.sleep(retry_delay)

        if soup is None:
            raise RuntimeError(
                f"could not download {filing_url} after {max_attempts} "
                f"attempt(s); last status: {last_status}"
            )

        target_table = self.extract_table(soup)
        if not target_table:
            print(f"No matching table found in {filing_url}")
            return

        for table in target_table:
            texts = self._clean_table_text(table.text)
            print(repr(f"\n----------------------------------------------------------------------------------\n{texts}"))

In [461]:
# Identifier of the fund to look up; it is used as the CIK in the URLs below.
ticker = 1609211
entity = get_ticker_information(ticker)

# Zero-pad to 10 digits instead of prepending a fixed "000", so identifiers
# of other lengths are padded correctly too (same result for this value).
cik = f"{ticker:010d}"
url = f"https://www.sec.gov/edgar/search/#/category=custom&ciks={cik}&entityName={entity}(CIK {cik})&forms=N-CSR,N-CSRS"

Consolidated_Schedule_Investments(ticker, entity, url).amg_pantheon_fund()

'\n----------------------------------------------------------------------------------\nInitialAcquisitionDate\n\n\nShares\n\n\nValue\n\n\n\n\n\n\n\n Co-Investments - 44.6%\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n ACOF IV ATD Co-Invest LP (Consumer Discretionary)(a),* \n\n\n02/27/2015\n\n\n\n(c)\n\n\n\n$0\n\n\n\n\n\n\n\n AIX Pride Syndication L.P. (Information\nTechnology)(a),* \n\n\n11/16/2018\n\n\n\n(c)\n\n\n\n16,739,220\n\n\n\n\n\n\n\n AP VIII Prime Security Services Holdings, L.P. (Industrials)* \n\n\n04/26/2016\n\n\n\n(c)\n\n\n\n1,872,916\n\n\n\n\n\n\n\n APH CUBS Co-invest LP (Financials)(a),* \n\n\n11/16/2018\n\n\n\n(c)\n\n\n\n6,734,432\n\n\n\n\n\n\n\n APIA Destiny-B\nFollow-on FPCI (Communication Services) (Belgium)(a),(b),(d),* \n\n\n09/16/2021\n\n\n\n(c)\n\n\n\n9,277,163\n\n\n\n\n\n\n\n APIA Graitec FPCI (Information Technology)\n(France)(a),* \n\n\n02/25/2020\n\n\n\n(c)\n\n\n\n5,881,091\n\n\n\n\n\n\n\n APIA Odigo FPCI (Information Technology)\n(France)(a),(b),* \n\n\n12/22/2020\n