In [2]:
import requests
import re

# Define the base URL
base_url = r"https://www.sec.gov/Archives/edgar/data"

# Define the CIK number for Goldman Sachs
cik_num = '886982'

# Define the headers to mimic a user-agent
# NOTE(review): SEC's EDGAR access guidance appears to ask automated clients
# to send a User-Agent identifying the requester (name + contact e-mail)
# rather than a browser string -- confirm and replace this value if so.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
}

# Seconds to wait for each HTTP response. Without a timeout, requests.get()
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Define the desired form types
desired_form_types = ['10-K', '10-Q']


def extract_header_field(label, text):
    """Return the value that follows ``label:`` on the same line of ``text``.

    The label must be the first thing on its line (leading spaces/tabs are
    allowed).  Only spaces and tabs are skipped between the colon and the
    value, so a label with an empty value never picks up the next line.

    Args:
        label: Header label without the trailing colon, e.g. ``'FORM TYPE'``.
        text: Text to search (the full submission ``.txt`` contents).

    Returns:
        The stripped remainder of the line after ``label:``, or ``None`` when
        the label is absent or has no value on its line.
    """
    pattern = r'^[ \t]*{}:[ \t]*(\S[^\r\n]*)'.format(re.escape(label))
    match = re.search(pattern, text, re.MULTILINE)
    if match is None:
        return None
    return match.group(1).strip()


def parse_submission_header(document_name, text):
    """Extract ``(date, filing_number, form_type)`` for one submission file.

    Args:
        document_name: File name of the ``.txt`` document.
        text: Contents of that document.

    Returns:
        A 3-tuple of strings; any element is ``None`` when it cannot be found.
        ``date`` is taken from an 8-digit run immediately before ``.txt`` in
        the file name when present, otherwise from the ``FILED AS OF DATE``
        header line (accession-style names such as
        ``0000950170-23-052955.txt`` never contain such a run).
    """
    name_match = re.search(r'(\d{8})\.txt', document_name)
    if name_match:
        date = name_match.group(1)
    else:
        date = extract_header_field('FILED AS OF DATE', text)

    filing_number = extract_header_field('ACCESSION NUMBER', text)
    # The form type sits on the same line as its label; reading the line
    # *after* the label would return an unrelated header field.
    form_type = extract_header_field('FORM TYPE', text)
    return date, filing_number, form_type


if __name__ == "__main__":
    # Get the filings in JSON format
    filings_url = "{}/{}/index.json".format(base_url, cik_num)
    content = requests.get(filings_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of a JSON decode error on 4xx/5xx.
    content.raise_for_status()
    decoded_content = content.json()

    # Set once a filing of a desired form type has been saved.
    found_desired_filing = False

    # Iterate through the filings
    for filing_item in decoded_content['directory']['item']:
        filing_num = filing_item['name']
        print('-' * 100)
        print('Grabbing filing : {}'.format(filing_num))

        # Define the filing URL in JSON format
        filing_url = "{}/{}/{}/index.json".format(base_url, cik_num, filing_num)

        # Get the documents submitted for that filing
        content = requests.get(filing_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        content.raise_for_status()
        document_content = content.json()

        # Iterate through the documents in the filing
        for document in document_content['directory']['item']:
            document_name = document['name']

            # Only the ".txt" submission files are inspected
            if not document_name.endswith('.txt'):
                continue

            # Define the URL of the ".txt" file
            txt_file_url = "{}/{}/{}/{}".format(base_url, cik_num, filing_num, document_name)
            print('Downloading .txt file from: {}'.format(txt_file_url))

            # Get the contents of the ".txt" file
            txt_response = requests.get(txt_file_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if not txt_response.ok:
                # Skip this document rather than parsing an error page as a filing.
                print('Skipping {} (HTTP {})'.format(txt_file_url, txt_response.status_code))
                continue
            txt_file_content = txt_response.text

            # Pull the date, accession number and form type out of the header
            date, filing_number, form_type = parse_submission_header(
                document_name, txt_file_content
            )

            # Check if the extracted form type is in the desired form types
            if form_type in desired_form_types:
                print(f"Date: {date}")
                print(f"Filing Number: {filing_number}")
                print(f"Form Type: {form_type}")
                # Save the desired .txt file; explicit encoding keeps the
                # write independent of the platform's default code page.
                with open("desired_file.txt", "w", encoding="utf-8") as txt_file:
                    txt_file.write(txt_file_content)
                found_desired_filing = True
                break

        # Exit the outer loop once a desired form type is found
        if found_desired_filing:
            break

----------------------------------------------------------------------------------------------------
Grabbing filing : 000095017023052955
Downloading .txt file from: https://www.sec.gov/Archives/edgar/data/886982/000095017023052955/0000950170-23-052955.txt
----------------------------------------------------------------------------------------------------
Grabbing filing : 000095017023052954
Downloading .txt file from: https://www.sec.gov/Archives/edgar/data/886982/000095017023052954/0000950170-23-052954.txt
----------------------------------------------------------------------------------------------------
Grabbing filing : 000095017023052945
Downloading .txt file from: https://www.sec.gov/Archives/edgar/data/886982/000095017023052945/0000950170-23-052945.txt
----------------------------------------------------------------------------------------------------
Grabbing filing : 000095017023052943
Downloading .txt file from: https://www.sec.gov/Archives/edgar/data/886982/0000950170230529

KeyboardInterrupt: 