In [None]:
from bs4 import BeautifulSoup
from datetime import date
import os
import pandas as pd
import requests
import re
import time
import traceback
    
def get_submission_url_data(CIK_num):
    """Find a company's SC TO-I filings whose text mentions an odd lot provision.

    Queries the EDGAR company browse page for ``CIK_num`` restricted to form
    type SC TO-I, follows every "SC TO-I" row to its filing detail page,
    downloads the "Complete submission text file" and keeps the filing when
    that text contains "odd lot" (case-insensitive).

    Args:
        CIK_num: The company's CIK as a string; it is concatenated directly
            into the search URL.

    Returns:
        A DataFrame with one row per matching filing and the columns ``cik``
        (the CIK wrapped in literal double quotes), ``date`` (a
        ``datetime.date`` parsed from the results row) and ``url`` (the
        filing detail page URL). Returns None when the search finds no
        SC TO-I rows, when no filing matches, or when the search itself
        fails. A failure while processing one filing is reported and that
        filing is skipped; filings already collected are kept.
    """
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the whole batch run indefinitely.
    request_timeout = 30

    # NOTE(review): SEC's fair-access guidance is understood to ask clients to
    # declare a User-Agent; confirm whether a headers= argument is needed here.
    try:
        # Search for the company with CIK number CIK_num
        search_results_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + \
            CIK_num + "&type=SC+TO-I&dateb=&owner=exclude&count=100"
        result_site = requests.get(search_results_url, timeout=request_timeout)
        # Fail loudly on HTTP errors rather than parsing an error page and
        # silently reporting "no filings".
        result_site.raise_for_status()
        EDGAR_results_page = BeautifulSoup(result_site.text, 'html.parser')

        # Cells whose text is exactly "SC TO-I" signal a tender offer document.
        sc_to_i_tag_list = EDGAR_results_page.find_all("td", string="SC TO-I")
        # If no tender offer tags found, quit now
        if not sc_to_i_tag_list:
            return None

        final_dates = []
        final_urls = []
        for tag in sc_to_i_tag_list:
            try:
                # Get the url to the details page
                filing_details_url = tag.find_next("a")['href']  # extension for the document
                detail_url = "https://www.sec.gov/" + filing_details_url
                print(detail_url)

                # Get the html of the details page
                details_site = requests.get(detail_url, timeout=request_timeout)
                details_site.raise_for_status()
                FILING_detail_page = BeautifulSoup(details_site.text, 'html.parser')

                # Get the tender offer/tender amendment
                tender_document_tag = FILING_detail_page.find(
                    'td', string="Complete submission text file")
                if tender_document_tag is None:
                    print("No complete submission text file found, skipping filing")
                    continue
                tender_document_url = "https://www.sec.gov/" + \
                    tender_document_tag.find_next("a")['href']
                form = requests.get(tender_document_url, timeout=request_timeout)
                form.raise_for_status()

                # If this tender offer has no odd lot provision, then skip this
                # iteration and do not add it
                if re.search(r'(odd lot)', form.text, re.IGNORECASE) is None:
                    continue

                # Get date of tender offer: it is the 3rd td after our tag.
                # Parsing strictly (no errors='ignore') so a malformed cell
                # raises here instead of yielding a str without .date().
                row_cells = tag.find_next_siblings("td")
                filing_date = pd.to_datetime(
                    row_cells[2].get_text().strip(), format='%Y-%m-%d').date()
            except Exception:
                # One bad filing must not discard the matches already found
                # for this company.
                print("Exception occured while processing a filing, skipping it")
                traceback.print_exc()
                continue

            # Add the information
            final_dates.append(filing_date)
            final_urls.append(detail_url)

        if not final_urls:
            return None
        # Return the information as a dataframe
        return pd.DataFrame({'cik': '"' + str(CIK_num) + '"', 'date': final_dates, 'url': final_urls})
    except Exception:
        print("Exception occured while attempting to produce submission text file, returning None")
        traceback.print_exc()
        return None

def main():
    """Scan one slice of CIK numbers and write the matching filings to a CSV.

    Reads ``CIK_Numbers.txt`` (one CIK per line), takes lines 300000-399999,
    calls ``get_submission_url_data`` for each and writes the combined rows
    to ``url_data_4.csv``. If the loop is interrupted by an error or by
    Ctrl-C, whatever has been collected so far is still written.
    """
    output_path = "url_data_4.csv"
    column_names = ['cik', 'date', 'url']

    print("Beginning running")
    # Only the read needs the file handle; close it before the long scrape.
    with open("CIK_Numbers.txt", "r") as cik_numbers_file:
        cik_numbers = cik_numbers_file.readlines()
    print(len(cik_numbers))
    # May need to go through 100000 to 200000 again
    cik_numbers = cik_numbers[300000:400000]

    # Collect per-company frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole table on every call.
    collected_frames = []

    def build_url_data():
        """Combine the frames gathered so far into a single DataFrame."""
        if collected_frames:
            return pd.concat(collected_frames, ignore_index=True)
        return pd.DataFrame(columns=column_names)

    try:
        for cik_number in cik_numbers:
            cik = cik_number.rstrip()
            if not cik:
                # Blank line in the input file; nothing to look up.
                continue
            df = get_submission_url_data(cik)
            if df is not None and not df.empty:
                collected_frames.append(df)
        url_data = build_url_data()
        print(url_data)
        url_data.to_csv(output_path, index=False)
    except (Exception, KeyboardInterrupt):
        # KeyboardInterrupt is caught on purpose: interrupting a long run
        # should still save what we have.
        print("Exception occured. Writing now to not lose data")
        build_url_data().to_csv(output_path, index=False)
        traceback.print_exc()
            
# Run main() only when this module's __name__ is "__main__".
if __name__ == "__main__": 
    main()

Beginning running
708280
https://www.sec.gov//Archives/edgar/data/1325483/000095012310088020/0000950123-10-088020-index.htm
https://www.sec.gov//Archives/edgar/data/1661458/000161577417000325/0001615774-17-000325-index.htm
