# Downloads and Stores IPO S-1 Filings as .txt Files

* Uses EDGAR search
 * https://www.sec.gov/edgar/searchedgar/companysearch.html
* Uses NASDAQ IPO lists
 * https://www.nasdaq.com/markets/ipos/
* Skips ambiguous company names (do manually)
* Takes largest file if multiple files are available (S-1, S-1/A etc.)

In [1]:
import edgar

import pandas as pd

from pathlib import Path

import datetime
from datetime import timedelta
from collections import OrderedDict

In [2]:
# params
# Date window to scan: a fixed start date through today (both inclusive).
date_range = ["2018-03-01", datetime.datetime.today().strftime('%Y-%m-%d')]
start, end = [datetime.datetime.strptime(day, "%Y-%m-%d") for day in date_range]
# Unique "YYYY-MM" labels in chronological order; the OrderedDict serves as an
# ordered set. The "+ 1" makes the day-by-day walk include the end date itself,
# so the current month is not dropped when today is the first of a month.
date_dict = OrderedDict(
    ((start + timedelta(days=offset)).strftime(r"%Y-%m"), None)
    for offset in range((end - start).days + 1)
).keys()
print('date_dict:', date_dict)

date_dict: odict_keys(['2018-03', '2018-04', '2018-05', '2018-06'])


### Scrape NASDAQ IPO Lists

In [3]:
# Fetch the NASDAQ IPO "pricings" table for every month in date_dict.
# The per-month frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated rows on every iteration.
monthly_tables = [
    pd.read_html('https://www.nasdaq.com/markets/ipos/activity.aspx?tab=pricings&month=' + x)[0]
    for x in date_dict
]

# pd.concat raises on an empty list, so keep the original empty-frame start
# value when there are no months to fetch.
if monthly_tables:
    df_symbols = pd.concat(monthly_tables, ignore_index=True)
else:
    df_symbols = pd.DataFrame()

# Index the rows by ticker symbol; the 'Symbol' column itself is kept.
df_symbols.index = df_symbols['Symbol']

In [4]:
# Preview the first three scraped IPO rows.
df_symbols.iloc[:3]

Unnamed: 0_level_0,Company Name,Symbol,Market,Price,Shares,Offer Amount,Date Priced
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GNPX,"GENPREX, INC.",GNPX,NASDAQ Capital,$5,1280000,"$6,400,000",3/29/2018
IQ,"IQIYI, INC.",IQ,NASDAQ Global,$18,125000000,"$2,250,000,000",3/29/2018
UMRX,UNUM THERAPEUTICS INC.,UMRX,NASDAQ Global Select,$12,5770000,"$69,240,000",3/29/2018


### Batch Download

In [None]:
#batch download
# For every scraped symbol, look the company up through the edgar package and
# save one of its S-1 documents to ./Data/<symbol>.htm. Symbols that already
# have a file on disk are skipped.
data_dir = Path("./Data")
# Make sure the output directory exists so the write below cannot fail on it.
data_dir.mkdir(parents=True, exist_ok=True)

# Lookup client shared by all iterations. It is created lazily, on the first
# symbol that actually needs a lookup, so a run where every file already
# exists never constructs it.
tmpEdgar = None
total = df_symbols.shape[0]

for counter, symbol in enumerate(df_symbols['Symbol'], start=1):
    print('\n( ' + str(counter) + ' / ' + str(total) + ' ) ' + symbol)

    #check if exists
    target = data_dir / (symbol + ".htm")
    if target.is_file():
        print(symbol + ' data already exists, skipping...')
        continue

    #create company
    if tmpEdgar is None:
        tmpEdgar = edgar.Edgar()
    possible_companies = tmpEdgar.findCompanyName(df_symbols.loc[symbol]['Company Name'])
    print('possible_companies:', possible_companies)

    #validate
    if len(possible_companies) == 0:
        print('no possible companies:', symbol)
        continue

    # The first candidate is used as-is; no further disambiguation is done.
    name = possible_companies[0]
    cik = tmpEdgar.getCikByCompanyName(name)
    company = edgar.Company(name, cik)

    #look for S-1
    tree = company.getAllFilings(filingType = "S-1")
    docs = edgar.getDocuments(tree, noOfDocuments=5)

    # Nothing retrieved for this company: leave no file behind.
    if len(docs) == 0:
        continue

    # Use the first document unless the second one is more than twice its
    # size. Only the first two documents are ever compared.
    chosen = docs[0]
    if len(docs) > 1 and len(docs[1]) / 2 > len(docs[0]):
        chosen = docs[1]

    # Decode BEFORE opening the output file: a decode error must not leave an
    # empty .htm behind, because the "already exists" check above would then
    # skip this symbol on every later run.
    # NOTE(review): assumes getDocuments returns bytes objects — confirm.
    content = chosen.decode("utf-8")

    #write file
    with open(target, "w", encoding="utf-8") as f:
        f.write(content)

    print('Got data for ' + symbol)


( 1 / 66 ) GNPX
GNPX data already exists, skipping...

( 2 / 66 ) IQ
possible_companies: ['IQIYI, INC.']

( 3 / 66 ) UMRX
UMRX data already exists, skipping...

( 4 / 66 ) ONE


In [13]:
import urllib.request
from bs4 import BeautifulSoup
 
# Fetch a single filing page from EDGAR and print its text content.
# The context manager closes the HTTP response as soon as the body is read;
# previously the response was never closed.
with urllib.request.urlopen('https://www.sec.gov/Archives/edgar/data/1467623/000119312518089786/d553522ds1a.htm') as response:
    html = response.read()
soup = BeautifulSoup(html,"html5lib")
# strip=True trims each text fragment; with no separator given, the fragments
# are joined directly, so adjacent words from different tags run together.
text = soup.get_text(strip=True)
 
print(text)

S-1/A1d553522ds1a.htmS-1/AS-1/ATable of ContentsAs filed with the Securities and Exchange Commission on March 21, 2018Registration No. 333-223182UNITED STATESSECURITIES
AND EXCHANGE COMMISSIONWashington, D.C. 20549AMENDMENT
NO. 2TOFORMS-1REGISTRATION STATEMENTUnderThe
Securities Act of 1933Dropbox, Inc.(Exact name
of Registrant as specified in its charter)Delaware737226-0138832(State or other jurisdiction ofincorporation or organization)(Primary Standard IndustrialClassification Code Number)(I.R.S. EmployerIdentification Number)Dropbox, Inc.333 Brannan
StreetSan Francisco, California 94107(415) 857-6800(Address,
including zip code, and telephone number, including area code, of Registrant’s principal executive offices)Andrew W.
HoustonChief Executive OfficerDropbox, Inc.333 Brannan
StreetSan Francisco, California 94107(415) 857-6800(Name,
address, including zip code, and telephone number, including area code, of agent for service)Copies to:Tony Jeffries, Esq.Rezwan D. Pavri, Esq.Lisa
L.