# Doawnloads and Stores IPO S-1 Filings as .txt Files

* Uses EDGAR search
 * https://www.sec.gov/edgar/searchedgar/companysearch.html
* Uses NASDAQ IPO lists
 * https://www.nasdaq.com/markets/ipos/
* Skips ambiguous company names (do manually)
* Takes latest file if multiple files are available (S-1, S-1/A etc.)

In [12]:
import edgar
import nasdaq

from pathlib import Path
import datetime

### Scrape NASDAQ IPO Lists

In [13]:
df_symbols = nasdaq.get_ipo_list('2018-06-01')

date range: odict_keys(['2018-06', '2018-07'])


In [14]:
df_symbols.head(3)

Unnamed: 0_level_0,Company Name,Symbol,Market,Price,Shares,Offer Amount,Date Priced
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DOMO,"DOMO, INC.",DOMO,NASDAQ Global,21.0,9200000,193200000.0,2018-06-29
BJ,"BJ'S WHOLESALE CLUB HOLDINGS, INC.",BJ,NYSE,17.0,37500000,637500000.0,2018-06-28
BV,"BRIGHTVIEW HOLDINGS, INC.",BV,NYSE,22.0,21300000,468600000.0,2018-06-28


### Batch Download

In [15]:
#batch download
counter = 0

for x in df_symbols['Symbol']:
    try:
        counter += 1
        print('\n( ' + str(counter) + ' / ' + str(df_symbols.shape[0]) + ' ) ' + str(x))

        #check if exists
        if Path("../Data/" + x + ".htm").is_file():
            print(x + ' data already exists, skipping...')
            continue

        #create company
        tmpEdgar = edgar.Edgar()
        possible_companies = tmpEdgar.findCompanyName(df_symbols.loc[x]['Company Name'])
        print('possible_companies:', possible_companies)

        #validate
        if len(possible_companies) == 0:
            print('no possible companies:', x)
            continue

        name = possible_companies[0]
        cik = tmpEdgar.getCikByCompanyName(possible_companies[0])
        company = edgar.Company(name, cik)

        #look for S-1
        tree = company.getAllFilings(filingType = "S-1")
        docs = edgar.getDocuments(tree, noOfDocuments=5)

        #if multiple amendments exist
        #filter ones with length lower than 500K (statistically the mean is higher)
        docs = [d for d in docs if len(d) > 500000]

        #write file
        if len(docs) > 0:
            with open("../Data/" + x + ".htm", "w", encoding="utf-8") as f:
                f.write(docs[0].decode("utf-8"))
                print('Got data for ' + x + ' chose ' + str(len(docs[0])))
    except Exception as e:
        print(x, e)


( 1 / 39 ) DOMO
DOMO data already exists, skipping...

( 2 / 39 ) BJ
BJ data already exists, skipping...

( 3 / 39 ) BV
BV data already exists, skipping...

( 4 / 39 ) ENTX
possible_companies: ['ENTERA BIO LTD.']

( 5 / 39 ) EVER
EVER data already exists, skipping...

( 6 / 39 ) HCCHU
HCCHU data already exists, skipping...

( 7 / 39 ) FTSV
FTSV data already exists, skipping...

( 8 / 39 ) NFC'U
possible_companies: ['FRONTIER OIL CORP /NEW/', 'NEW FRONTIER BANCORP', 'NEW FRONTIER CORP', 'NEW FRONTIER MINING CORP.', 'NEW FRONTIERS CORP']

( 9 / 39 ) STIM
STIM data already exists, skipping...

( 10 / 39 ) TBIO
TBIO data already exists, skipping...

( 11 / 39 ) TCDA
TCDA data already exists, skipping...

( 12 / 39 ) UXIN
possible_companies: ['PUXIN LTD', 'UXIN LTD']

( 13 / 39 ) NTGN
NTGN data already exists, skipping...

( 14 / 39 ) LOVE
LOVE data already exists, skipping...

( 15 / 39 ) HYRE
HYRE data already exists, skipping...

( 16 / 39 ) GLDM
GLDM data already exists, skipping...

(