# Downloads and Stores IPO S-1 Filings

* Uses EDGAR search
* Uses NASDAQ IPO lists
* Skips ambiguous company names (TODO: handle these manually)

In [22]:
import edgar
from edgar import Edgar
from edgar import Company

import pandas as pd

from pathlib import Path

In [None]:
# params


In [36]:
from datetime import datetime, timedelta
from collections import OrderedDict
# Window to cover, as ISO dates; the end date itself is not included below.
dates_range = ["2018-01-01", "2018-06-01"]
start, end = (datetime.strptime(day, "%Y-%m-%d") for day in dates_range)

# Step through the window one day at a time and keep each distinct "YYYY-MM"
# label in first-seen order (OrderedDict keys serve as an ordered set).
OrderedDict.fromkeys(
    (start + timedelta(days=offset)).strftime(r"%Y-%m")
    for offset in range((end - start).days)
).keys()

odict_keys(['2014-10', '2014-11', '2014-12', '2015-01', '2015-02', '2015-03', '2015-04', '2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2016-01'])

In [21]:
# NASDAQ IPO "pricings" page for month=2018-03; the first HTML table found on
# the page supplies the ticker symbols.
pricings_url = 'https://www.nasdaq.com/markets/ipos/activity.aspx?tab=pricings&month=2018-03'
pricings_table = pd.read_html(pricings_url)[0]
symbols = list(pricings_table['Symbol'])

In [34]:
# Display the symbol table.
# NOTE(review): df_symbols is first assigned further down (df_symbols = get_symbols_df()),
# so on a fresh kernel this cell raises NameError unless that cell has run first.
df_symbols

Unnamed: 0_level_0,Name,Sector,industry
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GNPX,"Genprex, Inc.",Health Care,Major Pharmaceuticals
IQ,"iQIYI, Inc.",Consumer Services,Consumer Electronics/Video Chains
UMRX,Unum Therapeutics Inc.,Health Care,Major Pharmaceuticals
ONE,OneSmart International Education Group Limited,Consumer Services,Other Consumer Services
OPBK,OP Bancorp,Finance,Major Banks
BILI,Bilibili Inc.,Technology,EDP Services
FIXX,"Homology Medicines, Inc.",Health Care,Major Pharmaceuticals
GHG,GreenTree Hospitality Group Ltd.,Consumer Services,Hotels/Resorts
DBX,"Dropbox, Inc.",Technology,Computer Software: Prepackaged Software
STG,Sunlands Online Education Group,Consumer Services,Other Consumer Services


In [16]:
def get_symbols_df():
    """Build a symbol-indexed table of listed securities.

    Downloads the NASDAQ, NYSE and AMEX company lists and the ETF list from
    nasdaq.com, plus the NYSE Arca eligible-securities workbook from nyse.com,
    stacks them, keeps the first row seen for each symbol and labels unknown
    sectors as 'Other'. Prints the resulting shape.

    Returns:
        pandas.DataFrame indexed by 'Symbol' (sorted ascending) with the
        columns 'Name', 'Sector' and 'industry'.

    Raises:
        KeyError: if the combined data lacks 'Symbol' or any of the three
            output columns.
    """
    # load symbol lists
    screener_url = 'http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange={}&render=download'
    listings = [pd.read_csv(screener_url.format(exchange)) for exchange in ('nasdaq', 'nyse', 'amex')]
    etf = pd.read_csv('http://www.nasdaq.com/investing/etfs/etf-finder-results.aspx?download=Yes')
    # the Arca workbook uses different headers (note the trailing space in 'Symbol ')
    arca = pd.read_excel(
        'https://www.nyse.com/publicdocs/nyse/markets/nyse-arca/NYSE_Arca_Eligible_Securities.xlsx'
    ).rename(columns={'Security Description': 'Name', 'Symbol ': 'Symbol'})

    # DataFrame.append was removed in pandas 2.0; concat stacks the frames the same way
    us = pd.concat(listings + [etf, arca])

    # keep unique assets (first occurrence wins), index by symbol, sort
    df_symbols = (
        us.drop_duplicates(subset=['Symbol'])
        .set_index('Symbol')
        .sort_index()
    )

    # keep only relevant cols
    df_symbols = df_symbols[['Name', 'Sector', 'industry']]

    # replace unknown sectors; assigning a new column avoids in-place edits on a
    # column selection, which do not reliably write back to the frame
    df_symbols = df_symbols.assign(
        Sector=df_symbols['Sector'].fillna('Other').replace(['n/a'], ['Other'])
    )

    print('symbols', df_symbols.shape)

    return df_symbols

In [17]:
# Download and assemble the symbol table (network access required);
# get_symbols_df() also prints the table's shape.
df_symbols = get_symbols_df()

symbols (8800, 3)


In [18]:
# Narrow the symbol table to the IPO symbols, keeping the order of `symbols`.
# .loc with a list that contains a label missing from the index raises
# KeyError, so symbols absent from the table are reported and left out.
missing_symbols = [s for s in symbols if s not in df_symbols.index]
if missing_symbols:
    print('symbols not in df_symbols, skipping:', missing_symbols)
df_symbols = df_symbols.loc[[s for s in symbols if s in df_symbols.index]]

In [30]:
def _normalize_company_name(name):
    """Upper-case a company name and drop punctuation and extra spaces for comparison."""
    kept = ''.join(ch for ch in str(name).upper() if ch.isalnum() or ch.isspace())
    return ' '.join(kept.split())


ipo = {}  # not used in this cell; kept in case another cell reads it

# open() below fails if the output folder is missing, so create it up front
data_dir = Path("./Data")
data_dir.mkdir(parents=True, exist_ok=True)

# created on first use and then shared by every symbol instead of being
# rebuilt on each iteration
tmpEdgar = None

for counter, x in enumerate(df_symbols.index, start=1):
    print('\n( ' + str(counter) + ' / ' + str(df_symbols.shape[0]) + ' ) ' + x)

    # skip symbols whose filing was already saved by an earlier run
    out_path = data_dir / (x + ".txt")
    if out_path.is_file():
        print(x + ' data already exists, skipping...')
        continue

    # look up candidate EDGAR company names for the listed name
    if tmpEdgar is None:
        tmpEdgar = Edgar()
    listed_name = df_symbols.loc[x, 'Name']
    possible_companies = tmpEdgar.findCompanyName(listed_name)
    print('possible_companies:', possible_companies)

    # validate
    if len(possible_companies) == 0:
        print('no possible companies:', x)
        continue
    if len(possible_companies) > 1:
        # Several candidates: accept only one whose name equals the listed
        # name once case and punctuation are ignored. Taking the first
        # candidate blindly can store an unrelated company's filing under
        # this symbol, so anything still ambiguous is left for manual handling.
        wanted = _normalize_company_name(listed_name)
        exact_matches = [c for c in possible_companies if _normalize_company_name(c) == wanted]
        if not exact_matches:
            print('too many possible companies:', x)
            continue
        name = exact_matches[0]
    else:
        name = possible_companies[0]

    # create company
    cik = tmpEdgar.getCikByCompanyName(name)
    company = Company(name, cik)

    # look for S-1
    print(company.getFilingsUrl(filingType = "S-1"))
    tree = company.getAllFilings(filingType = "S-1")
    docs = edgar.getDocuments(tree, noOfDocuments=5)

    # write file: only the first returned document is stored
    if len(docs) > 0:
        with out_path.open("w", encoding="utf-8") as f:
            f.write(docs[0])
        print('Got data for ' + x)
    else:
        print('no S-1 documents found for ' + x)


( 1 / 10 ) GNPX
GNPX data already exists, skipping...

( 2 / 10 ) IQ
possible_companies: ['IQIYI, INC.']
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001722608&type=S-1&dateb=&owner=include&count=100

( 3 / 10 ) UMRX
possible_companies: ['UNUM THERAPEUTICS INC.', 'UNUM THERAPEUTICS, INC.']
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001622229&type=S-1&dateb=&owner=include&count=100

( 4 / 10 ) ONE
possible_companies: []
no possible companies: ONE

( 5 / 10 ) OPBK
possible_companies: ['CAPITAL DEVELOPMENT BANCORP LTD III', 'CAPITAL DEVELOPMENT BANCORP LTD V', 'CAPITAL DEVELOPMENT BANCORP LTD VII', 'CAPITOL DEVELOPMENT BANCORP LTD II', 'CAPITOL DEVELOPMENT BANCORP LTD IV', 'CAPITOL DEVELOPMENT BANCORP LTD V', 'CAPITOL DEVELOPMENT BANCORP LTD VI', 'CAPITOL DEVELOPMENT BANCORP LTD VII', 'CAPITOL DEVELOPMENT BANCORP LTD VIII', 'CHICOPEE BANCORP, INC.', 'CITIZENS BANCORP ESOP & TRUST', 'COHOES BANCORP INC ESOP', 'COLONIAL BANCORP INC QUALIFIED STOCK OP