In [9]:
# Install into the environment of the running kernel (`!pip3` may target a different interpreter)
%pip install ratelimit



In [2]:
import pandas as pd

import requests
from ratelimit import limits, sleep_and_retry
from bs4 import BeautifulSoup

from tqdm import tqdm

In [3]:
class SecAPI:
    """
    Helper class that downloads pages from the SEC website while staying below
    the number of calls allowed per second.

    Responses are not cached: every call to ``get`` issues a new HTTP request.
    """
    # Call budget from which the rate limiter below is derived
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
    # Seconds to wait on the server before giving up, so a stalled connection
    # cannot block a long download loop forever
    REQUEST_TIMEOUT = 30

    # NOTE(review): no User-Agent header is sent. SEC's EDGAR access guidance
    # reportedly asks clients to declare one — confirm requests are not rejected.

    @staticmethod
    @sleep_and_retry
    # Dividing the call limit by half to avoid coming close to the limit
    @limits(calls=SEC_CALL_LIMIT['calls'] // 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        """Issue one rate-limited GET request and return the ``requests`` response.

        ``sleep_and_retry`` makes the call wait (instead of failing) once the
        budget for the current period is used up.
        """
        return requests.get(url, timeout=SecAPI.REQUEST_TIMEOUT)

    def get(self, url):
        """Return the body of ``url`` as text, fetched through the rate limiter."""
        return self._call_sec(url).text


def get_sec_data(cik, doc_type, date='2020-12-31', start=0, count=60):
    """
    List the filings of one type for a company, read from the EDGAR Atom feed.

    Parameters
    ----------
    cik: str
        CIK of SEC document. 10 character string
    doc_type: str
        Type of SEC document. ('10-K' or '10-Q')
    date: str
        Latest filing date to keep (inclusive). Anything accepted by
        ``pd.to_datetime``.
    start: int
        Value of the ``start`` query parameter sent to EDGAR.
    count: int
        Value of the ``count`` query parameter sent to EDGAR.

    Returns
    -------
    List of tuples containing ('filing-href', 'filing-type', 'filing-date')

    Raises
    ------
    ValueError
        If the response does not contain a ``<feed>`` element (for example when
        an error page is returned instead of the Atom feed).
    """

    # instantiate SecAPI object to help with call limits to SEC website
    sec_api = SecAPI()

    final_date = pd.to_datetime(date)
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
        .format(cik, doc_type, start, count)
    # get data
    sec_data = sec_api.get(rss_url)
    # Convert to a BeautifulSoup object for parsing. Non-ASCII characters are
    # turned into XML numeric character references instead of raising
    # UnicodeEncodeError, so the parser still receives pure-ASCII bytes.
    markup = sec_data.encode('ascii', errors='xmlcharrefreplace')
    feed = BeautifulSoup(markup, 'xml').feed
    if feed is None:
        raise ValueError(
            'No Atom feed found in the SEC response for CIK {} / type {} ({})'
            .format(cik, doc_type, rss_url))

    # list of tuples containing ('filing-href', 'filing-type', 'filing-date')
    entries = []
    # recursive=False only considers <entry> tags that are direct children of <feed>
    for entry in feed.find_all('entry', recursive=False):
        content = entry.content
        filing_date = content.find('filing-date').getText()
        # restrict to files filed on or before the supplied date
        if pd.to_datetime(filing_date) <= final_date:
            entries.append((
                content.find('filing-href').getText(),
                content.find('filing-type').getText(),
                filing_date))

    return entries

def download_docs(ticker_index, ciks, doc_type):
    """
    Download the raw text of every filing of ``doc_type`` for the given tickers.

    Parameters
    ----------
    ticker_index: iterable of str
        Tickers to download.
    ciks: pd.DataFrame
        Table indexed by ticker; the first value of a ticker's row is used as its CIK.
    doc_type: str
        Filing type passed to ``get_sec_data``; only entries whose type is
        exactly this value are downloaded.

    Returns
    -------
    Dict mapping ticker -> {filing date -> filing text}
    """
    api = SecAPI()

    # First collect the filing listings of all tickers ...
    listings = {
        symbol: get_sec_data(ciks.loc[symbol].values[0], doc_type)
        for symbol in ticker_index
    }

    # ... then fetch the documents themselves, one progress bar per ticker.
    raw_fillings_by_ticker = {}
    for symbol, listing in listings.items():
        fetched = raw_fillings_by_ticker[symbol] = {}
        progress = tqdm(listing,
                        desc='Downloading {} {} Fillings'.format(symbol, doc_type),
                        unit='filling')
        for index_url, file_type, file_date in progress:
            if file_type != doc_type:
                continue
            # Swap the '-index.htm' suffix for '.txt'; the second replace drops
            # the 'l' left behind when the suffix was '-index.html'.
            file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
            fetched[file_date] = api.get(file_url)

    return raw_fillings_by_ticker


In [4]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV inputs and the downloaded
# filings used by the cells below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Folder on the mounted Drive that holds the ticker/CIK lookup tables
CIK_DATA_DIR = '/content/drive/MyDrive/abnormal-distribution-project-data/cik_data'

# Read all ciks
ciks = pd.read_csv(CIK_DATA_DIR + '/ciks.csv', dtype='str', index_col='ticker', usecols=['ticker', 'cik'])
ciks['cik'] = ciks['cik'].str.rjust(10, '0')  # Left-pad with 0's so every cik is 10 characters long

# Read ciks from sp-500
sp500 = pd.read_csv(CIK_DATA_DIR + '/sp-components.csv', dtype='str', index_col='ticker', usecols=['ticker', 'cik'])
sp500['cik'] = sp500['cik'].str.rjust(10, '0')

# Read stocks removed from sp500 since 2000 (only the tickers are loaded)
removed_tickers = pd.read_csv(CIK_DATA_DIR + '/sp-removed.csv', dtype='str', index_col='ticker', usecols=['ticker']).index
# Some components can't be found through ciks anymore as their names changed,
# so keep only the removed tickers that still have an entry in `ciks`
# ciks from removed sp-500 companies
sp500_removed = ciks.loc[removed_tickers[removed_tickers.isin(ciks.index)]]


In [6]:
# Preview the first rows of the current and the removed S&P 500 CIK tables
display(sp500.head(), sp500_removed.head())

Unnamed: 0_level_0,cik
ticker,Unnamed: 1_level_1
MMM,66740
ABT,1800
ABBV,1551152
ABMD,815094
ACN,1467373


Unnamed: 0_level_0,cik
ticker,Unnamed: 1_level_1
NBL,72207
ETFC,1015780
HRB,12659
COTY,1024305
KSS,885639


In [15]:

# Split the S&P 500 tickers into batches of 125; the last batch takes the rest
# (presumably one batch per team member — confirm).
ticker_eduardo, ticker_rohit, ticker_jessica, ticker_stuart = (
    sp500.index[slice(lo, hi)]
    for lo, hi in ((None, 125), (125, 250), (250, 375), (375, None))
)

# Same split for the tickers that were removed from the index
ticker_removed1, ticker_removed2 = (
    sp500_removed.index[slice(lo, hi)]
    for lo, hi in ((None, 125), (125, None))
)

# Filing types to download
doc_types = ['10-Q', '10-K']

def download_files(ticker_index, cik_table=None):
    """
    Download the filings of every ticker and save each one as a text file.

    For each type in the module-level ``doc_types`` and each ticker, the filings
    are fetched with ``download_docs`` and written to
    ``<project data>/<doc_type>/<ticker>-<doc_type>-<date>.txt``. Tickers are
    processed one at a time, so files are written as soon as a ticker finishes.

    Parameters
    ----------
    ticker_index: iterable of str
        Tickers to download.
    cik_table: pd.DataFrame, optional
        Table indexed by ticker that provides the CIKs. Defaults to ``sp500``;
        pass e.g. ``sp500_removed`` for tickers that are not in ``sp500``.
    """
    # Fall back to the S&P 500 table so existing single-argument calls keep working
    if cik_table is None:
        cik_table = sp500

    for doc_type in doc_types:
        for download_ticker in ticker_index:
            docs = download_docs([download_ticker], cik_table, doc_type)

            for ticker, filings in docs.items():
                for date, text in filings.items():
                    path = "/content/drive/MyDrive/abnormal-distribution-project-data/{}/{}-{}-{}.txt".format(doc_type, ticker, doc_type, date)
                    # `with` closes the file even if the write fails; the explicit
                    # encoding avoids locale-dependent UnicodeEncodeError
                    with open(path, "w", encoding="utf-8") as text_file:
                        text_file.write(text)
        

# Download and save the filings for the first batch of S&P 500 tickers
download_files(ticker_eduardo) 
      