# 1. Data Collection - Part 1

In this notebook, I scrape filing data from the SEC EDGAR website and save each company's 10-K report to individual files for further formatting.

## Table of Contents

1. [Get a list of filings](#Get-a-list-of-filings)
1. [Parse a filing](#Parse-a-filing)
1. [Download reports](#Download-reports)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import html5lib
import numpy as np
from IPython.display import display, Markdown
import time
import sys

## Get a list of filings

In [2]:
# Column titles expected in the header row of the index, in left-to-right order.
FIELDS = ['Company Name', 'Form Type', 'CIK', 'Date Filed', 'URL']

def get_field_coords(field_names_str):
    '''Finds where each FIELDS column title starts inside a header line.

    Args:
        field_names_str: the header line that contains every title listed in FIELDS.

    Returns:
        A list of character offsets, one per entry of FIELDS and in the same order.

    Raises:
        ValueError: if any title from FIELDS does not occur in the line.
    '''

    return [field_names_str.index(title) for title in FIELDS]

def unpack_fields(fields_str, coords):
    '''Cuts one fixed-width row into its column values.

    Args:
        fields_str: a single row of the table as text.
        coords: starting character offset of every column, left to right.

    Returns:
        A list with one whitespace-stripped string per offset in coords.

    Raises:
        IndexError: if coords is empty.
    '''

    values = []
    # every column except the last ends where the next one begins
    for left, right in zip(coords, coords[1:]):
        values.append(fields_str[left:right].strip())
    # the last column runs to the end of the row
    values.append(fields_str[coords[-1]:].strip())

    return values

In [3]:
def get_crawler_index(year=2019, quarter=2):
    '''Downloads the EDGAR crawler index for one quarter and returns it as text.

    Args:
        year: four-digit year of the index to fetch.
        quarter: quarter number, inserted after "QTR" in the URL.

    Returns:
        The body of the crawler.idx response as a string.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: on connection problems or timeout.
    '''

    url = 'https://www.sec.gov/Archives/edgar/full-index/{}/QTR{}/crawler.idx'.format(year, quarter)
    # NOTE(review): EDGAR may reject automated clients that do not send a
    # descriptive User-Agent header - confirm whether one needs to be set here.
    # The timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    # Fail here with the HTTP status instead of handing an error page to the parser.
    r.raise_for_status()

    return r.text

In [4]:
def get_n_filings(year=2019, quarter=2, n = 100):
    '''Pulls up to a given number of 10-K filings from the index for a given year and quarter.

    Args:
        year: year of the index to download.
        quarter: quarter of the index to download.
        n: maximum number of 10-K filings to return.

    Returns:
        A list of filings in index order, each a list of stripped field values
        laid out like FIELDS. Holds fewer than n entries when the index does not
        contain n rows whose form type is exactly '10-K'.
    '''

    lines = get_crawler_index(year, quarter).split('\n')
    # line 7 is the header row naming the columns; its offsets define the fixed-width layout
    coords = get_field_coords(lines[7])

    filings = []
    # data rows start at line 9 (line 8 is presumably a separator - TODO confirm)
    for line in lines[9:]:
        # stop as soon as enough filings were collected; running out of rows
        # simply ends the loop instead of raising StopIteration
        if len(filings) >= n:
            break
        fields = unpack_fields(line, coords)
        if fields[1]=='10-K':
            filings.append(fields)
    return filings

In [5]:
def get_all_filings(year=2019, quarter=2):
    '''Collects every 10-K entry listed in the index of one year and quarter.

    Args:
        year: year of the index to download.
        quarter: quarter of the index to download.

    Returns:
        A list of filings in index order, each a list of stripped field values
        whose form type (second field) is exactly '10-K'.
    '''

    index_lines = get_crawler_index(year, quarter).split('\n')
    # the header row sits on line 7; data rows follow from line 9 onwards
    offsets = get_field_coords(index_lines[7])

    parsed_rows = (unpack_fields(row, offsets) for row in index_lines[9:])
    return [row for row in parsed_rows if row[1]=='10-K']

In [6]:
# Pull every 10-K entry from the EDGAR index using the function defaults
# (year=2019, quarter=2). This issues a network request.
# `filings` is a notebook-level variable that the download cell iterates over.
filings = get_all_filings()
print('Total filings found: {}'.format(len(filings)))
# show the first five entries as a sanity check of the parsed fields
filings[:5]

Total filings found: 1117


[['12 Retech Corp',
  '10-K',
  '1627611',
  '2019-04-15',
  'https://www.sec.gov/Archives/edgar/data/1627611/0001493152-19-005320-index.htm'],
 ['1847 Holdings LLC',
  '10-K',
  '1599407',
  '2019-04-15',
  'https://www.sec.gov/Archives/edgar/data/1599407/0001477932-19-001700-index.htm'],
 ['1895 Bancorp of Wisconsin, Inc.',
  '10-K',
  '1751692',
  '2019-04-01',
  'https://www.sec.gov/Archives/edgar/data/1751692/0001193125-19-094174-index.htm'],
 ['4M Carbon Fiber Corp.',
  '10-K',
  '1635965',
  '2019-04-17',
  'https://www.sec.gov/Archives/edgar/data/1635965/0001635965-19-000011-index.htm'],
 ['8X8 INC /DE/',
  '10-K',
  '1023731',
  '2019-05-21',
  'https://www.sec.gov/Archives/edgar/data/1023731/0001023731-19-000037-index.htm']]

## Parse a filing

In [7]:
def get_10k_url(filing_url):
    '''Parses the 10-K document URL out of a filing index page.

    Downloads the page at filing_url, looks up the table with class
    'tableFile', finds the row containing a cell whose text is exactly '10-K'
    and takes the first link of that row.

    Args:
        filing_url: URL of the filing index page (the 'URL' field of a filing).

    Returns:
        'https://www.sec.gov' followed by the href of the 10-K link
        (the href is presumably site-relative, starting with '/' - TODO confirm).

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: on connection problems or timeout.
        ValueError: if the page has no document table, no '10-K' row or no link in it.
    '''

    domain_url = 'https://www.sec.gov'
    r = requests.get(url=filing_url, timeout=30)
    # surface HTTP errors here rather than parsing an error page
    r.raise_for_status()
    filing_soup = BeautifulSoup(r.text, features='html.parser')

    # each lookup may come back empty; check step by step so that a page with an
    # unexpected layout gives a clear error instead of an AttributeError on None
    table = filing_soup.find("table", class_='tableFile')
    cell = table.find('td', string='10-K') if table is not None else None
    link = cell.parent.a if cell is not None else None
    url_10k = link.get('href') if link is not None else None
    if not url_10k:
        raise ValueError('No 10-K document link found at {}'.format(filing_url))

    return domain_url + url_10k

In [8]:
def get_html_from_url(url):
    '''Pulls the HTML of a report from the website given its URL.

    Args:
        url: address of the document to download.

    Returns:
        The response body as a string.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: on connection problems or timeout.
    '''

    # fetch the address passed in, not a notebook-level variable, so the
    # function works no matter which names exist in the calling cell
    r = requests.get(url, timeout=30)
    # do not hand an error page back to the caller as if it were the report
    r.raise_for_status()

    return r.text

In [9]:
def get_text_from_html(html):
    '''Strips the markup from an HTML document and returns its text.

    Args:
        html: the HTML document as a string.

    Returns:
        The text extracted by BeautifulSoup's 'html.parser' backend, with the
        individual text pieces joined by newline characters.
    '''

    return BeautifulSoup(html, features='html.parser').get_text('\n')

In [10]:
def make_dir(directory):
    '''Creates a local folder on the hard drive if it does not exist yet.

    Missing parent folders are created as well, and calling the function for a
    folder that already exists is a no-op.

    Args:
        directory: path of the folder to create.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    '''

    # a single call avoids the gap between checking for the folder and creating it
    os.makedirs(directory, exist_ok=True)

In [11]:
def save_file(name, contents):
    '''Saves a text file given a file name and file contents.

    The file is always written as UTF-8 so that the result does not depend on
    the platform's default encoding. An existing file is overwritten.

    Args:
        name: path of the file to write.
        contents: text to store in the file.
    '''

    # explicit encoding: scraped reports can contain characters that a
    # locale-specific default codec is unable to encode
    with open(name, 'w', encoding='utf-8') as fh:
        fh.write(contents)

In [12]:
def wait(delays = [7, 4, 6, 2, 10, 19]):
    '''Sleeps for a duration picked at random from the given candidates.

    Args:
        delays: candidate pause lengths, in seconds; one of them is chosen
            with np.random.choice on every call.
    '''

    time.sleep(np.random.choice(delays))

## Download reports

In [13]:
# Download every report and store it in its own folder under ./files.
# For each filing two pages are requested: the filing index page (to locate the
# 10-K document) and the 10-K document itself. Each request is preceded by a
# random pause from wait().
# NOTE(review): the loop has no resume or error handling - an exception on any
# filing stops the whole run; confirm that this is acceptable.
make_dir('files')
for i, filing in enumerate(filings):
    # end='\r' returns to the start of the line so the progress message is overwritten in place
    print('Processing filing #{}: {} ...              '.format(i+1, filing[0]), end='\r', flush=True)
    # one folder per company name; '/' is replaced so that the name stays a single path component
    # NOTE(review): two filings with the same company name would share a folder
    # and overwrite each other's files - verify against the index.
    path = 'files/'+filing[0].replace('/','_')
    make_dir(path)
    wait()
    # filing[-1] is the last field of the index row, i.e. the 'URL' entry of FIELDS
    # NOTE(review): keep the name `url_10k` - helper functions may rely on this
    # notebook-level variable; verify before renaming.
    url_10k = get_10k_url(filing[-1])
    # metadata: the index fields plus the resolved 10-K URL, one value per line
    save_file(path+'/filing.txt', '\n'.join(filing+[url_10k]))
    wait()
    html_10k = get_html_from_url(url_10k)
    # keep the raw HTML next to the tag-free text version
    save_file(path+'/10k.html', html_10k)
    text_10k = get_text_from_html(html_10k)
    save_file(path+'/10k.txt', text_10k)

Processing filing #1117: urban-gro, Inc. ...                                                          