In [1]:
# Import packages
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import unicodedata
import codecs
import re

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
def _process_text(text):
    """
        Preprocess Text
    """
    text = unicodedata.normalize("NFKD", text) # Normalize
    text = '\n'.join(text.splitlines()) # Let python take care of unicode break lines

    # Convert to upper
    text = text.upper() # Convert to upper

    # Take care of breaklines & whitespaces combinations due to beautifulsoup parsing
    text = re.sub(r'[ ]+\n', '\n', text)
    text = re.sub(r'\n[ ]+', '\n', text)
    text = re.sub(r'\n+', '\n', text)

    # To find MDA section, reformat item headers
    text = text.replace('\n.\n','.\n') # Move Period to beginning

    text = text.replace('\nI\nTEM','\nITEM')
    text = text.replace('\nITEM\n','\nITEM ')
    text = text.replace('\nITEM  ','\nITEM ')

    text = text.replace(':\n','.\n')

    # Math symbols for clearer looks
    text = text.replace('$\n','$')
    text = text.replace('\n%','%')

    # Reformat
    text = text.replace('\n','\n\n') # Reformat by additional breakline

    return text

In [10]:
# User-Agent sent with every request so the server can identify the client.
headers = {'User-Agent':"AnthonyNing/1.0 (pn2189@nyu.edu; For educational purposes)"}

def download_txt(url, doc_perm, f_type, timeout=30):
    """
        Download one filing, convert it to cleaned plain text and store it
        as ./txt/<doc_perm>-<f_type>.txt.

        Parameters
        ----------
        url : str
            Address of the filing to download.
        doc_perm : any
            Identifier used as the first part of the output file name
            (converted with str()).
        f_type : str
            Form type, used as the second part of the output file name.
        timeout : float, optional
            Seconds to wait for the server before giving up (default 30).

        Returns
        -------
        bool
            True when the text file is available locally, either because
            it already existed or because it was just written. False when
            the download, the parsing or the write failed; the error is
            printed and not raised.
    """
    text_path = os.path.join('./txt', str(doc_perm)+'-'+f_type+'.txt')

    if os.path.exists(text_path):
        print("Already exists, skipping {}".format(url))
        # The file is usable, so report success; returning None here would
        # make callers drop already-downloaded filings on a re-run.
        return True

    print("Downloading & Parsing {}".format(url))

    try:
        # Make sure the output directory exists on a fresh checkout
        os.makedirs(os.path.dirname(text_path), exist_ok=True)

        # The request is inside the try block so that a network error on
        # one filing does not abort the caller's loop.
        r = requests.get(url, headers=headers, timeout=timeout)
        # Reject 4xx/5xx responses instead of saving an error page as text
        r.raise_for_status()

        # Parse html with Beautiful Soup
        soup = BeautifulSoup(r.content, "html.parser")
        text = _process_text(soup.get_text("\n"))

        # Write to a temporary name first and rename afterwards: an
        # interrupted write must not leave a truncated file that the
        # existence check above would later treat as complete.
        partial_path = text_path + '.part'
        with codecs.open(partial_path, 'w', encoding='utf-8') as fout:
            fout.write(text)
        os.replace(partial_path, text_path)
        return True

    # Exception rather than BaseException, so that Ctrl-C / kernel
    # interrupt still stops a long download loop.
    except Exception as e:
        print("{} parsing failed: {}".format(url,e))
        return False

In [11]:
# Read the list of filings to download (one row per filing).
# Later cells read the columns 'filename', 'permno', 'form_type',
# 'company_name', 'ticker', 'cik' and 'filed_date' from this frame.
sp_files = pd.read_excel('sp500_test.xlsx')

In [12]:
prefix = 'http://www.sec.gov/Archives'

# One list per output column; a row is added to every list only for
# filings for which download_txt returned a truthy value.
company_name = []
tik_lst = []
cik_lst = []
per_lst = []
time = []
form = []
file = []

# Source column in sp_files -> list that collects its values
column_to_list = {
    'company_name': company_name,
    'ticker': tik_lst,
    'cik': cik_lst,
    'permno': per_lst,
    'filed_date': time,
    'form_type': form,
    'filename': file,
}

# Iterate over the actual index labels so that .loc also works when the
# frame does not carry the default 0..n-1 index.
for i in sp_files.index:

    # Build the URL with explicit '/' separators. os.path.join is meant for
    # filesystem paths: it inserts a backslash on Windows and discards the
    # prefix altogether when the second part starts with '/'.
    url = prefix.rstrip('/') + '/' + sp_files.loc[i, 'filename'].lstrip('/')
    success = download_txt(url, sp_files.loc[i, 'permno'], sp_files.loc[i, 'form_type'])

    if success:
        for column, bucket in column_to_list.items():
            bucket.append(sp_files.loc[i, column])

Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1048911/0000950170-23-071495.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1090872/0001090872-23-000020.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1341439/0000950170-23-069682.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1467373/0001467373-23-000403.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1596783/0001596783-23-000185.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1596783/0001596783-23-000186.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1601046/0001601046-23-000134.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1645590/0001645590-23-000117.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/16732/0000016732-23-000176.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/1730168/0001730168-23-000096.txt
Downloading & Parsing http://www.sec.gov/Archives/edgar/data/2

In [13]:
# Collect the per-column lists filled by the download loop into a single
# table. Key order below is the column order of the resulting csv.
file_dict = dict(
    form_type=form,
    company_name=company_name,
    permno=per_lst,
    ticker=tik_lst,
    cik=cik_lst,
    filed_date=time,
    filename=file,
)
df = pd.DataFrame(data=file_dict)

# Persist the table as csv, without the row index
df.to_csv(path_or_buf='sp500_test_2.csv', index=False)