In [1]:
# Importing built-in libraries (no need to install these)
import sys
import re
import os
from time import gmtime, strftime
from datetime import datetime, timedelta
import unicodedata

# Importing libraries you need to install
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm
import glob
import shutil
import re
from dateutil.parser import parse
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from bs4 import NavigableString
import html2text
from nltk import tokenize
from collections import namedtuple
import math

In [2]:
# Project root: every relative path used in this notebook is resolved against it.
original_directory = "/Users/andrewwang/MyDocuments/10Q_Scraping"
os.chdir(original_directory)

# Folders that hold the downloaded annual (10-K) and quarterly (10-Q) filings.
pathname_10k = f"{original_directory}/10_K_Docs"
pathname_10q = f"{original_directory}/10_Q_Docs"

In [40]:
# Read one ticker per row: the first whitespace-delimited token of each line.
# Blank lines are skipped, because `"".split()[0]` would raise IndexError.
# NOTE(review): a header row, if the CSV has one, is read as a ticker too - confirm the file layout.
with open("SP500_Tickers.csv") as f:
    tickers = [row.split()[0] for row in f if row.strip()]

In [41]:
def MapTickerToCik(tickers):
    
    '''
    Looks up the Central Index Key (CIK) of each ticker
    on EDGAR's company-browse page.
    
    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to look up (one HTTP request each).
        
    Returns
    -------
    dict
        Maps the lower-cased ticker to the first 10-digit CIK
        found in the response text. Tickers whose request fails
        or whose response contains no CIK are left out.
    
    '''
    
    url = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    cik_re = re.compile(r'.*CIK=(\d{10}).*')
    
    # Without a timeout a single stalled connection blocks the run forever
    request_timeout = 30

    cik_dict = {}
    for ticker in tqdm(tickers): # Use tqdm lib for progress bar
        try:
            response = requests.get(url.format(ticker), timeout=request_timeout)
        except requests.RequestException as exc:
            # One bad request should not throw away the tickers already mapped
            tqdm.write(f"Skipping {ticker}: request failed ({exc})")
            continue
        
        # search() returns the same first match that findall()[0] did
        match = cik_re.search(response.text)
        if match:
            cik_dict[str(ticker).lower()] = match.group(1)
    
    return cik_dict

In [42]:
# Build the ticker -> CIK lookup (one EDGAR request per ticker, so this cell is slow)
cik_dict = MapTickerToCik(tickers)

100%|██████████| 505/505 [05:24<00:00,  1.56it/s] 


In [43]:
# Reshape the ticker -> CIK dict into a one-column table indexed by ticker
ticker_cik_df = pd.DataFrame.from_dict(data=cik_dict, orient='index').reset_index()
ticker_cik_df.columns = ['ticker', 'cik']
# Keep every CIK as a string
ticker_cik_df['cik'] = list(map(str, ticker_cik_df['cik']))
ticker_cik_df = ticker_cik_df.set_index('ticker')
ticker_cik_df.head()

Unnamed: 0_level_0,cik
ticker,Unnamed: 1_level_1
mmm,66740
abt,1800
abbv,1551152
abmd,815094
acn,1467373


In [44]:
# Persist the ticker -> CIK mapping (written relative to the current working directory)
ticker_cik_df.to_csv('ticker_cik_data.csv')

In [45]:
def WriteLogFile(log_file_name, text):
    
    '''
    Helper function.
    Appends one entry to the log file that collects all notes
    and error messages from a scraping "session".
    
    Parameters
    ----------
    log_file_name : str
        Name of the log file (should be a .txt file).
        The file is created if it does not exist yet.
    text : str
        Text to write to the log file. A trailing newline
        is added when the text does not already end with one.
        
    Returns
    -------
    None.
    
    '''
    
    # Several callers pass messages without a trailing newline; terminate
    # them here so consecutive entries do not run together on one line.
    if text and not text.endswith('\n'):
        text += '\n'
    
    with open(log_file_name, "a") as log_file:
        log_file.write(text)

    return

In [46]:
def ScrapeDocument(ticker, browse_url_base, filing_url_base, doc_url_base, cik, log_file_name, is10K, num_files_to_scrape):
    
    '''
    Scrapes 10-K / 10-K405 filings (or 10-Q filings) for a
    particular company from EDGAR.
    
    Each downloaded document is saved as <cik>_<filing date>.html
    (or .txt) in a folder named after the ticker. That folder is
    created inside the working directory that is current when the
    function is called; the working directory itself is never changed.
    The filing date and period of report of every processed filing are
    also written into the module-level ``ticker_cik_df``.
    
    Parameters
    ----------
    ticker : str
        Ticker symbol; used as the folder name and as the row
        label in ``ticker_cik_df``.
    browse_url_base : str
        Base URL for browsing EDGAR.
    filing_url_base : str
        Base URL for filings listings on EDGAR.
    doc_url_base : str
        Base URL for one filing's document tables
        page on EDGAR.
    cik : str
        Central Index Key.
    log_file_name : str
        Name of the log file (should be a .txt file). A relative
        name is resolved against the working directory at call time.
    is10K : bool
        True to scrape 10-K / 10-K405 filings, False for 10-Q filings.
    num_files_to_scrape : int
        Stop after this many documents have been saved.
        
    Returns
    -------
    None.
    
    '''
    
    # Without a timeout a single stalled connection blocks the whole session
    request_timeout = 30
    
    # Resolve every path once against the directory chosen by the caller.
    # Hopping between folders with os.chdir made the result depend on which
    # branch ran last and pointed at a non-existent folder named after the CIK.
    base_dir = os.getcwd()
    ticker_dir = os.path.join(base_dir, ticker)
    log_path = os.path.join(base_dir, log_file_name)
    
    form_types = ('10-K', '10-K405') if is10K else ('10-Q',)
    column_prefix = '10-K' if is10K else '10-Q'
    
    # Create the folder for this ticker (note it if it already exists)
    try:
        os.mkdir(ticker_dir)
    except FileExistsError:
        WriteLogFile(log_path, f"Already made folder for ticker {ticker}\n")
        
    # Request list of filings
    browse_url = browse_url_base.format(cik)
    res = requests.get(browse_url, timeout=request_timeout)
    
    # If the request failed, log the failure and exit
    if res.status_code != 200:
        try:
            os.rmdir(ticker_dir) # remove empty dir
        except OSError:
            # Folder still holds files from an earlier run: keep it
            pass
        text = "Request failed with error code " + str(res.status_code) + \
               "\nFailed URL: " + browse_url + '\n'
        WriteLogFile(log_path, text)
        return

    # If the request doesn't fail, continue...
    
    # Parse the response HTML using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

    # Extract all tables from the response
    html_tables = soup.find_all('table')
    
    # Check that the table we're looking for exists
    # If it doesn't, exit
    if len(html_tables) < 3:
        return
    
    # Parse the Filings table
    filings_table = pd.read_html(str(html_tables[2]), header=0)[0]
    filings_table['Filings'] = [str(x) for x in filings_table['Filings']]

    # Keep only the requested form types; copy so that adding a
    # column below does not write into a slice of the parsed table
    filings_table = filings_table[filings_table['Filings'].isin(form_types)].copy()
        
    # If filings table doesn't have any matching filings, exit
    if len(filings_table) == 0:
        return
    
    # Get accession number for each filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]

    num_files_scraped = 0
    
    # Iterate through each filing and 
    # scrape the corresponding document...
    for _, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        filing_url = filing_url_base.format(cik, acc_no)
        docs_page = requests.get(filing_url, timeout=request_timeout)
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + filing_url + '\n'
            WriteLogFile(log_path, text)
            continue

        # If request succeeds, keep going...
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        
        filing_date = _GetLabelledValue(docs_page_soup, "Filing (D|d)ate")
        period_of_report_date = _GetLabelledValue(docs_page_soup, "Period (O|o)f (R|r)eport")
        
        # Without both dates the saved file cannot be matched to its
        # metadata later, so log the problem and skip to the next filing
        if filing_date is None or period_of_report_date is None:
            text = 'Filing date / period of report not found for CIK: {} and Acc_No: {}'.format(cik, acc_no) + '\n'
            WriteLogFile(log_path, text)
            continue
        
        # Record the dates under the slot of the next document to be saved
        # (a filing skipped below is overwritten by the following one)
        slot = num_files_scraped + 1
        ticker_cik_df.at[ticker, f'{column_prefix} #{slot} Filing Date'] = filing_date
        ticker_cik_df.at[ticker, f'{column_prefix} #{slot} Period'] = period_of_report_date
        
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables) == 0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        # Get the entries of the requested form types for the filing
        docs_table = docs_table[docs_table['Type'].isin(form_types)]
        
        # If there aren't any, skip to the next filing
        if len(docs_table) == 0:
            continue
        
        # Otherwise grab the first document
        docname = docs_table.iloc[0]['Document']
        
        # If that first entry is unavailable,
        # log the failure and skip to the next filing
        if str(docname) == 'nan':
            text = 'File with CIK: {} and Acc_No: {} is unavailable'.format(cik, acc_no) + '\n'
            WriteLogFile(log_path, text)
            continue       
        
        # If it is available, continue...
        docname = docname.split()[0]
        
        # Request the file
        doc_url = doc_url_base.format(cik, acc_no.replace('-', ''), docname)
        doc_response = requests.get(doc_url, timeout=request_timeout)
        
        # If the request fails, log the failure and skip to the next filing
        if doc_response.status_code != 200:
            text = "Request failed with error code " + str(doc_response.status_code) + \
                   "\nFailed URL: " + doc_url + '\n'
            WriteLogFile(log_path, text)
            continue
        
        # If it succeeds, keep going...
        
        # Save the file in appropriate format (TXT or HTML)
        extension = '.txt' if '.txt' in docname else '.html'
        filename = str(cik) + '_' + str(row['Filing Date']) + extension
        with open(os.path.join(ticker_dir, filename), 'w') as out_file:
            out_file.write(doc_response.text)
           
        num_files_scraped = num_files_scraped + 1
        
        if num_files_scraped == num_files_to_scrape:
            break
        
    return


def _GetLabelledValue(soup, label_pattern):
    
    '''
    Helper function.
    Returns the text of the <div> that follows the first text node
    matching label_pattern, or None when either one is missing.
    
    '''
    
    label = soup.find(text=re.compile(label_pattern))
    if label is None:
        return None
    
    value_div = label.parent.findNext('div')
    if value_div is None:
        return None
    
    return value_div.get_text()

In [49]:
def delete_contents(foldername):
    
    '''
    Deletes everything inside a folder but keeps the folder itself.
    
    Parameters
    ----------
    foldername : str
        Path of the folder to empty. A path that is not an
        existing directory is silently ignored.
        
    Returns
    -------
    None.
    
    '''
    
    if not os.path.isdir(foldername):
        return
    
    # Only the top level needs visiting: rmtree already removes whole
    # subtrees, so walking into them afterwards would visit deleted paths.
    with os.scandir(foldername) as entries:
        top_level = list(entries)
    
    for entry in top_level:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            # Regular files and symlinks (a link is removed, not its target)
            os.unlink(entry.path)
        


# Empty both download folders before scraping.
# Destructive: every file and sub-folder inside them is removed.
delete_contents(pathname_10k)
delete_contents(pathname_10q)

In [50]:
os.chdir(original_directory)
# Run the function to scrape 10-Ks
# Define parameters
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K'
filing_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10k)

# Initialize log file
# (log file name = the time we initiate scraping session)
# The path is absolute so every entry lands in this one file,
# whatever the working directory is when it gets written.
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = os.path.join(pathname_10k, 'log '+time+'.txt')
with open(log_file_name, 'a'):
    pass  # just make sure the file exists

# Iterate over CIKs and scrape 10-Ks
for ticker,row in tqdm(ticker_cik_df.iterrows(), total=len(ticker_cik_df)):
    try:
        ScrapeDocument(ticker=ticker,
                       browse_url_base=browse_url_base_10k, 
                       filing_url_base=filing_url_base_10k, 
                       doc_url_base=doc_url_base_10k, 
                       cik=row['cik'],
                       log_file_name=log_file_name,
                       is10K = True,
                       num_files_to_scrape = 2)
    except Exception as exc:
        # Keep going with the next ticker, but record what went wrong.
        # (Exception, not a bare except, so Ctrl-C / kernel interrupt still stops the run.)
        WriteLogFile(log_file_name, f"Exception on {ticker}: {exc!r}\n")
os.chdir(original_directory)

500it [45:17,  5.43s/it]


In [51]:
os.chdir(original_directory)
# Run the function to scrape 10-Qs
# Define parameters
browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-Q&count=1000'
filing_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10q)

# Initialize log file
# (log file name = the time we initiate scraping session)
# The path is absolute so every entry lands in this one file,
# whatever the working directory is when it gets written.
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = os.path.join(pathname_10q, 'log '+time+'.txt')
with open(log_file_name, 'a'):
    pass  # just make sure the file exists

# Iterate over CIKs and scrape 10-Qs
for ticker,row in tqdm(ticker_cik_df.iterrows(), total=len(ticker_cik_df)):
    try:
        ScrapeDocument(ticker=ticker,
                       browse_url_base=browse_url_base_10q, 
                       filing_url_base=filing_url_base_10q, 
                       doc_url_base=doc_url_base_10q, 
                       cik=row['cik'],
                       log_file_name=log_file_name,
                       is10K = False,
                       num_files_to_scrape = 6)
    except Exception as exc:
        # Keep going with the next ticker, but record what went wrong.
        # (Exception, not a bare except, so Ctrl-C / kernel interrupt still stops the run.)
        WriteLogFile(log_file_name, f"Exception on {ticker}: {exc!r}\n")
    
os.chdir(original_directory)

500it [1:35:54, 11.51s/it]


In [52]:
# Save the mapping together with the filing-date / period columns added while scraping,
# then preview the first rows
ticker_cik_df.to_csv('ticker_data.csv')
ticker_cik_df.head()

Unnamed: 0_level_0,cik,10-K #1 Filing Date,10-K #1 Period,10-K #2 Filing Date,10-K #2 Period,10-Q #1 Filing Date,10-Q #1 Period,10-Q #2 Filing Date,10-Q #2 Period,10-Q #3 Filing Date,10-Q #3 Period,10-Q #4 Filing Date,10-Q #4 Period,10-Q #5 Filing Date,10-Q #5 Period,10-Q #6 Filing Date,10-Q #6 Period
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
mmm,66740,2020-02-06,2019-12-31,2019-02-07,2018-12-31,2020-04-28,2020-03-31,2019-10-25,2019-09-30,2019-07-26,2019-06-30,2019-04-26,2019-03-31,2018-10-25,2018-09-30,2018-07-26,2018-06-30
abt,1800,2020-02-21,2019-12-31,2019-02-22,2018-12-31,2020-04-29,2020-03-31,2019-10-31,2019-09-30,2019-07-31,2019-06-30,2019-05-01,2019-03-31,2018-10-31,2018-09-30,2018-08-01,2018-06-30
abbv,1551152,2020-02-21,2019-12-31,2019-02-27,2018-12-31,2020-05-08,2020-03-31,2019-11-06,2019-09-30,2019-08-05,2019-06-30,2019-05-03,2019-03-31,2018-11-07,2018-09-30,2018-08-07,2018-06-30
abmd,815094,2020-05-21,2020-03-31,2019-05-23,2019-03-31,2020-02-06,2019-12-31,2019-10-31,2019-09-30,2019-08-01,2019-06-30,2019-02-05,2018-12-31,2018-11-06,2018-09-30,2018-08-02,2018-06-30
acn,1467373,2019-10-29,2019-08-31,2018-10-24,2018-08-31,2020-03-19,2020-02-29,2019-12-19,2019-11-30,2019-06-27,2019-05-31,2019-03-28,2019-02-28,2018-12-20,2018-11-30,2018-06-28,2018-05-31


# Scrape text

In [4]:
os.chdir(original_directory)

analyzer = SentimentIntensityAnalyzer()

# Valence assigned to every word of the corresponding word list
# NOTE(review): positive words get 0.0 rather than a positive weight - confirm this is intended
negative_score = -4.0
positive_score = 0.0
uncertain_score = -2.0

# Each list is a header-less CSV; only its first column is used
negative_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Negative.csv', header=None).iloc[:,0]
positive_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Positive.csv', header=None).iloc[:,0]
uncertain_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Uncertain.csv', header=None).iloc[:,0]

# Lower-cased word -> score, one dict per list
negative_word_scores = {word.lower(): negative_score for word in negative_words}
positive_word_scores = {word.lower(): positive_score for word in positive_words}
uncertain_word_scores = {word.lower(): uncertain_score for word in uncertain_words}

# Merge in the order negative, positive, uncertain: a word that appears in
# several lists ends up with the score of the last list that contains it
financial_word_dict = dict(negative_word_scores)
financial_word_dict.update(positive_word_scores)
financial_word_dict.update(uncertain_word_scores)

# Add the financial words to the analyzer's lexicon, replacing any entries it already has for them
analyzer.lexicon.update(financial_word_dict)

In [5]:
def get_all_text(ticker, is10K, file_name):
    
    '''
    Returns the sentences of one downloaded filing.
    
    Looks in the ticker's folder under pathname_10k (or pathname_10q)
    for .html files whose name ends with file_name + ".html", reads the
    first match in sorted order, drops every <table> and splits the
    remaining text into sentences. Filings saved as .txt are not read.
    The working directory is not changed.
    
    Parameters
    ----------
    ticker : str
        Ticker symbol (name of the folder holding the filings).
    is10K : bool
        True to look in the 10-K folder, False for the 10-Q folder.
    file_name : str
        End of the file name to look for, without the ".html"
        extension (the caller passes the filing date).
        
    Returns
    -------
    list of str
        The sentences, or an empty list when no file matches
        or the file cannot be read or parsed.
    
    '''
    
    docs_root = pathname_10k if is10K else pathname_10q
    
    try:
        ticker_dir = os.path.join(docs_root, ticker)
        wanted_suffix = file_name + ".html"
        matches = sorted(name for name in os.listdir(ticker_dir) if name.endswith(wanted_suffix))
        if not matches:
            return []

        with open(os.path.join(ticker_dir, matches[0])) as file:
            soup = bs.BeautifulSoup(file, "html.parser")
        
        # Drop all tables so only the remaining text is tokenized
        for table in soup.find_all("table"):
            table.decompose()

        cleaned = soup.get_text('\n').replace('\n', ' ')
        return tokenize.sent_tokenize(cleaned)
    except Exception:
        # Best effort: a missing folder, a non-string file_name or an unreadable
        # file yields no sentences. Exception (not a bare except) lets a
        # keyboard / kernel interrupt stop the calling loop.
        return []

In [6]:
os.chdir(original_directory)

# Reload the scraped filing metadata from disk, indexed by ticker
ticker_data = pd.read_csv('ticker_data.csv').set_index("ticker")
ticker_data.head()

Unnamed: 0_level_0,cik,10-K #1 Filing Date,10-K #1 Period,10-K #2 Filing Date,10-K #2 Period,10-Q #1 Filing Date,10-Q #1 Period,10-Q #2 Filing Date,10-Q #2 Period,10-Q #3 Filing Date,10-Q #3 Period,10-Q #4 Filing Date,10-Q #4 Period,10-Q #5 Filing Date,10-Q #5 Period,10-Q #6 Filing Date,10-Q #6 Period
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
mmm,66740,2020-02-06,2019-12-31,2019-02-07,2018-12-31,2020-04-28,2020-03-31,2019-10-25,2019-09-30,2019-07-26,2019-06-30,2019-04-26,2019-03-31,2018-10-25,2018-09-30,2018-07-26,2018-06-30
abt,1800,2020-02-21,2019-12-31,2019-02-22,2018-12-31,2020-04-29,2020-03-31,2019-10-31,2019-09-30,2019-07-31,2019-06-30,2019-05-01,2019-03-31,2018-10-31,2018-09-30,2018-08-01,2018-06-30
abbv,1551152,2020-02-21,2019-12-31,2019-02-27,2018-12-31,2020-05-08,2020-03-31,2019-11-06,2019-09-30,2019-08-05,2019-06-30,2019-05-03,2019-03-31,2018-11-07,2018-09-30,2018-08-07,2018-06-30
abmd,815094,2020-05-21,2020-03-31,2019-05-23,2019-03-31,2020-02-06,2019-12-31,2019-10-31,2019-09-30,2019-08-01,2019-06-30,2019-02-05,2018-12-31,2018-11-06,2018-09-30,2018-08-02,2018-06-30
acn,1467373,2019-10-29,2019-08-31,2018-10-24,2018-08-31,2020-03-19,2020-02-29,2019-12-19,2019-11-30,2019-06-27,2019-05-31,2019-03-28,2019-02-28,2018-12-20,2018-11-30,2018-06-28,2018-05-31


In [12]:
# Work from the project root
os.chdir(original_directory)

def is_valid_date(date_str):
    '''
    Tells whether a date cell holds a value.
    
    Returns True only when date_str is exactly of type str; anything
    else (for example a float NaN or None) gives False. The content
    of the string is not checked.
    '''
    value_type = type(date_str)
    return value_type is str
    

companies = list(ticker_data.index)
DataSource = namedtuple("DataSource", ["is10K", "documentnumber"])
# Document slots to read per company: 10-K #1-#2 and 10-Q #1-#6
datasources = [DataSource(True, i) for i in range(1, 3)] + [DataSource(False, i) for i in range(1, 7)]

# Collect one dict per sentence and build the DataFrame once at the end
rows_list = []
for ticker in tqdm(companies):
    for source in datasources:
        prefix = "10-K" if source.is10K else "10-Q"
        as_of_period = ticker_data.loc[ticker, f"{prefix} #{source.documentnumber} Filing Date"]
        observation_period = ticker_data.loc[ticker, f"{prefix} #{source.documentnumber} Period"]
        # Skip slots with a missing date: the filing date is needed to find
        # the file and the period is stored with every sentence
        if not (is_valid_date(as_of_period) and is_valid_date(observation_period)):
            continue
        sentences = get_all_text(ticker, source.is10K, as_of_period)
        for i,sentence in enumerate(sentences):
            new_row = {"ticker": ticker,
                       'As Of Period': as_of_period, 
                       'Observation Period': observation_period,
                       'Document Type': prefix,
                       'Sentence ID': i, 
                       'Sentence': sentence[:5000]}  # keep at most the first 5000 characters
            rows_list.append(new_row)
sentences_df = pd.DataFrame(rows_list)                  

100%|██████████| 500/500 [2:58:11<00:00, 21.38s/it]   


In [13]:
os.chdir(original_directory)

# Score every sentence, then attach the four scores as whole columns.
# Writing cell by cell with .at inside iterrows() is far slower on a table this size.
score_columns = ['compound', 'neg', 'neu', 'pos']
if len(sentences_df) > 0:
    polarity_rows = [analyzer.polarity_scores(sentence)
                     for sentence in tqdm(sentences_df['Sentence'])]
    polarity_df = pd.DataFrame(polarity_rows, index=sentences_df.index, columns=score_columns)
    for column in score_columns:
        sentences_df[column] = polarity_df[column]

sentences_df.to_csv('sentences_data.csv')
sentences_df.head()

3679838it [26:25, 2320.65it/s]


Unnamed: 0,As Of Period,Document Type,Observation Period,Sentence,Sentence ID,ticker,compound,neg,neu,pos
0,2020-02-06,10-K,2019-12-31,"Common Stock, Par Value $.01 Per S...",0,mmm,0.9246,0.018,0.907,0.076
1,2020-02-06,10-K,2019-12-31,Securities registered pursuant to section 12(g...,1,mmm,0.5267,0.0,0.872,0.128
2,2020-02-06,10-K,2019-12-31,Yes ☒ No ☐ ​ Indicate by check mark ...,2,mmm,0.128,0.071,0.841,0.087
3,2020-02-06,10-K,2019-12-31,Yes ☐ No ☒ ​ Indicate by check mark ...,3,mmm,0.4019,0.031,0.899,0.07
4,2020-02-06,10-K,2019-12-31,Yes ☒ No ☐ ​ Indicate by check mark w...,4,mmm,0.128,0.039,0.914,0.047
