In [2]:
# Importing built-in libraries (no need to install these)
import sys
import re
import os
from time import gmtime, strftime
from datetime import datetime, timedelta
import unicodedata

# Importing libraries you need to install
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm
import glob
import shutil
import re
from dateutil.parser import parse
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from bs4 import NavigableString

In [3]:
original_directory = "/Users/andrewwang/MyDocuments/10Q_Scraping"
os.chdir(original_directory)

with open("SP500_Tickers.csv") as f:
    tickers = [row.split()[0] for row in f]

In [4]:
def MapTickerToCik(tickers):
    url = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    cik_re = re.compile(r'.*CIK=(\d{10}).*')

    cik_dict = {}
    for ticker in tqdm(tickers): # Use tqdm lib for progress bar
        results = cik_re.findall(requests.get(url.format(ticker)).text)
        if len(results):
            cik_dict[str(ticker).lower()] = str(results[0])
    
    return cik_dict

In [5]:
cik_dict = MapTickerToCik(tickers)

100%|██████████| 505/505 [02:36<00:00,  3.23it/s]


In [6]:
# Clean up the ticker-CIK mapping as a DataFrame
ticker_cik_df = pd.DataFrame.from_dict(data=cik_dict, orient='index')
ticker_cik_df.reset_index(inplace=True)
ticker_cik_df.columns = ['ticker', 'cik']
ticker_cik_df['cik'] = [str(cik) for cik in ticker_cik_df['cik']]
ticker_cik_df = ticker_cik_df.set_index('ticker')
ticker_cik_df.head()

Unnamed: 0_level_0,cik
ticker,Unnamed: 1_level_1
mmm,66740
abt,1800
abbv,1551152
abmd,815094
acn,1467373


In [7]:
def WriteLogFile(log_file_name, text):
    
    '''
    Helper function.
    Writes a log file with all notes and
    error messages from a scraping "session".
    
    Parameters
    ----------
    log_file_name : str
        Name of the log file (should be a .txt file).
    text : str
        Text to write to the log file.
        
    Returns
    -------
    None.
    
    '''
    
    with open(log_file_name, "a") as log_file:
        log_file.write(text)

    return

In [8]:
def ScrapeDocument(ticker, browse_url_base, filing_url_base, doc_url_base, cik, log_file_name, is10K, num_files_to_scrape):
    
    '''
    Scrapes all 10-Ks and 10-K405s for a particular 
    CIK from EDGAR.
    
    Parameters
    ----------
    browse_url_base : str
        Base URL for browsing EDGAR.
    filing_url_base : str
        Base URL for filings listings on EDGAR.
    doc_url_base : str
        Base URL for one filing's document tables
        page on EDGAR.
    cik : str
        Central Index Key.
    log_file_name : str
        Name of the log file (should be a .txt file).
        
    Returns
    -------
    None.
    
    '''
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(ticker)
    except OSError:
        text = f"Already made folder for ticker {ticker}"
        WriteLogFile(log_file_name, text)
    
    # If we haven't, go into the directory for that CIK
    os.chdir(ticker)
        
    # Request list of 10-K filings
    res = requests.get(browse_url_base.format(cik))
    
    # If the request failed, log the failure and exit
    if res.status_code != 200:
        os.chdir('..')
        os.rmdir(cik) # remove empty dir
        text = "Request failed with error code " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base.format(cik)) + '\n'
        WriteLogFile(log_file_name, text)
        return

    # If the request doesn't fail, continue...
    
    # Parse the response HTML using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

    # Extract all tables from the response
    html_tables = soup.find_all('table')
    
    # Check that the table we're looking for exists
    # If it doesn't, exit
    if len(html_tables)<3:
        os.chdir('..')
        return
    
    # Parse the Filings table
    filings_table = pd.read_html(str(html_tables[2]), header=0)[0]
    filings_table['Filings'] = [str(x) for x in filings_table['Filings']]

    # Get only 10-K and 10-K405 document filings
    if is10K:
        filings_table = filings_table[(filings_table['Filings'] == '10-K') | (filings_table['Filings'] == '10-K405')]
    else:
        filings_table = filings_table[(filings_table['Filings'] == '10-Q')]
        
    # If filings table doesn't have any
    # 10-Ks or 10-K405s, exit
    if len(filings_table)==0:
        os.chdir('..')
        return
    
    # Get accession number for each 10-K and 10-K405 filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]

    num_files_scraped = 0
    
    # Iterate through each filing and 
    # scrape the corresponding document...
    for index, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        docs_page = requests.get(filing_url_base.format(cik, acc_no))
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base.format(cik, acc_no)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue

        # If request succeeds, keep going...
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        
        filing_date_div = docs_page_soup.find(text=re.compile("Filing (D|d)ate")).parent
        filing_date = filing_date_div.findNext('div').get_text()
        period_of_report_div = docs_page_soup.find(text=re.compile("Period (O|o)f (R|r)eport")).parent
        period_of_report_date = period_of_report_div.findNext('div').get_text()
        
        if is10K:
            ticker_cik_df.at[ticker, f'10-K #{num_files_scraped + 1} Filing Date'] = filing_date
            ticker_cik_df.at[ticker, f'10-K #{num_files_scraped + 1} Period'] = period_of_report_date
        else:
            ticker_cik_df.at[ticker, f'10-Q #{num_files_scraped + 1} Filing Date'] = filing_date
            ticker_cik_df.at[ticker, f'10-Q #{num_files_scraped + 1} Period'] = period_of_report_date
        
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables)==0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        # Get the 10-K and 10-K405 entries for the filing
        if is10K:
            docs_table = docs_table[(docs_table['Type'] == '10-K') | (docs_table['Type'] == '10-K405')]
        else:
            docs_table = docs_table[(docs_table['Type'] == '10-Q')]
        # If there aren't any 10-K or 10-K405 entries,
        # skip to the next filing
        if len(docs_table)==0:
            continue
        # If there are 10-K or 10-K405 entries,
        # grab the first document
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = docs_table['Document']
        
        # If that first entry is unavailable,
        # log the failure and exit
        if str(docname) == 'nan':
            os.chdir('..')
            text = 'File with CIK: {} and Acc_No: {} is unavailable'.format(cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue       
        
        # If it is available, continue...
        docname = docname.split()[0]
        # Request the file
        file = requests.get(doc_url_base.format(cik, acc_no.replace('-', ''), docname))
        
        # If the request fails, log the failure and exit
        if file.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base.format(cik, acc_no.replace('-', ''), docname)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue
        
        # If it succeeds, keep going...
        
        # Save the file in appropriate format
        if '.txt' in docname:
            # Save text as TXT
            date = str(row['Filing Date'])
            filename = cik + '_' + date + '.txt'
            html_file = open(filename, 'w')
            html_file.write(file.text)
            html_file.close()
        else:
            # Save text as HTML
            date = str(row['Filing Date'])
            filename = cik + '_' + date + '.html'
            html_file = open(filename, 'w')
            html_file.write(file.text)
            html_file.close()
           
        num_files_scraped = num_files_scraped + 1
        
        if num_files_scraped == num_files_to_scrape:
            break
        
    # Move back to the main 10-K directory
    os.chdir('..')
        
    return

In [10]:
pathname_10k = original_directory + '/10_K_Docs'
pathname_10q = original_directory + '/10_Q_Docs'

In [22]:
def delete_contents(foldername):
    for root, dirs, files in os.walk(foldername):
        for f in files:
            os.unlink(os.path.join(root, f))
        for d in dirs:
            shutil.rmtree(os.path.join(root, d))
        


delete_contents(pathname_10k)
delete_contents(pathname_10q)

In [None]:
os.chdir(original_directory)
# Run the function to scrape 10-K
# Define parameters
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K'
filing_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10k)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for ticker,row in tqdm(ticker_cik_df.iterrows()):
    ScrapeDocument(ticker=ticker,
                   browse_url_base=browse_url_base_10k, 
                   filing_url_base=filing_url_base_10k, 
                   doc_url_base=doc_url_base_10k, 
                   cik=row['cik'],
                   log_file_name=log_file_name,
                   is10K = True,
                   num_files_to_scrape = 1)
os.chdir(original_directory)

265it [04:48,  1.01s/it]

In [None]:
os.chdir(original_directory)
# Run the function to scrape 10-Qs
# Define parameters
browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-Q&count=1000'
filing_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10q)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for ticker,row in tqdm(ticker_cik_df.iterrows()):
    ScrapeDocument(ticker=ticker,
                   browse_url_base=browse_url_base_10q, 
                   filing_url_base=filing_url_base_10q, 
                   doc_url_base=doc_url_base_10q, 
                   cik=row['cik'],
                   log_file_name=log_file_name,
                   is10K = False,
                   num_files_to_scrape = 3)
    
os.chdir(original_directory)

503it [26:55,  3.21s/it]


In [1]:
ticker_cik_df.head()
ticker_cik_df.to_csv('ticker_data.csv')

NameError: name 'ticker_cik_df' is not defined

# Scrape text

In [417]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [712]:
def between(cur, end):
    while cur and cur != end:
        if isinstance(cur, NavigableString):
            text = cur.strip()
            if len(text):
                yield text
        cur = cur.next_element

def get_risk_factor_text(ticker, is10K):
    os.chdir(original_directory)
    if is10K:
        os.chdir(pathname_10k)
    else:
        os.chdir(pathname_10q)
    cik = cik_dict[ticker]
    os.chdir(cik)
    file_name = os.listdir(".")[0]

    with open(file_name) as file:
        soup = bs.BeautifulSoup(file, "html.parser")
    spans = soup.find_all('span')

    risk_factor_span = None
    after_risk_factor_span = None
    
    for span in spans:
        text = span.get_text()
        pattern1A = re.compile("item 1a(.*)")
        if pattern1A.match(text.lower()):
            risk_factor_span = span
        pattern2 = re.compile("item 2(.*)")
        if pattern2.match(text.lower()):
            after_risk_factor_span = span
            
    if not risk_factor_span:
        return []
            
    risk_factor_texts = [text.lower() for text in between(risk_factor_span, after_risk_factor_span)]

    if len(risk_factor_texts) >= 2:
        risk_factor_texts = risk_factor_texts[1:-1]
    
    full_text = ' '.join(risk_factor_texts)
    sentences = split_into_sentences(full_text)
    os.chdir(original_directory)
    return sentences
    


In [717]:
os.chdir(original_directory)

analyzer = SentimentIntensityAnalyzer()

negative_score = -4.0
positive_score = 0.0
uncertain_score = -2.0

negative_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Negative.csv', header=None).iloc[:,0]
positive_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Positive.csv', header=None).iloc[:,0]
uncertain_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Uncertain.csv', header=None).iloc[:,0]

negative_word_scores = {word.lower(): negative_score for word in negative_words}
positive_word_scores = {word.lower(): positive_score for word in positive_words}
uncertain_word_scores = {word.lower(): uncertain_score for word in uncertain_words}

financial_word_dict = {**negative_word_scores, **positive_word_scores, **uncertain_word_scores}
analyzer.lexicon.update(financial_word_dict)

In [748]:
# AAPL
aapl_10k = get_risk_factor_text('aapl', is10K = True)
aapl_text = get_risk_factor_text('aapl', is10K = False)
dal_10k = get_risk_factor_text('dal', is10K = True)
dal_text = get_risk_factor_text('dal', is10K = False)
goog_10k = get_risk_factor_text('hlt', is10K = True)
goog_text = get_risk_factor_text('hlt', is10K = False)

aapl_scores_10k = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in aapl_10k}
aapl_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in aapl_text}
dal_scores_10k = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in dal_10k}
dal_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in dal_text}
goog_scores_10k = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in goog_10k}
goog_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in goog_text}

print(np.mean(np.asarray(list(aapl_scores_10k.values()))))
print(np.mean(np.asarray(list(aapl_scores.values()))))
print(np.mean(np.asarray(list(dal_scores_10k.values()))))
print(np.mean(np.asarray(list(dal_scores.values()))))
print(np.mean(np.asarray(list(goog_scores_10k.values()))))
print(np.mean(np.asarray(list(goog_scores.values()))))



-0.4563825925925926
-0.5405285714285714
-0.4442688888888889
-0.4400178571428571
-0.48643799126637555
-0.7649075


In [743]:
np.mean(np.asarray([val for val in list(goog_scores.values()) if val < 0 ]))
np.mean(np.asarray([val for val in list(goog_scores_10k.values()) if val < 0 ]))

-0.7348113821138211

In [695]:
analyzer.polarity_scores('There is no assuring that our efforts to obtain such an amendment or waiver would be successful')

{'compound': 0.3851, 'neg': 0.104, 'neu': 0.715, 'pos': 0.181}

In [703]:
analyzer.lexicon['default']

-4.0

In [751]:
ticker_cik_df.loc['aapl']

cik                 0000320193
10-K Filing Date    2020-02-13
10-K Period         2019-12-31
10-Q Filing Date    2020-05-06
10-Q Period         2020-03-31
Name: aapl, dtype: object

In [752]:
-2 >> 1

-1