In [370]:
# Importing built-in libraries (no need to install these)
import sys
import re
import os
from time import gmtime, strftime
from datetime import datetime, timedelta
import unicodedata

# Importing libraries you need to install
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm
import glob
import shutil
import re
from dateutil.parser import parse
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from bs4 import NavigableString


In [425]:
with open("SP500_Tickers.csv") as f:
    tickers = [row.split()[0] for row in f]

In [427]:
def MapTickerToCik(tickers):
    url = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    cik_re = re.compile(r'.*CIK=(\d{10}).*')

    cik_dict = {}
    for ticker in tqdm(tickers): # Use tqdm lib for progress bar
        results = cik_re.findall(requests.get(url.format(ticker)).text)
        if len(results):
            cik_dict[str(ticker).lower()] = str(results[0])
    
    return cik_dict

In [428]:
cik_dict = MapTickerToCik(tickers)

100%|██████████| 505/505 [01:54<00:00,  4.42it/s]


In [429]:
cik_dict

{'a': '0001090872',
 'aal': '0000006201',
 'aap': '0001158449',
 'aapl': '0000320193',
 'abbv': '0001551152',
 'abc': '0001140859',
 'abmd': '0000815094',
 'abt': '0000001800',
 'acn': '0001467373',
 'adbe': '0000796343',
 'adi': '0000006281',
 'adm': '0000007084',
 'adp': '0000008670',
 'ads': '0001101215',
 'adsk': '0000769397',
 'aee': '0001002910',
 'aep': '0000004904',
 'aes': '0000874761',
 'afl': '0000004977',
 'aig': '0000005272',
 'aiv': '0000922864',
 'aiz': '0001267238',
 'ajg': '0000354190',
 'akam': '0001086222',
 'alb': '0000915913',
 'algn': '0001097149',
 'alk': '0000766421',
 'all': '0000899051',
 'alle': '0001579241',
 'alxn': '0000899866',
 'amat': '0000006951',
 'amcr': '0001748790',
 'amd': '0000002488',
 'ame': '0001037868',
 'amgn': '0000318154',
 'amp': '0000820027',
 'amt': '0001053507',
 'amzn': '0001018724',
 'anet': '0001596532',
 'anss': '0001013462',
 'antm': '0001156039',
 'aon': '0000315293',
 'aos': '0000091142',
 'apa': '0000006769',
 'apd': '000000296

In [410]:
# Clean up the ticker-CIK mapping as a DataFrame
ticker_cik_df = pd.DataFrame.from_dict(data=cik_dict, orient='index')
ticker_cik_df.reset_index(inplace=True)
ticker_cik_df.columns = ['ticker', 'cik']
ticker_cik_df['cik'] = [str(cik) for cik in ticker_cik_df['cik']]

In [411]:
ticker_cik_df

Unnamed: 0,ticker,cik
0,aapl,320193
1,dal,27904
2,nvda,1045810
3,mar,1048286


In [412]:
def WriteLogFile(log_file_name, text):
    
    '''
    Helper function.
    Writes a log file with all notes and
    error messages from a scraping "session".
    
    Parameters
    ----------
    log_file_name : str
        Name of the log file (should be a .txt file).
    text : str
        Text to write to the log file.
        
    Returns
    -------
    None.
    
    '''
    
    with open(log_file_name, "a") as log_file:
        log_file.write(text)

    return

In [413]:
def ScrapeDocument(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name, is10K):
    
    '''
    Scrapes all 10-Ks and 10-K405s for a particular 
    CIK from EDGAR.
    
    Parameters
    ----------
    browse_url_base : str
        Base URL for browsing EDGAR.
    filing_url_base : str
        Base URL for filings listings on EDGAR.
    doc_url_base : str
        Base URL for one filing's document tables
        page on EDGAR.
    cik : str
        Central Index Key.
    log_file_name : str
        Name of the log file (should be a .txt file).
        
    Returns
    -------
    None.
    
    '''
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(cik)
    except OSError:
        print("Already scraped CIK", cik)
        return
    
    # If we haven't, go into the directory for that CIK
    os.chdir(cik)
    
    print('Scraping CIK', cik)
    
    # Request list of 10-K filings
    res = requests.get(browse_url_base.format(cik))
    
    # If the request failed, log the failure and exit
    if res.status_code != 200:
        os.chdir('..')
        os.rmdir(cik) # remove empty dir
        text = "Request failed with error code " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base.format(cik)) + '\n'
        WriteLogFile(log_file_name, text)
        return

    # If the request doesn't fail, continue...
    
    # Parse the response HTML using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

    # Extract all tables from the response
    html_tables = soup.find_all('table')
    
    # Check that the table we're looking for exists
    # If it doesn't, exit
    if len(html_tables)<3:
        os.chdir('..')
        return
    
    # Parse the Filings table
    filings_table = pd.read_html(str(html_tables[2]), header=0)[0]
    filings_table['Filings'] = [str(x) for x in filings_table['Filings']]

    # Get only 10-K and 10-K405 document filings
    if is10K:
        filings_table = filings_table[(filings_table['Filings'] == '10-K') | (filings_table['Filings'] == '10-K405')]
    else:
        filings_table = filings_table[(filings_table['Filings'] == '10-Q')]
        
    # If filings table doesn't have any
    # 10-Ks or 10-K405s, exit
    if len(filings_table)==0:
        os.chdir('..')
        return
    
    # Get accession number for each 10-K and 10-K405 filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]

    # Iterate through each filing and 
    # scrape the corresponding document...
    for index, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        # print(filing_url_base.format(cik, acc_no))
        
        # Navigate to the page for the filing
        docs_page = requests.get(filing_url_base.format(cik, acc_no))
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base.format(cik, acc_no)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue

        # If request succeeds, keep going...
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables)==0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        # Get the 10-K and 10-K405 entries for the filing
        if is10K:
            docs_table = docs_table[(docs_table['Type'] == '10-K') | (docs_table['Type'] == '10-K405')]
        else:
            docs_table = docs_table[(docs_table['Type'] == '10-Q')]
        # If there aren't any 10-K or 10-K405 entries,
        # skip to the next filing
        if len(docs_table)==0:
            continue
        # If there are 10-K or 10-K405 entries,
        # grab the first document
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = docs_table['Document']
        
        # If that first entry is unavailable,
        # log the failure and exit
        if str(docname) == 'nan':
            os.chdir('..')
            text = 'File with CIK: {} and Acc_No: {} is unavailable'.format(cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue       
        
        # If it is available, continue...
        docname = docname.split()[0]
        # Request the file
        file = requests.get(doc_url_base.format(cik, acc_no.replace('-', ''), docname))
        
        # If the request fails, log the failure and exit
        if file.status_code != 200:
            raise Exception("Fuck")
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base.format(cik, acc_no.replace('-', ''), docname)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue
        
        # If it succeeds, keep going...
        
        # Save the file in appropriate format
        if '.txt' in docname:
            # Save text as TXT
            date = str(row['Filing Date'])
            filename = cik + '_' + date + '.txt'
            html_file = open(filename, 'a')
            html_file.write(file.text)
            html_file.close()
        else:
            # Save text as HTML
            date = str(row['Filing Date'])
            filename = cik + '_' + date + '.html'
            html_file = open(filename, 'a')
            html_file.write(file.text)
            html_file.close()
            
        break
        
    # Move back to the main 10-K directory
    os.chdir('..')
        
    return

In [414]:
def delete_contents(foldername):
    for root, dirs, files in os.walk(foldername):
        for f in files:
            os.unlink(os.path.join(root, f))
        for d in dirs:
            shutil.rmtree(os.path.join(root, d))
        
original_directory = '/Users/andrewwang/MyDocuments/10Q_Scraping'
pathname_10k = original_directory + '/10_K_Docs'
pathname_10q = original_directory + '/10_Q_Docs'

delete_contents(pathname_10k)
delete_contents(pathname_10q)

In [415]:
# Run the function to scrape 10-Ks

# Define parameters
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K'
filing_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10k)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for cik in tqdm(ticker_cik_df['cik']):
    ScrapeDocument(browse_url_base=browse_url_base_10k, 
          filing_url_base=filing_url_base_10k, 
          doc_url_base=doc_url_base_10k, 
          cik=cik,
          log_file_name=log_file_name,
          is10K = True)

  0%|          | 0/4 [00:00<?, ?it/s]

Scraping CIK 0000320193


 25%|██▌       | 1/4 [00:00<00:02,  1.29it/s]

Scraping CIK 0000027904


 50%|█████     | 2/4 [00:01<00:01,  1.39it/s]

Scraping CIK 0001045810


 75%|███████▌  | 3/4 [00:01<00:00,  1.52it/s]

Scraping CIK 0001048286


100%|██████████| 4/4 [00:02<00:00,  1.47it/s]


In [416]:
# Run the function to scrape 10-Qs

# Define parameters
browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-Q&count=1000'
filing_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10q)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for cik in tqdm(ticker_cik_df['cik']):
    ScrapeDocument(browse_url_base=browse_url_base_10q, 
          filing_url_base=filing_url_base_10q, 
          doc_url_base=doc_url_base_10q, 
          cik=cik,
          log_file_name=log_file_name,
          is10K = False)

  0%|          | 0/4 [00:00<?, ?it/s]

Scraping CIK 0000320193


 25%|██▌       | 1/4 [00:00<00:02,  1.49it/s]

Scraping CIK 0000027904


 50%|█████     | 2/4 [00:01<00:01,  1.45it/s]

Scraping CIK 0001045810


 75%|███████▌  | 3/4 [00:01<00:00,  1.51it/s]

Scraping CIK 0001048286


100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


# Scrape text

In [417]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [418]:
def between(cur, end):
    while cur and cur != end:
        if isinstance(cur, NavigableString):
            text = cur.strip()
            if len(text):
                yield text
        cur = cur.next_element

def get_risk_factor_text(ticker, is10K):
    if is10K:
        os.chdir(pathname_10k)
    else:
        os.chdir(pathname_10q)
    cik = cik_dict[ticker]
    os.chdir(cik)
    file_name = os.listdir(".")[0]

    with open(file_name) as file:
        soup = bs.BeautifulSoup(file, "html.parser")
    spans = soup.find_all('span')
    
#     for span in spans:
#         text = span.get_text()
#         pattern = re.compile("(F|f)or the (F|f)iscal (.*) (E|e)nded")
#         if pattern.match(text):
#             date_span = span.find_next('span')
#             date = date_span.get_text()
#             print(date)
#             date = parse(date)
#             break
            
    for span in spans:
        text = span.get_text()
        pattern1A = re.compile("item 1a(.*)")
        if pattern1A.match(text.lower()):
            risk_factor_span = span
        pattern1B = re.compile("item 2(.*)")
        if pattern1B.match(text.lower()):
            staff_comment_span = span
            
    risk_factor_texts = [text for text in between(risk_factor_span, staff_comment_span)]

    full_text = ' '.join(risk_factor_texts)
    sentences = split_into_sentences(full_text)
    os.chdir('../..')
    result = {'sentences': sentences}
    return result
    


In [419]:
os.chdir(original_directory)
get_risk_factor_text('aapl',False)

{'sentences': ['Item 1A.',
  'Risk Factors The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described in Part I, Item 1A of the 2019 Form 10-K under the heading “Risk Factors,” any one or more of which could, directly or indirectly, cause the Company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results.',
  'Any of these factors, in whole or in part, could materially and adversely affect the Company’s business, financial condition, operating results and stock price.',
  'Except as set forth below, there have been no material changes to the Company’s risk factors since the 2019 Form 10-K.',
  'The Company’s business, results of operations, financial condition and stock price have been adversely affected and could in the future be materially adversely affected 

In [420]:
analyzer = SentimentIntensityAnalyzer()


ticker_dict = {}
for ticker,cik in cik_dict.items():
    print(ticker)
    dict_10K = get_risk_factor_text(ticker, is10K = True)
    dict_10Q = get_risk_factor_text(ticker, is10K = False)
    
    dict_to_use = dict_10Q
    sentences = dict_to_use['sentences']
    #print(sentences)
    print(len(sentences))
    sentence_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in sentences}
    print(np.mean(np.asarray([score for sentence,score in sentence_scores.items()])))
    ticker_dict[ticker] = {
        'cik': cik,
        'date': date,
        'sentence_scores': sentence_scores
    }
    print()

#print(ticker_dict)
    
        
    

aapl
21
-0.10584761904761908

dal
57
-0.017238596491228084

nvda
75
-0.1312148648648649

mar


UnboundLocalError: local variable 'risk_factor_span' referenced before assignment

In [None]:
cik_dict