In [9]:
# Importing built-in libraries (no need to install these)
import sys
import re
import os
from time import gmtime, strftime
from datetime import datetime, timedelta
import unicodedata

# Importing libraries you need to install
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm
import glob
import shutil
import re
from dateutil.parser import parse
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from bs4 import NavigableString
import html2text
from nltk import tokenize
from collections import namedtuple

In [10]:
original_directory = "/Users/andrewwang/MyDocuments/10Q_Scraping"
os.chdir(original_directory)
pathname_10k = original_directory + '/10_K_Docs'
pathname_10q = original_directory + '/10_Q_Docs'

In [10]:
with open("SP500_Tickers.csv") as f:
    tickers = [row.split()[0] for row in f]

In [11]:
def MapTickerToCik(tickers):
    url = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    cik_re = re.compile(r'.*CIK=(\d{10}).*')

    cik_dict = {}
    for ticker in tqdm(tickers): # Use tqdm lib for progress bar
        results = cik_re.findall(requests.get(url.format(ticker)).text)
        if len(results):
            cik_dict[str(ticker).lower()] = str(results[0])
    
    return cik_dict

In [12]:
cik_dict = MapTickerToCik(tickers)

100%|██████████| 505/505 [02:39<00:00,  3.17it/s]


In [6]:
# Clean up the ticker-CIK mapping as a DataFrame
ticker_cik_df = pd.DataFrame.from_dict(data=cik_dict, orient='index')
ticker_cik_df.reset_index(inplace=True)
ticker_cik_df.columns = ['ticker', 'cik']
ticker_cik_df['cik'] = [str(cik) for cik in ticker_cik_df['cik']]
ticker_cik_df = ticker_cik_df.set_index('ticker')
ticker_cik_df.head()

Unnamed: 0_level_0,cik
ticker,Unnamed: 1_level_1
mmm,66740
abt,1800
abbv,1551152
abmd,815094
acn,1467373


In [7]:
def WriteLogFile(log_file_name, text):
    
    '''
    Helper function.
    Writes a log file with all notes and
    error messages from a scraping "session".
    
    Parameters
    ----------
    log_file_name : str
        Name of the log file (should be a .txt file).
    text : str
        Text to write to the log file.
        
    Returns
    -------
    None.
    
    '''
    
    with open(log_file_name, "a") as log_file:
        log_file.write(text)

    return

In [8]:
def ScrapeDocument(ticker, browse_url_base, filing_url_base, doc_url_base, cik, log_file_name, is10K, num_files_to_scrape):
    
    '''
    Scrapes all 10-Ks and 10-K405s for a particular 
    CIK from EDGAR.
    
    Parameters
    ----------
    browse_url_base : str
        Base URL for browsing EDGAR.
    filing_url_base : str
        Base URL for filings listings on EDGAR.
    doc_url_base : str
        Base URL for one filing's document tables
        page on EDGAR.
    cik : str
        Central Index Key.
    log_file_name : str
        Name of the log file (should be a .txt file).
        
    Returns
    -------
    None.
    
    '''
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(ticker)
    except OSError:
        text = f"Already made folder for ticker {ticker}"
        WriteLogFile(log_file_name, text)
    
    # If we haven't, go into the directory for that CIK
    os.chdir(ticker)
        
    # Request list of 10-K filings
    res = requests.get(browse_url_base.format(cik))
    
    # If the request failed, log the failure and exit
    if res.status_code != 200:
        os.chdir('..')
        os.rmdir(cik) # remove empty dir
        text = "Request failed with error code " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base.format(cik)) + '\n'
        WriteLogFile(log_file_name, text)
        return

    # If the request doesn't fail, continue...
    
    # Parse the response HTML using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

    # Extract all tables from the response
    html_tables = soup.find_all('table')
    
    # Check that the table we're looking for exists
    # If it doesn't, exit
    if len(html_tables)<3:
        os.chdir('..')
        return
    
    # Parse the Filings table
    filings_table = pd.read_html(str(html_tables[2]), header=0)[0]
    filings_table['Filings'] = [str(x) for x in filings_table['Filings']]

    # Get only 10-K and 10-K405 document filings
    if is10K:
        filings_table = filings_table[(filings_table['Filings'] == '10-K') | (filings_table['Filings'] == '10-K405')]
    else:
        filings_table = filings_table[(filings_table['Filings'] == '10-Q')]
        
    # If filings table doesn't have any
    # 10-Ks or 10-K405s, exit
    if len(filings_table)==0:
        os.chdir('..')
        return
    
    # Get accession number for each 10-K and 10-K405 filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]

    num_files_scraped = 0
    
    # Iterate through each filing and 
    # scrape the corresponding document...
    for index, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        docs_page = requests.get(filing_url_base.format(cik, acc_no))
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base.format(cik, acc_no)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue

        # If request succeeds, keep going...
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        
        filing_date_div = docs_page_soup.find(text=re.compile("Filing (D|d)ate")).parent
        filing_date = filing_date_div.findNext('div').get_text()
        period_of_report_div = docs_page_soup.find(text=re.compile("Period (O|o)f (R|r)eport")).parent
        period_of_report_date = period_of_report_div.findNext('div').get_text()
        
        if is10K:
            ticker_cik_df.at[ticker, f'10-K #{num_files_scraped + 1} Filing Date'] = filing_date
            ticker_cik_df.at[ticker, f'10-K #{num_files_scraped + 1} Period'] = period_of_report_date
        else:
            ticker_cik_df.at[ticker, f'10-Q #{num_files_scraped + 1} Filing Date'] = filing_date
            ticker_cik_df.at[ticker, f'10-Q #{num_files_scraped + 1} Period'] = period_of_report_date
        
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables)==0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        # Get the 10-K and 10-K405 entries for the filing
        if is10K:
            docs_table = docs_table[(docs_table['Type'] == '10-K') | (docs_table['Type'] == '10-K405')]
        else:
            docs_table = docs_table[(docs_table['Type'] == '10-Q')]
        # If there aren't any 10-K or 10-K405 entries,
        # skip to the next filing
        if len(docs_table)==0:
            continue
        # If there are 10-K or 10-K405 entries,
        # grab the first document
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = docs_table['Document']
        
        # If that first entry is unavailable,
        # log the failure and exit
        if str(docname) == 'nan':
            os.chdir('..')
            text = 'File with CIK: {} and Acc_No: {} is unavailable'.format(cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue       
        
        # If it is available, continue...
        docname = docname.split()[0]
        # Request the file
        file = requests.get(doc_url_base.format(cik, acc_no.replace('-', ''), docname))
        
        # If the request fails, log the failure and exit
        if file.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base.format(cik, acc_no.replace('-', ''), docname)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue
        
        # If it succeeds, keep going...
        
        # Save the file in appropriate format
        if '.txt' in docname:
            # Save text as TXT
            date = str(row['Filing Date'])
            filename = cik + '_' + date + '.txt'
            html_file = open(filename, 'w')
            html_file.write(file.text)
            html_file.close()
        else:
            # Save text as HTML
            date = str(row['Filing Date'])
            filename = cik + '_' + date + '.html'
            html_file = open(filename, 'w')
            html_file.write(file.text)
            html_file.close()
           
        num_files_scraped = num_files_scraped + 1
        
        if num_files_scraped == num_files_to_scrape:
            break
        
    # Move back to the main 10-K directory
    os.chdir('..')
        
    return

In [22]:
def delete_contents(foldername):
    for root, dirs, files in os.walk(foldername):
        for f in files:
            os.unlink(os.path.join(root, f))
        for d in dirs:
            shutil.rmtree(os.path.join(root, d))
        


delete_contents(pathname_10k)
delete_contents(pathname_10q)

In [10]:
os.chdir(original_directory)
# Run the function to scrape 10-K
# Define parameters
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K'
filing_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10k)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for ticker,row in tqdm(ticker_cik_df.iterrows()):
    ScrapeDocument(ticker=ticker,
                   browse_url_base=browse_url_base_10k, 
                   filing_url_base=filing_url_base_10k, 
                   doc_url_base=doc_url_base_10k, 
                   cik=row['cik'],
                   log_file_name=log_file_name,
                   is10K = True,
                   num_files_to_scrape = 1)
os.chdir(original_directory)

503it [08:56,  1.07s/it]


In [11]:
os.chdir(original_directory)
# Run the function to scrape 10-Qs
# Define parameters
browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-Q&count=1000'
filing_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}-index.html'
doc_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/{}/{}/{}'

# Set correct directory
os.chdir(pathname_10q)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for ticker,row in tqdm(ticker_cik_df.iterrows()):
    ScrapeDocument(ticker=ticker,
                   browse_url_base=browse_url_base_10q, 
                   filing_url_base=filing_url_base_10q, 
                   doc_url_base=doc_url_base_10q, 
                   cik=row['cik'],
                   log_file_name=log_file_name,
                   is10K = False,
                   num_files_to_scrape = 3)
    
os.chdir(original_directory)

503it [20:38,  2.46s/it]


In [12]:
ticker_cik_df.to_csv('ticker_data.csv')
ticker_cik_df.head()

Unnamed: 0_level_0,cik,10-K #1 Filing Date,10-K #1 Period,10-Q #1 Filing Date,10-Q #1 Period,10-Q #2 Filing Date,10-Q #2 Period,10-Q #3 Filing Date,10-Q #3 Period
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
mmm,66740,2020-02-06,2019-12-31,2020-04-28,2020-03-31,2019-10-25,2019-09-30,2019-07-26,2019-06-30
abt,1800,2020-02-21,2019-12-31,2020-04-29,2020-03-31,2019-10-31,2019-09-30,2019-07-31,2019-06-30
abbv,1551152,2020-02-21,2019-12-31,2020-05-08,2020-03-31,2019-11-06,2019-09-30,2019-08-05,2019-06-30
abmd,815094,2020-05-21,2020-03-31,2020-02-06,2019-12-31,2019-10-31,2019-09-30,2019-08-01,2019-06-30
acn,1467373,2019-10-29,2019-08-31,2020-03-19,2020-02-29,2019-12-19,2019-11-30,2019-06-27,2019-05-31


# Scrape text

In [6]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [43]:
def between(cur, end):
    while cur and cur != end:
        if isinstance(cur, NavigableString):
            text = cur.strip()
            if len(text):
                yield text
        cur = cur.next_element

def get_risk_factor_text(ticker, is10K, files_from_end = 0):
    os.chdir(original_directory)
    if is10K:
        os.chdir(pathname_10k)
    else:
        os.chdir(pathname_10q)
    cik = cik_dict[ticker]
    os.chdir(ticker)
    file_list = sorted(os.listdir("."))
    file_name = file_list[len(file_list) - 1 - files_from_end]

    with open(file_name) as file:
        soup = bs.BeautifulSoup(file, "html.parser")
    spans = soup.find_all('span')

    risk_factor_span = None
    after_risk_factor_span = None
    
    for span in spans:
        text = span.get_text()
        pattern1A = re.compile("item 1a(.*)")
        if pattern1A.match(text.lower()):
            risk_factor_span = span
        pattern2 = re.compile("item 2(.*)")
        if pattern2.match(text.lower()):
            after_risk_factor_span = span
            
    if not risk_factor_span:
        return []
            
    risk_factor_texts = [text.lower() for text in between(risk_factor_span, after_risk_factor_span)]

    if len(risk_factor_texts) >= 2:
        risk_factor_texts = risk_factor_texts[1:-1]
    
    full_text = ' '.join(risk_factor_texts)
    sentences = split_into_sentences(full_text)
    os.chdir(original_directory)
    return sentences
    


In [29]:
os.chdir(original_directory)

analyzer = SentimentIntensityAnalyzer()

negative_score = -4.0
positive_score = 0.0
uncertain_score = -2.0

negative_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Negative.csv', header=None).iloc[:,0]
positive_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Positive.csv', header=None).iloc[:,0]
uncertain_words = pd.read_csv('LoughranMcDonald_SentimentWordLists_Uncertain.csv', header=None).iloc[:,0]

negative_word_scores = {word.lower(): negative_score for word in negative_words}
positive_word_scores = {word.lower(): positive_score for word in positive_words}
uncertain_word_scores = {word.lower(): uncertain_score for word in uncertain_words}

financial_word_dict = {**negative_word_scores, **positive_word_scores, **uncertain_word_scores}
analyzer.lexicon.update(financial_word_dict)

In [50]:
def print_scores(company, date, scores):
    sum_score = np.sum(np.asarray(list(scores.values())))
    print(f"Sentiment scores for {company} on {date}")
    print(f"Sum score = {sum_score}")
    print("-------------------------------------------")
    for sentence, score in scores.items():
        print(f"{sentence} = {score}")
    print()
    

In [51]:
aapl_2019_Q2_text = get_risk_factor_text('aapl', is10K = False, files_from_end = 2)
aapl_2019_Q3_text = get_risk_factor_text('aapl', is10K = False, files_from_end = 1)
aapl_2019_Q4_text = get_risk_factor_text('aapl', is10K = True)
aapl_2020_Q1_text = get_risk_factor_text('aapl', is10K = False)

aapl_2019_Q2_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in aapl_2019_Q2_text}
aapl_2019_Q3_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in aapl_2019_Q3_text}
aapl_2019_Q4_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in aapl_2019_Q4_text}
aapl_2020_Q1_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in aapl_2020_Q1_text}

print_scores("aapl", "2019-Q2", aapl_2019_Q2_scores)
print_scores("aapl", "2019-Q3", aapl_2019_Q3_scores)
print_scores("aapl", "2019-Q4", aapl_2019_Q4_scores)
print_scores("aapl", "2020-Q1", aapl_2020_Q1_scores)

Sentiment scores for aapl on 2019-Q2
Sum score = -127.3578
-------------------------------------------
risk factors the following description of risk factors includes any material changes to, and supersedes the description of, risk factors associated with the company’s business previously disclosed in part i, item 1a of the 2018 form 10-k and in part ii, item 1a of the forms 10-q for the quarters ended december 29, 2018 and march 30, 2019, in each case under the heading “risk factors”. = -0.9325
the business, financial condition and operating results of the company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results. = -0.9266
any of these factors, in whole or in part, could materially and 

effective march 13, 2020, the company temporarily closed all of its retail stores outside of china. = -0.4404
the company has also required substantially all of its employees in all of its offices outside of china to work remotely. = 0.0
additionally, many of the company’s channel partner points of sale outside of china temporarily closed. = -0.7184
as a result, the company also experienced weakened demand for its products and services outside of china during the last three weeks of the quarter. = -0.7579
the covid-19 pandemic has continued to adversely impact demand for certain of the company’s products and services through april 2020. = -0.6597
the company is continuing to monitor the situation and take appropriate actions in accordance with the recommendations and requirements of relevant authorities. = 0.0
the full extent of the impact of the covid-19 pandemic on the company’s operational and financial performance is currently uncertain and will depend on many factors outside the c

In [52]:
dal_2019_Q2_text = get_risk_factor_text('dal', is10K = False, files_from_end = 2)
dal_2019_Q3_text = get_risk_factor_text('dal', is10K = False, files_from_end = 1)
dal_2019_Q4_text = get_risk_factor_text('dal', is10K = True)
dal_2020_Q1_text = get_risk_factor_text('dal', is10K = False)

dal_2019_Q2_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in dal_2019_Q2_text}
dal_2019_Q3_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in dal_2019_Q3_text}
dal_2019_Q4_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in dal_2019_Q4_text}
dal_2020_Q1_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in dal_2020_Q1_text}

print_scores("dal", "2019-Q2", dal_2019_Q2_scores)
print_scores("dal", "2019-Q3", dal_2019_Q3_scores)
print_scores("dal", "2019-Q4", dal_2019_Q4_scores)
print_scores("dal", "2020-Q1", dal_2020_Q1_scores)

Sentiment scores for dal on 2019-Q2
Sum score = -1.3553000000000002
-------------------------------------------
“item 1a. = 0.0
risk factors” of our form 10-k includes a discussion of our risk factors. = -0.7184
there have been no material changes from the risk factors described in our form 10-k. = -0.6369

Sentiment scores for dal on 2019-Q3
Sum score = -1.3553000000000002
-------------------------------------------
“item 1a. = 0.0
risk factors” of our form 10-k includes a discussion of our risk factors. = -0.7184
there have been no material changes from the risk factors described in our form 10-k. = -0.6369

Sentiment scores for dal on 2019-Q4
Sum score = -99.9605
-------------------------------------------
risk factors relating to delta we are at risk of losses and adverse publicity stemming from a serious accident involving our aircraft or aircraft of our airline partners. = -0.9818
an aircraft crash or other serious accident could expose us to significant liability. = -0.9709
alth

In [53]:
hlt_2019_Q2_text = get_risk_factor_text('hlt', is10K = False, files_from_end = 2)
hlt_2019_Q3_text = get_risk_factor_text('hlt', is10K = False, files_from_end = 1)
hlt_2019_Q4_text = get_risk_factor_text('hlt', is10K = True)
hlt_2020_Q1_text = get_risk_factor_text('hlt', is10K = False)

hlt_2019_Q2_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in hlt_2019_Q2_text}
hlt_2019_Q3_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in hlt_2019_Q3_text}
hlt_2019_Q4_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in hlt_2019_Q4_text}
hlt_2020_Q1_scores = {sentence: analyzer.polarity_scores(sentence)['compound'] for sentence in hlt_2020_Q1_text}

print_scores("hlt", "2019-Q2", hlt_2019_Q2_scores)
print_scores("hlt", "2019-Q3", hlt_2019_Q3_scores)
print_scores("hlt", "2019-Q4", hlt_2019_Q4_scores)
print_scores("hlt", "2020-Q1", hlt_2020_Q1_scores)

Sentiment scores for hlt on 2019-Q2
Sum score = -0.8807
-------------------------------------------
as of june 30, 2019 , there have been no material changes from the risk factors previously disclosed in response to "part i —item 1a. = -0.8807

Sentiment scores for hlt on 2019-Q3
Sum score = 0.0
-------------------------------------------

Sentiment scores for hlt on 2019-Q4
Sum score = -222.7886
-------------------------------------------
in addition to the other information in this annual report on form 10-k, the following risk factors should be considered carefully in evaluating our company and our business. = -0.3612
risks related to our business and industry we are subject to the business, financial and operating risks inherent to the hospitality industry, any of which could reduce our revenues and limit opportunities for growth. = -0.5859
our business is subject to a number of business, financial and operating risks inherent to the hospitality industry, including: • significant c

In [11]:
def get_all_text(ticker, is10K, file_name):
    os.chdir(original_directory)
    if is10K:
        os.chdir(pathname_10k)
    else:
        os.chdir(pathname_10q)
    os.chdir(ticker)
    full_file_name = list(filter(lambda x: x.endswith(file_name + ".html"), sorted(os.listdir("."))))[0]

    with open(full_file_name) as file:
        soup = bs.BeautifulSoup(file, "html.parser")
    [table.decompose() for table in soup.find_all("table")]
    
    cleaned = soup.get_text('\n').replace('\n', ' ')
    os.chdir(original_directory)
    return tokenize.sent_tokenize(cleaned)

In [12]:
ticker_data = pd.read_csv('ticker_data.csv')
ticker_data = ticker_data.set_index("ticker")
ticker_data.head()

Unnamed: 0_level_0,cik,10-K #1 Filing Date,10-K #1 Period,10-Q #1 Filing Date,10-Q #1 Period,10-Q #2 Filing Date,10-Q #2 Period,10-Q #3 Filing Date,10-Q #3 Period
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
mmm,66740,2020-02-06,2019-12-31,2020-04-28,2020-03-31,2019-10-25,2019-09-30,2019-07-26,2019-06-30
abt,1800,2020-02-21,2019-12-31,2020-04-29,2020-03-31,2019-10-31,2019-09-30,2019-07-31,2019-06-30
abbv,1551152,2020-02-21,2019-12-31,2020-05-08,2020-03-31,2019-11-06,2019-09-30,2019-08-05,2019-06-30
abmd,815094,2020-05-21,2020-03-31,2020-02-06,2019-12-31,2019-10-31,2019-09-30,2019-08-01,2019-06-30
acn,1467373,2019-10-29,2019-08-31,2020-03-19,2020-02-29,2019-12-19,2019-11-30,2019-06-27,2019-05-31


In [16]:
companies = ["aapl", "wmt", "ko", "axp", "aapl", "pg", "googl", "amd", "vz", "ed", "yum", "lb", "mar", "dri", "gm", "cat", "xom", "nke", "mgm", "rcl"]
rows_list = []
DataSource = namedtuple("DataSource", ["is10K", "documentnumber"])
datasources = [DataSource(True, 1), DataSource(False, 1), DataSource(False, 2), DataSource(False, 3)]
rows_list = []
for ticker in companies:
    for source in datasources:
        prefix = "10-K" if source.is10K else "10-Q"
        as_of_period = ticker_data.loc[ticker, f"{prefix} #{source.documentnumber} Filing Date"]
        observation_period = ticker_data.loc[ticker, f"{prefix} #{source.documentnumber} Period"]
        sentences = get_all_text(ticker, source.is10K, as_of_period)
        for i,sentence in enumerate(sentences):
            new_row = {"ticker": ticker,
                       'As Of Period': as_of_period, 
                       'Observation Period': observation_period,
                       'Document Type': prefix,
                       'Sentence ID': i, 
                       'Sentence': sentence}
            rows_list.append(new_row)
    print(ticker)
sentences_df = pd.DataFrame(rows_list)                  

aapl
wmt
ko
axp
aapl
pg
googl
amd
vz
ed
yum
lb
mar
dri
gm
cat
xom
nke
mgm
rcl


In [30]:
analyzer = SentimentIntensityAnalyzer()
for i, row in sentences_df.iterrows():
    polarity_scores = analyzer.polarity_scores(row['Sentence'])
    sentences_df.at[i, 'compound'] = polarity_scores['compound']
    sentences_df.at[i, 'neg'] = polarity_scores['neg']
    sentences_df.at[i, 'neu'] = polarity_scores['neu']
    sentences_df.at[i, 'pos'] = polarity_scores['pos']
sentences_df.to_csv('sentences_data.csv')

In [31]:
sentences_df.head()

Unnamed: 0,As Of Period,Document Type,Observation Period,Sentence,Sentence ID,ticker,compound,neg,neu,pos
0,2019-10-31,10-K,2019-09-28,Document 2100000000 2700...,0,aapl,0.9068,0.0,0.991,0.009
1,2019-10-31,10-K,2019-09-28,Commission File Number: 001-36743 Apple Inc. ...,1,aapl,0.7351,0.0,0.867,0.133
2,2019-10-31,10-K,2019-09-28,Yes ☒ No ☐ Indicate by check mark if ...,2,aapl,0.128,0.074,0.836,0.09
3,2019-10-31,10-K,2019-09-28,Yes ☐ No ☒ Indicate by check mark wh...,3,aapl,0.4019,0.032,0.897,0.071
4,2019-10-31,10-K,2019-09-28,Yes ☒ No ☐ Indicate by check mark whe...,4,aapl,0.128,0.04,0.911,0.049
