In [30]:
import os
import re

import pandas as pd

from time import time

from bs4 import BeautifulSoup

In [2]:
# Colab-only: mount Google Drive at /content/drive so files stored under
# /content/drive/MyDrive can be read with ordinary file paths.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
def get_documents(text):
    """
    Pull the body of every <DOCUMENT>...</DOCUMENT> section out of `text`.

    Parameters
    ----------
    text : str
        The text with the document strings inside

    Returns
    -------
    extracted_docs : list of str
        The text found between each opening tag and the closing tag paired
        with it, in order of appearance. The tags themselves are excluded.
    """
    opening_tag = re.compile(r'<DOCUMENT>')
    closing_tag = re.compile(r'</DOCUMENT>')

    # A body begins right after an opening tag and stops right before a closing tag
    body_starts = (found.end() for found in opening_tag.finditer(text))
    body_stops = (found.start() for found in closing_tag.finditer(text))

    # The i-th opening tag is paired with the i-th closing tag; zip drops
    # any unpaired leftovers on either side.
    return [text[begin:stop] for begin, stop in zip(body_starts, body_stops)]
    

def get_doc_type(doc):
    """
    Return the document type declared on the document's <TYPE> line.

    Parameters
    ----------
    doc : str
        The document string, expected to contain a line of the form
        ``<TYPE>10-K``

    Returns
    -------
    doc_type : str
        The text following the first ``<TYPE>`` tag up to the end of that
        line, stripped of surrounding whitespace and lower-cased
        (e.g. ``'10-k'``). An empty string is returned when the document has
        no ``<TYPE>`` line, so callers can compare the result safely instead
        of crashing on a malformed document.
    """
    type_match = re.search(r'<TYPE>([^\n]+)', doc)
    if type_match is None:
        return ''
    # strip() removes stray trailing blanks that would defeat an equality test
    return type_match.group(1).strip().lower()

In [3]:
# Folders on the mounted Drive that hold the downloaded filing text files
dir_10Q = '/content/drive/MyDrive/abnormal-distribution-project-data/10-Q/'
dir_10K = '/content/drive/MyDrive/abnormal-distribution-project-data/10-K/'

# List each folder through its own variable so the two file lists cannot
# accidentally point at the same directory.
files_10K = os.listdir(dir_10K)
files_10Q = os.listdir(dir_10Q)

In [14]:
# Case-sensitive pattern for the section headings used to slice a 10-K.
# It matches either
#   * '>Item' + one separator (whitespace, '&#160;' or '&nbsp;') + an item
#     number + an optional trailing period, or
#   * 'ITEM' + one whitespace character + an item number.
# Item numbers are 1A, 1B, 7A, 7 and 8; '7A' is listed before '7' so the
# longer label wins when both could match.
regex_10k = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

In [66]:
start = time()

# filename -> {'1a': text, '7': text, '7a': text} for every successfully parsed filing
dict_10k = {}

# Filenames that could not be parsed, and the reason for each failure
errors = []
error_reasons = {}

for i, filename in enumerate(files_10K):

  if i%100 == 0:
    print(i)
    print("Working on: {}, time so far: {}".format(filename, time()-start))

  # Optional filter (disabled): skip filings whose year field is 2010 or earlier
  #if int(filename.split('-')[3]) <= 2010:
  #  continue

  # A failure in one file is recorded and the loop moves on. Catching
  # `Exception` instead of using a bare `except:` lets KeyboardInterrupt
  # stop the run.
  try:

    # Open the 10-K text file
    with open(dir_10K + filename) as f:
      raw_text = f.read()

    # Find the 10-K document portion of the text file. If none exists, fail
    # explicitly rather than silently parsing some other document.
    docs = get_documents(raw_text)
    doc = next((d for d in docs if get_doc_type(d) == '10-k'), None)
    if doc is None:
      raise ValueError("no document of type '10-k' found in file")

    # Positional df with the heading text and its start/end index in `doc`
    pos_df = pd.DataFrame([(match.group(),
                            match.start(),
                            match.end()) for match in regex_10k.finditer(doc)],
                          columns=['item', 'start', 'end'])

    # Normalise every heading to the form 'item1a': lower-case, then drop
    # HTML non-breaking-space entities, all whitespace (the pattern accepts
    # any whitespace separator, not just ' '), periods and the leading '>'.
    pos_df['item'] = (pos_df['item']
                      .str.lower()
                      .str.replace(r'&#160;|&nbsp;|[\s.>]', '', regex=True))

    # Keep only the last occurrence of each heading in the document
    pos_df = (pos_df.sort_values('start', ascending=True)
                    .drop_duplicates(subset=['item'], keep='last')
                    .set_index('item'))

    # A missing heading raises KeyError below and is recorded as an error.

    # Get Item 1A (runs up to the start of Item 1B)
    item_1a = doc[pos_df['start'].loc['item1a']:pos_df['start'].loc['item1b']]
    item_1a = BeautifulSoup(item_1a, 'lxml').get_text("\n\n")

    # Get Item 7 (runs up to the start of Item 7A)
    item_7 = doc[pos_df['start'].loc['item7']:pos_df['start'].loc['item7a']]
    item_7 = BeautifulSoup(item_7, 'lxml').get_text("\n\n")

    # Get Item 7A (runs up to the start of Item 8)
    item_7a = doc[pos_df['start'].loc['item7a']:pos_df['start'].loc['item8']]
    item_7a = BeautifulSoup(item_7a, 'lxml').get_text("\n\n")

    # Store the parsed document split by sections
    sub_dict = {'1a': item_1a, '7': item_7, '7a': item_7a}
    dict_10k[filename] = sub_dict

  except Exception as exc:

    errors.append(filename)
    error_reasons[filename] = repr(exc)


print(time()-start)



0
Working on: WEC-10-K-2005-03-04.txt, time so far: 0.0067081451416015625
100
Working on: WDC-10-K-2004-09-14.txt, time so far: 239.89579033851624
200
Working on: WLTW-10-K-2013-02-28.txt, time so far: 454.9840703010559
300
Working on: YUM-10-K-2009-02-23.txt, time so far: 623.0250127315521


KeyboardInterrupt: ignored

In [57]:
# Filenames for which the extraction loop stored a sections dictionary
dict_10k.keys()

dict_keys(['WFC-10-K-2020-02-27.txt', 'WFC-10-K-2019-02-27.txt'])

In [130]:

### Preview the first 1500 characters of Item 1A from the last parsed filing.
### `item_1a` is already plain text: the extraction loop strips the HTML tags
### with BeautifulSoup's get_text("\n\n"), so it must not be called again here.
print(item_1a[0:1500])

>Item 1A.

Risk Factors

Please carefully consider the following discussion of significant factors, events, and uncertainties that make an investment in our securities risky. The events and consequences discussed in these risk factors could, in circumstances we may or may not be able to accurately predict, recognize, or control, have a material adverse effect on our business, growth, reputation, prospects, financial condition, operating results (including components of our financial results), cash flows, liquidity, and stock price. These risk factors do not identify all risks that we face; our operations could also be affected by factors, events, or uncertainties that are not presently known to us or that we currently do not consider to present significant risks to our operations. In addition, the global economic climate amplifies many of these risks.

We Face Intense Competition

Our businesses are rapidly evolving and intensely competitive, and we have many competitors across geograp

In [131]:
# `item_7` is already plain text (a str), so it is sliced directly
print(item_7[0:1500])

>Item 7.

Management’s Discussion and Analysis of Financial Condition and Results of Operations

Forward-Looking Statements

This Annual Report on Form 10-K includes forward-looking statements within the meaning of the Private Securities Litigation Reform Act of 1995. All statements other than statements of historical fact, including statements regarding guidance, industry prospects, or future results of operations or financial position, made in this Annual Report on Form 10-K are forward-looking. We use words such as anticipates, believes, expects, future, intends, and similar expressions to identify forward-looking statements. Forward-looking statements reflect management’s current expectations and are inherently uncertain. Actual results could differ materially for a variety of reasons, including, among others, fluctuations in foreign exchange rates, changes in global economic conditions and customer spending, world events, the rate of growth of the Internet, online commerce, and cl

In [132]:
# `item_7a` is already plain text (a str), so it is sliced directly
print(item_7a[0:1500])

>Item 7A.

Quantitative and Qualitative Disclosures About Market Risk

We are exposed to market risk for the effect of interest rate changes, foreign currency fluctuations, and changes in the market values of our investments. Information relating to quantitative and qualitative disclosures about market risk is set forth below and in Item 7 of Part II, “Management’s Discussion and Analysis of Financial Condition and Results of Operations — Liquidity and Capital Resources.”

Interest Rate Risk

Our exposure to market risk for changes in interest rates relates primarily to our investment portfolio and our long-term debt. Our long-term debt is carried at amortized cost and fluctuations in interest rates do not impact our consolidated financial statements. However, the fair value of our debt, which pays interest at a fixed rate, will generally fluctuate with movements of interest rates, increasing in periods of declining rates of interest and declining in periods of increasing rates of inte