In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
time.sleep(10)

In [2]:
import requests
import requests_random_user_agent

s = requests.Session()
print(s.headers['User-Agent'])

Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36


In [3]:
# Item 16 not present
#r = requests.get('https://www.sec.gov/Archives/edgar/data/1288776/000165204416000012/goog10-k2015.htm')

# doc_type = 10-K not found, use table info for finding summary
#r = requests.get('https://www.sec.gov/Archives/edgar/data/320193/000032019320000096/aapl-20200926.htm')

# Item 16 returns None as expected and returning texts for other items as well
r = requests.get('https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt')

# Contains item 16
#r = requests.get('https://www.sec.gov/Archives/edgar/data/0001163302/000116330221000013/x-20201231.htm')

# doc_type = 10-K not found, use table info for finding summary, Item-16 not present
#r = requests.get('https://www.sec.gov/Archives/edgar/data/1000045/000156459019023956/nick-10k_20190331.htm')

# Another different type of doc, besides item-16 it has no table of content as well
#r = requests.get('https://www.sec.gov/Archives/edgar/data/1000683/000121390019005351/f10k2018_blondertongue.htm')

raw_10k = r.text

In [4]:
print(raw_10k[0:2000])

<SEC-DOCUMENT>0000320193-18-000145.txt : 20181105
<SEC-HEADER>0000320193-18-000145.hdr.sgml : 20181105
<ACCEPTANCE-DATETIME>20181105080140
ACCESSION NUMBER:		0000320193-18-000145
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		88
CONFORMED PERIOD OF REPORT:	20180929
FILED AS OF DATE:		20181105
DATE AS OF CHANGE:		20181105

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			APPLE INC
		CENTRAL INDEX KEY:			0000320193
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRONIC COMPUTERS [3571]
		IRS NUMBER:				942404110
		STATE OF INCORPORATION:			CA
		FISCAL YEAR END:			0930

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-36743
		FILM NUMBER:		181158788

	BUSINESS ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014
		BUSINESS PHONE:		(408) 996-1010

	MAIL ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	APPLE COMPUTER INC
		DATE OF NA

In [5]:
# Regex to find <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Regex to find <TYPE> tag prceeding any characters, terminating at new line
type_pattern = re.compile(r'<TYPE>[^\n]+')

In [6]:
# Create 3 lists with the span idices for each regex

### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
### First filter will give us document tag start <end> and document tag end's <start> 
### We will use this to later grab content in between these tags
doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

### Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
### Once we have have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K' 
### as section names
doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

In [7]:
document = {}

# Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        document[doc_type] = raw_10k[doc_start:doc_end]

In [8]:
# display excerpt the document
document['10-K'][0:500]

'\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>a10-k20189292018.htm\n<DESCRIPTION>10-K\n<TEXT>\n<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\n<html>\n\t<head>\n\t\t<!-- Document created using Wdesk 1 -->\n\t\t<!-- Copyright 2018 Workiva -->\n\t\t<title>Document</title>\n\t</head>\n\t<body style="font-family:Times New Roman;font-size:10pt;">\n<div><a name="s3540C27286EF5B0DA103CC59028B96BE"></a></div><div style="line-height:120%;text-align:center;font-size:10pt;"><div sty'

In [9]:
# Write the regex
regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8|16)\.{0,1})|(ITEM\s(1A|1B|7A|7|8|16))')

# Use finditer to math the regex
matches = regex.finditer(document['10-K'])

# Write a for loop to print the matches
for match in matches:
    print(match)

<re.Match object; span=(38318, 38327), match='>Item 1A.'>
<re.Match object; span=(39347, 39356), match='>Item 1B.'>
<re.Match object; span=(46148, 46156), match='>Item 7.'>
<re.Match object; span=(47281, 47290), match='>Item 7A.'>
<re.Match object; span=(48357, 48365), match='>Item 8.'>
<re.Match object; span=(60132, 60141), match='>Item 16.'>
<re.Match object; span=(119131, 119140), match='>Item 1A.'>
<re.Match object; span=(197023, 197032), match='>Item 1B.'>
<re.Match object; span=(333318, 333326), match='>Item 7.'>
<re.Match object; span=(729984, 729993), match='>Item 7A.'>
<re.Match object; span=(741774, 741782), match='>Item 8.'>
<re.Match object; span=(2360825, 2360834), match='>Item 16.'>


In [10]:
# Matches
matches = regex.finditer(document['10-K'])

# Create the dataframe
test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

test_df.columns = ['item', 'start', 'end']
test_df['item'] = test_df.item.str.lower()

# Display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,>item 1a.,38318,38327
1,>item 1b.,39347,39356
2,>item 7.,46148,46156
3,>item 7a.,47281,47290
4,>item 8.,48357,48365


In [11]:

# Get rid of unnesesary charcters from the dataframe
test_df.replace('&#160;',' ',regex=True,inplace=True)
test_df.replace('&nbsp;',' ',regex=True,inplace=True)
test_df.replace(' ','',regex=True,inplace=True)
test_df.replace('\.','',regex=True,inplace=True)
test_df.replace('>','',regex=True,inplace=True)

# display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,item1a,38318,38327
1,item1b,39347,39356
2,item7,46148,46156
3,item7a,47281,47290
4,item8,48357,48365


In [12]:
# Drop duplicates
pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

# Display the dataframe
pos_dat

Unnamed: 0,item,start,end
6,item1a,119131,119140
7,item1b,197023,197032
8,item7,333318,333326
9,item7a,729984,729993
10,item8,741774,741782
11,item16,2360825,2360834


In [13]:

# Set item as the dataframe index
pos_dat.set_index('item', inplace=True)

# display the dataframe
pos_dat

Unnamed: 0_level_0,start,end
item,Unnamed: 1_level_1,Unnamed: 2_level_1
item1a,119131,119140
item1b,197023,197032
item7,333318,333326
item7a,729984,729993
item8,741774,741782
item16,2360825,2360834


In [14]:
# Get Item 1a
item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]

# Get Item 7
item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]

# Get Item 7a
item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]

# Get Item 16
item_16_raw = document['10-K'][pos_dat['start'].loc['item16']:pos_dat['start'].loc['item16']]

In [15]:
print(item_1a_raw[0:1000])
print('*'*50)

item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
print(item_1a_content.prettify()[0:1000])
print('*'*50)

# Final text after cleaning
print(item_1a_content.get_text("\n\n")[0:5000])

>Item 1A.</font></div></td><td style="vertical-align:top;"><div style="line-height:120%;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">Risk Factors</font></div></td></tr></table><div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;">The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item&#160;7, &#8220;Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations&#8221; and the consolidated financial statements and related notes in Part II, Item&#160;8, &#8220;Financial Statements and Supplementary Data&#8221; of this Form 10-K.</font></div><div style="line-height:120%;padding-top:16px;text-align:justify;font-size:9pt;"><f

In [16]:
print(item_16_raw[0:1000])
print('*'*50)

item_16_content = BeautifulSoup(item_16_raw, 'lxml')
print(item_16_content.prettify()[0:1000])
print('*'*50)

# Final text after cleaning
print(item_16_content.get_text("\n\n")[0:1500])


**************************************************

**************************************************

