In [1]:
from io import BytesIO
from zipfile import ZipFile, BadZipFile
import requests
from datetime import date, datetime
from pathlib import Path
import pandas_datareader.data as web
import datetime
import pandas as pd
import json
import re
from pprint import pprint
from bs4 import BeautifulSoup
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker

In [4]:
data_path = Path('data')
print(data_path)

data


In [46]:
SEC_URL = 'https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/'

In [47]:
today = pd.Timestamp(date.today())
print(pd.Timestamp(date.today()))
this_year = today.year
this_quater = today.quarter

past_years = range(2020, this_year)
filing_periods = [(y,q) for y in past_years for q in range(1,5)]
filing_periods.extend([(this_year, q) for q in range(1, this_quater+1)])
print(f'filing_periods : {filing_periods}')

2021-08-14 00:00:00
filing_periods : [(2020, 1), (2020, 2), (2020, 3), (2020, 4), (2021, 1), (2021, 2), (2021, 3)]


In [51]:
for i, (yr, qtr) in enumerate(filing_periods, 1) :
    print(yr, qtr, end=' ', )
    filing = f'{yr}q{qtr}_notes.zip'
    path = data_path / f'{yr}_{qtr}' / 'source'
    print(path)
    if not path.exists():
        path.mkdir(exist_ok=True, parents=True)

2020 1 data\2020_1\source
2020 2 data\2020_2\source
2020 3 data\2020_3\source
2020 4 data\2020_4\source
2021 1 data\2021_1\source
2021 2 data\2021_2\source
2021 3 data\2021_3\source


In [60]:
for i, (yr, qtr) in enumerate(filing_periods, 1):
    print(yr, qtr, end=' ', )
    filing = f'{yr}q{qtr}_notes.zip'
    path = data_path / f'{yr}_{qtr}' / 'source'
    if not path.exists():
        path.mkdir(exist_ok=True, parents=True)

    response = requests.get(SEC_URL + filing).content
    print(f'response : {response}')
    try:
        with ZipFile(BytesIO(response)) as zip_file:
            for file in zip_file.namelist():
                local_file = path / file
                print(local_file)
                if local_file.exists():
                    continue
                with local_file.open('wb') as output:
                    for line in zip_file.open(file).readlines():
                        print(line)
                        output.write(line)
    except BadZipFile:
        continue

In [5]:
for f in data_path.glob('**/*.tsv') :
    file_name = f.stem + '.parquet'
    path = Path(f.parents[1]) / 'parquet'
    print(f)
    if (path / file_name).exists() :
        continue
    if not path.exists():
        path.mkdir(exist_ok=True)
    try:
        df = pd.read_csv(f, sep='\t', encoding='latin1', low_memory=False)
    except:
        print(f)
    df.to_parquet(path / file_name)

data\2020_1\source\cal.tsv
data\2020_1\source\dim.tsv
data\2020_1\source\num.tsv
data\2020_1\source\pre.tsv
data\2020_1\source\ren.tsv
data\2020_1\source\sub.tsv
data\2020_1\source\tag.tsv
data\2020_1\source\txt.tsv


In [6]:
sub = pd.read_parquet(data_path / '2020_1' / 'parquet' / 'sub.parquet')
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13561 entries, 0 to 13560
Data columns (total 40 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   adsh         13561 non-null  object 
 1   cik          13561 non-null  int64  
 2   name         13561 non-null  object 
 3   sic          13557 non-null  float64
 4   countryba    13545 non-null  object 
 5   stprba       12623 non-null  object 
 6   cityba       13544 non-null  object 
 7   zipba        13542 non-null  object 
 8   bas1         13545 non-null  object 
 9   bas2         5409 non-null   object 
 10  baph         13547 non-null  object 
 11  countryma    13489 non-null  object 
 12  stprma       12636 non-null  object 
 13  cityma       13489 non-null  object 
 14  zipma        13477 non-null  object 
 15  mas1         13487 non-null  object 
 16  mas2         5354 non-null   object 
 17  countryinc   12378 non-null  object 
 18  stprinc      11710 non-null  object 
 19  ein 

In [43]:
sub[sub.name.str.contains('CORPORATION')].loc[:, ['cik', 'name', 'form', 'instance']]
# sub[sub['name'] == 'Microsoft Corporation'].loc[:, ['cik', 'name', 'form', 'instance']]

Unnamed: 0,cik,name,form,instance
1001,311094,WESTAMERICA BANCORPORATION,8-K,f8k_011620_htm.xml
1150,109380,"ZIONS BANCORPORATION, NATIONAL ASSOCIATION /UT/",8-K,zion-201912318xkcoverp_htm.xml
1459,311094,WESTAMERICA BANCORPORATION,8-K,f8k_012320_htm.xml
1483,1212545,WESTERN ALLIANCE BANCORPORATION,8-K,coverpage-pressrelease_htm.xml
1508,1614184,CADENCE BANCORPORATION,8-K,cade-8k_20200123_htm.xml
2379,1767837,"RICHMOND MUTUAL BANCORPORATION, INC.",8-K,rmbi-20200131_htm.xml
2871,1614184,CADENCE BANCORPORATION,8-K,cade-8k_20200204_htm.xml
3129,109380,"ZIONS BANCORPORATION, NATIONAL ASSOCIATION /UT/",8-K,zions-20200205_htm.xml
3371,1212545,WESTERN ALLIANCE BANCORPORATION,8-K,a8-kdividendannounceme_htm.xml
4114,1212545,WESTERN ALLIANCE BANCORPORATION,8-K,investorpresentationfe_htm.xml


In [7]:
name = 'APPLE INC'
key_cols = ['name', 'adsh', 'cik', 'sic', 'countryba', 'stprba',
            'cityba', 'zipba', 'bas1', 'form', 'period', 'fy', 'fp', 'filed']
sub_temp = sub[key_cols].copy()
apple = sub_temp[sub_temp.name == name].T.dropna(how='any', axis=1).squeeze()
apple

name                    APPLE INC
adsh         0000320193-20-000010
cik                        320193
sic                        3571.0
countryba                      US
stprba                         CA
cityba                  CUPERTINO
zipba                       95014
bas1           ONE APPLE PARK WAY
form                         10-Q
period                   20191231
fy                         2020.0
fp                             Q1
filed                    20200129
Name: 1934, dtype: object

In [23]:
len(sub_temp.cik.unique().tolist())

5480

In [8]:
aapl_sub = sub[(sub.cik.astype(int)==apple.cik)&(sub.form.isin(['10-Q', '10-K']))]
aapl_sub

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,accepted,prevrpt,detail,instance,nciks,aciks,pubfloatusd,floatdate,floataxis,floatmems
1934,0000320193-20-000010,320193,APPLE INC,3571.0,US,CA,CUPERTINO,95014,ONE APPLE PARK WAY,,...,2020-01-28 18:03:00.0,0,1,a10-qq1202012282019_htm.xml,1,,,,,


In [9]:
num = pd.read_parquet(data_path/'2020_1'/'parquet'/ 'num.parquet')
aapl_num = num[num.adsh.isin(aapl_sub.adsh)]
aapl_num.ddate = pd.to_datetime(aapl_num.ddate, format='%Y%m%d')
aapl_num.to_parquet(data_path / 'aapl_nums.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [16]:
eps = aapl_num[(aapl_num.tag == 'EarningsPerShareDiluted')
                & (aapl_num.qtrs == 1)].drop('tag', axis=1)
eps = eps.groupby('adsh').apply(lambda x: x.nlargest(n=1, columns=['ddate']))
eps

Unnamed: 0_level_0,Unnamed: 1_level_0,adsh,version,ddate,qtrs,uom,dimh,iprx,value,footnote,footlen,dimn,coreg,durp,datp,dcml
adsh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0000320193-20-000010,1511138,0000320193-20-000010,us-gaap/2019,2019-12-31,1,USD,0x00000000,0,4.99,,0,0,,0.013699,3.0,2
