In [1]:
from os import access
import requests
import pandas as pd

class DataProcessor:
  """Small client for the SEC EDGAR JSON endpoints used in this notebook.

  All HTTP traffic goes through ``_get``, which attaches ``self.headers``
  (a ``User-Agent`` identifying the caller) and a request timeout.
  """

  # Seconds to wait on a single HTTP request before giving up, so that a
  # stalled connection cannot hang the notebook indefinitely.
  REQUEST_TIMEOUT = 30

  def __init__(self, user_agent="bip.sec22@gmail.com"):
    """Store the request headers.

    Args:
      user_agent: value sent as the ``User-Agent`` header on every request.
    """
    self.headers = {'User-Agent': user_agent}

  def _get(self, url):
    """Send a GET request for ``url`` with the stored headers and a timeout."""
    return requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

  @staticmethod
  def _normalize_cik(cik):
    """Return ``cik`` as a string left-padded with zeros to 10 characters."""
    return str(cik).zfill(10)

  def _first_cik(self):
    """Return the zero-padded CIK of the first row of the company table."""
    # .iloc is positional; the table's index labels are the JSON keys
    # ('0', '1', ...), so label-style lookup with the integer 0 is unreliable.
    return self.cik_companies()['cik_str'].iloc[0]

  def getcompanyTickers(self):
    """Download the company ticker listing and return the parsed JSON.

    Raises:
      requests.HTTPError: if the server answers with an error status.
    """
    response = self._get("https://www.sec.gov/files/company_tickers.json")
    response.raise_for_status()
    return response.json()

  def getCompanyName(self):
    """Placeholder; not implemented yet (returns None)."""
    pass

  def cik_companies(self):
    """Return the ticker listing as a DataFrame with zero-padded CIKs.

    Returns:
      DataFrame built from ``getcompanyTickers()`` (one row per top-level
      key) whose ``cik_str`` column is converted to a 10-character,
      zero-padded string.
    """
    companyTickers = self.getcompanyTickers()

    # dictionary to dataframe: each top-level key becomes a row label
    companyData = pd.DataFrame.from_dict(companyTickers, orient='index')

    # add leading zeros to CIK (the URLs below use the 10-digit form)
    companyData['cik_str'] = companyData['cik_str'].astype(str).str.zfill(10)

    return companyData

  def getFilingMetaData(self, cik=None):
    """Return the ``filings.recent`` section of a company's submissions feed.

    Args:
      cik: company CIK (int or str). Defaults to the first company in
        ``cik_companies()``, matching the previous behaviour.

    Returns:
      DataFrame built from the ``['filings']['recent']`` mapping of the
      response.

    Raises:
      requests.HTTPError: if the server answers with an error status.
    """
    cik = self._first_cik() if cik is None else self._normalize_cik(cik)

    # get company specific filing metadata
    filingMetadata = self._get(f'https://data.sec.gov/submissions/CIK{cik}.json')
    filingMetadata.raise_for_status()

    # dictionary to dataframe
    return pd.DataFrame.from_dict(filingMetadata.json()['filings']['recent'])

  def getCompanyFacts(self, cik=None):
    """Return the parsed company-facts JSON for one company.

    Args:
      cik: company CIK (int or str). Defaults to the first company in
        ``cik_companies()``.

    Returns:
      The decoded JSON body of the companyfacts response.

    Raises:
      requests.HTTPError: if the server answers with an error status.
    """
    cik = self._first_cik() if cik is None else self._normalize_cik(cik)

    # get company facts data
    companyFacts = self._get(
        f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json'
    )
    companyFacts.raise_for_status()
    return companyFacts.json()

  def getFilingsData(self, limit=100):
    """Collect the us-gaap ``Assets`` concept (USD) for the first companies.

    Failures for an individual company are printed and skipped so that one
    bad response does not abort the whole run.

    Args:
      limit: number of leading rows of ``cik_companies()`` to query;
        ``None`` queries every row.

    Returns:
      One DataFrame with the USD records of every company that returned
      data, plus a ``ticker`` column; an empty DataFrame if none did.
    """
    companyData = self.cik_companies()
    all_data = []

    # iloc[:limit] is positional and end-exclusive: exactly `limit` rows.
    for _, row in companyData.iloc[:limit].iterrows():
      cik = row['cik_str']
      ticker = row['ticker']

      print(f"Fetching data for CIK: {cik}, Ticker: {ticker}")

      try:
        companyConcept = self._get(
            f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/Assets.json'
        )

        if companyConcept.status_code != 200:
          print(f"Failed to fetch data for CIK {cik}, status code: {companyConcept.status_code}")
          continue

        company_concept_data = companyConcept.json()
        if 'units' not in company_concept_data or 'USD' not in company_concept_data['units']:
          print(f"No USD data found for CIK {cik}")
          continue

        assetsData = pd.DataFrame.from_dict(company_concept_data['units']['USD'])
        assetsData['ticker'] = ticker
        all_data.append(assetsData)
        print(f"Successfully fetched data for {ticker}")

      except Exception as e:
        # Deliberately broad: this is a best-effort bulk fetch, so the error
        # is reported and the loop moves on to the next company.
        print(f"An error occurred while fetching data for CIK {cik}: {e}")

    if not all_data:
      return pd.DataFrame()
    return pd.concat(all_data, ignore_index=True)

  def getFilings10kData(self):
    """Return the rows of ``getFilingsData()`` whose ``form`` is ``'10-K'``.

    Returns:
      Filtered DataFrame with a fresh 0..n-1 index; an empty DataFrame when
      no filings data could be fetched.
    """
    assetsData = self.getFilingsData()

    # getFilingsData() yields a column-less frame when nothing was fetched;
    # filtering on 'form' would fail there, so return the empty frame as is.
    if assetsData.empty:
      return assetsData.reset_index(drop=True)

    # get assets from 10K forms and reset index
    return assetsData[assetsData['form'] == '10-K'].reset_index(drop=True)


# Single DataProcessor instance reused by the cells below.
secObject = DataProcessor()

In [2]:
# Fetch the company table (cik_str, ticker, title) and display it.
companyNames = secObject.cik_companies()
companyNames

Unnamed: 0,cik_str,ticker,title
0,0000789019,MSFT,MICROSOFT CORP
1,0000320193,AAPL,Apple Inc.
2,0001045810,NVDA,NVIDIA CORP
3,0001652044,GOOGL,Alphabet Inc.
4,0001018724,AMZN,AMAZON COM INC
...,...,...,...
10280,0001879814,TLGYW,TLGY ACQUISITION CORP
10281,0001876581,IMPPP,Imperial Petroleum Inc./Marshall Islands
10282,0001871638,BRKHU,BurTech Acquisition Corp.
10283,0001871638,BRKHW,BurTech Acquisition Corp.


In [24]:
# Fetch the raw ticker listing as returned by getcompanyTickers()
# (the parsed JSON, before it is turned into a DataFrame).
companyTickers = secObject.getcompanyTickers()
# print(companyTickers.keys())

In [4]:
# Get the company table (zero-padded CIK, ticker, title) as a DataFrame
# and print its plain-text representation.
companyData = secObject.cik_companies()
print(companyData)

          cik_str ticker                                     title
0      0000789019   MSFT                            MICROSOFT CORP
1      0000320193   AAPL                                Apple Inc.
2      0001045810   NVDA                               NVIDIA CORP
3      0001652044  GOOGL                             Alphabet Inc.
4      0001018724   AMZN                            AMAZON COM INC
...           ...    ...                                       ...
10280  0001879814  TLGYW                     TLGY ACQUISITION CORP
10281  0001876581  IMPPP  Imperial Petroleum Inc./Marshall Islands
10282  0001871638  BRKHU                 BurTech Acquisition Corp.
10283  0001871638  BRKHW                 BurTech Acquisition Corp.
10284  0001876431  PRENW                      Prenetics Global Ltd

[10285 rows x 3 columns]


In [5]:
# Get the recent-filings metadata; getFilingMetaData() queries the first
# company in the company table.
companyMetaData = secObject.getFilingMetaData()
companyMetaData

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0000950170-24-077351,2024-06-25,2023-12-31,2024-06-25T16:40:41.000Z,34,11-K,001-37845,241069468,,494656,0,0,msft-11k-401k-pr-2023.htm,11-K
1,0001193125-24-167902,2024-06-25,2023-12-31,2024-06-25T16:33:56.000Z,34,11-K,001-37845,241069378,,2040205,0,0,d840663d11k.htm,11-K
2,0001062993-24-012674,2024-06-14,2024-06-13,2024-06-14T17:59:10.000Z,,4,,,,6279,0,0,xslF345X05/form4.xml,STATEMENT OF CHANGES IN BENEFICIAL OWNERSHIP O...
3,0001062993-24-012673,2024-06-14,2024-06-13,2024-06-14T17:58:49.000Z,,4,,,,6222,0,0,xslF345X05/form4.xml,STATEMENT OF CHANGES IN BENEFICIAL OWNERSHIP O...
4,0001062993-24-012672,2024-06-14,2024-06-13,2024-06-14T17:58:24.000Z,,4,,,,6405,0,0,xslF345X05/form4.xml,STATEMENT OF CHANGES IN BENEFICIAL OWNERSHIP O...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0001626431-18-000007,2018-02-01,2018-01-31,2018-02-01T18:01:05.000Z,,4,,,,7047,0,0,xslF345X03/edgar.xml,PRIMARY DOCUMENT
996,0001626431-18-000006,2018-02-01,2018-01-31,2018-02-01T18:00:06.000Z,,4,,,,7677,0,0,xslF345X03/edgar.xml,PRIMARY DOCUMENT
997,0001193125-18-027418,2018-01-31,,2018-01-31T16:13:02.000Z,34,IRANNOTICE,000-14278,18562869,,4411,0,0,d520423dirannotice.htm,IRANNOTICE
998,0001564590-18-001129,2018-01-31,2017-12-31,2018-01-31T16:10:10.000Z,34,10-Q,001-37845,18562797,,24083544,1,0,msft-10q_20171231.htm,10-Q


In [6]:
# Inspect the primaryDocument value of the filing at index label 2.
companyMetaData["primaryDocument"][2]

'xslF345X05/form4.xml'

In [None]:
# Get the Assets filings data for the leading companies of the company table.
# Issues one HTTP request per company and prints progress for each.
filingsData = secObject.getFilingsData()
# filingsData

In [8]:
# Optional: save the fetched filings data to CSV (uncomment to run).
# filingsData.to_csv("filings_dataset.csv")

In [None]:
# Get the 10-K subset of the filings data.
# NOTE: getFilings10kData() calls getFilingsData() itself, so this cell
# repeats the per-company downloads rather than reusing `filingsData`.

filings10kData = secObject.getFilings10kData()
filings10kData

In [10]:
# Optional: save the 10-K subset to CSV (uncomment to run).
# filings10kData.to_csv("filings_10k_dataset.csv")

In [11]:
# Display the 10-K filings table.
filings10kData

Unnamed: 0,end,val,accn,fy,fp,form,filed,frame,ticker
0,2009-06-30,77888000000,0001193125-10-171791,2010.0,FY,10-K,2010-07-30,CY2009Q2I,MSFT
1,2010-06-30,86113000000,0001193125-10-171791,2010.0,FY,10-K,2010-07-30,,MSFT
2,2010-06-30,86113000000,0001193125-11-200680,2011.0,FY,10-K,2011-07-28,CY2010Q2I,MSFT
3,2011-06-30,108704000000,0001193125-11-200680,2011.0,FY,10-K,2011-07-28,,MSFT
4,2011-06-30,108704000000,0001193125-12-316848,2012.0,FY,10-K,2012-07-26,CY2011Q2I,MSFT
...,...,...,...,...,...,...,...,...,...
2534,2021-12-31,71132300000,0000080661-22-000046,2021.0,FY,10-K,2022-02-28,,PGR
2535,2021-12-31,71132300000,0000080661-23-000006,2022.0,FY,10-K,2023-02-27,CY2021Q4I,PGR
2536,2022-12-31,75465000000,0000080661-23-000006,2022.0,FY,10-K,2023-02-27,,PGR
2537,2022-12-31,75465000000,0000080661-24-000007,2023.0,FY,10-K,2024-02-26,CY2022Q4I,PGR


In [12]:
# Reduce the company table to a plain list of ticker symbols.
# Guarded so the cell can be re-run: once companyNames is already a list,
# indexing it with "ticker" would raise TypeError.
if isinstance(companyNames, pd.DataFrame):
    companyNames = companyNames["ticker"].tolist()

In [None]:
# Display the list of ticker symbols.
companyNames

# SEC Edgar Downloader

In [14]:
# Clone the sec-edgar-downloader source repository into the working directory.
# NOTE(review): the package is also pip-installed in a later cell, so this
# checkout looks redundant — confirm whether the source tree is needed.
!git clone https://github.com/jadchaar/sec-edgar-downloader.git

Cloning into 'sec-edgar-downloader'...
remote: Enumerating objects: 1270, done.[K
remote: Counting objects: 100% (242/242), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 1270 (delta 144), reused 164 (delta 105), pack-reused 1028[K
Receiving objects: 100% (1270/1270), 4.33 MiB | 20.90 MiB/s, done.
Resolving deltas: 100% (724/724), done.


In [15]:
# Make the cloned repository importable by appending it to the module search path.
# NOTE(review): hard-coded absolute path; presumably a Colab working directory — confirm.
import sys
sys.path.append("/content/sec-edgar-downloader")

In [16]:
!pip install -U sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl (14 kB)
Collecting pyrate-limiter>=3.1.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.6.1-py3-none-any.whl (26 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.6.1 sec-edgar-downloader-5.0.2


In [17]:
from sec_edgar_downloader import Downloader

# Create a downloader; the two arguments are presumably the requester's
# company name and contact e-mail — "OK" looks like a placeholder (TODO confirm).
dl = Downloader("OK", "bip.sec22@gmail.com")

# Example

# Get all 10-K filings for Microsoft
# (the value displayed below is whatever dl.get returns; presumably the
# number of filings downloaded — verify against the library docs)
dl.get("10-K", "MSFT")

30

In [18]:
# Re-create the downloader with "/content/Dataset" as third argument
# (presumably the download folder, rather than the current working
# directory — confirm against the library docs).
dl = Downloader("Biplab", "bip.sec22@gmail.com", "/content/Dataset")

In [19]:
# Download the 10-K filings of the first 10 companies.
# The slice end is exclusive, so [:10] selects indices 0-9 (exactly 10 tickers).
companyTickers = companyNames[:10]

for ticker in companyTickers:
    dl.get("10-K", ticker)

In [20]:
# Usage examples for the SEC Edgar Downloader.
# Each call below is independent; all of them use the `dl` instance created
# in an earlier cell. The second argument is either a ticker symbol or a
# 10-digit CIK string.

# Get all 8-K filings for Apple, including filing amendments (8-K/A)
dl.get("8-K", "AAPL", include_amends=True)

# Get all 8-K filings for Apple after January 1, 2017 and before March 25, 2017
# Note: after and before strings must be in the form "YYYY-MM-DD"
dl.get("8-K", "AAPL", after="2017-01-01", before="2017-03-25")

# Get the five most recent 8-K filings for Apple
dl.get("8-K", "AAPL", limit=5)

# Get all 10-K filings for Microsoft
dl.get("10-K", "MSFT")

# Get the latest 10-K filing for Microsoft
dl.get("10-K", "MSFT", limit=1)

# Get all 10-Q filings for Visa
dl.get("10-Q", "V")

# Get all 13F-NT filings for the Vanguard Group
dl.get("13F-NT", "0000102909")

# Get all 13F-HR filings for the Vanguard Group
dl.get("13F-HR", "0000102909")

# Get all SC 13G filings for Apple
dl.get("SC 13G", "AAPL")

# Get all SD filings for Apple
dl.get("SD", "AAPL")