In [17]:
from sec_edgar_api import EdgarClient
import json
import os
from tqdm import tqdm

In [28]:
def get_filings(ciks, target_year, industry):
    """Collect the 10-K filings filed in ``target_year`` for each CIK and save them as JSON.

    For every CIK the submission data is fetched with
    ``EdgarClient.get_submissions``. The parallel ``form`` / ``accessionNumber`` /
    ``filingDate`` lists under ``filings.recent`` are filtered down to entries whose
    form is exactly ``"10-K"`` and whose filing date starts with ``target_year``.
    Each match becomes a dict with the keys ``company``, ``accessionNumber``,
    ``filingDate`` and (when accession number and date are non-empty) ``url``.

    Args:
        ciks: Iterable of CIK identifiers, passed one at a time to
            ``get_submissions`` and interpolated verbatim into the filing URL.
        target_year: Year to keep (int or str); it is converted with ``str`` and
            matched against the beginning of each ``filingDate``.
        industry: Label used to build the output directory name.

    Returns:
        None. The accumulated records are written to
        ``./10K_URL_{industry}/All_10K_Filings_{target_year}.json``; the
        directory is created when missing and the file is overwritten.

    Notes:
        Processing is best-effort: any exception raised while handling a CIK is
        printed and that CIK is skipped, so the JSON file is written even if
        some (or all) companies failed.
    """
    edgar = EdgarClient(user_agent="<Sample Company Name> <Admin Contact>@<Sample Company Domain>")

    # Hoisted out of the loops: the prefix every kept filing date must start with.
    year_prefix = str(target_year)

    # Accumulates the 10-K records of all companies for the target year.
    all_filings_for_year = []

    for cik in ciks:
        try:
            data = edgar.get_submissions(cik)

            # Use the first ticker as the company label. A missing, None or
            # *empty* ticker list falls back to a placeholder; indexing an empty
            # list here used to raise IndexError and drop the whole company.
            tickers = data.get('tickers') or []
            company = tickers[0] if tickers else "Unknown_Company"

            recent_filings = data.get('filings', {}).get('recent', {})
            forms = recent_filings.get('form', [])
            accession_numbers = recent_filings.get('accessionNumber', [])
            filing_dates = recent_filings.get('filingDate', [])

            # Build this company's records completely before touching the
            # accumulator, so a failure part-way never leaves partial data behind.
            ten_k_filings = []
            for form, accession, filing_date in zip(forms, accession_numbers, filing_dates):
                if form != "10-K" or not filing_date.startswith(year_prefix):
                    continue

                filing = {"company": company, "accessionNumber": accession, "filingDate": filing_date}

                if accession and filing_date:
                    # The archive folder name is the accession number without
                    # dashes; the .txt file name keeps the dashed form.
                    formatted_accession = accession.replace("-", "")
                    filing["url"] = (
                        f"https://www.sec.gov/Archives/edgar/data/{cik}/{formatted_accession}/{accession}.txt"
                    )

                ten_k_filings.append(filing)

            all_filings_for_year.extend(ten_k_filings)

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next CIK.
            print(f"Error getting submissions for CIK: {cik}")
            print(e)
            continue

    # Save all 10-K filings for the target year in a single JSON file.
    output_file_path = f'./10K_URL_{industry}/All_10K_Filings_{target_year}.json'
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)  # Create directory if it doesn't exist

    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        json.dump(all_filings_for_year, output_file, indent=4)

In [31]:
# Driver cell: export the 10-K filing lists for one industry, one JSON file per year.
# Industry labels noted for this notebook: TECH, OIL, HEALTH.
industry = "OIL"
target_years = list(range(2014, 2025))  # 2014 through 2024 inclusive

# Inline CIK list kept for reference; it can stand in for the JSON lookup below.
# ciks = ["320193", "1018724", "1045810", "1065280", "1341439", "858877", "1326801", "804328", "1645590", "1535527", "1633917", "1477333", "712515"]

# ciks.json is indexed by the industry label to obtain the CIKs to process.
with open("./ciks.json", 'r') as file:
    ciks_data = json.load(file)
ciks = ciks_data[industry]

# Each get_filings call writes the file for a single year; tqdm tracks the years.
year_progress = tqdm(target_years)
for year in year_progress:
    print(f"Getting 10-K filings for the year {year}...")
    get_filings(ciks, year, industry)
    print(f"10-K filings for the year {year} have been saved!")


# "796343", "0000789019", "0001108524",  "0001318605", "51143", "50863", "796343", "2488", "1571996", "1321655",

  0%|          | 0/11 [00:00<?, ?it/s]

Getting 10-K filings for the year 2014...


  9%|▉         | 1/11 [00:07<01:10,  7.06s/it]

10-K filings for the year 2014 have been saved!
Getting 10-K filings for the year 2015...


 18%|█▊        | 2/11 [00:13<01:02,  6.92s/it]

10-K filings for the year 2015 have been saved!
Getting 10-K filings for the year 2016...


 27%|██▋       | 3/11 [00:20<00:52,  6.60s/it]

10-K filings for the year 2016 have been saved!
Getting 10-K filings for the year 2017...


 36%|███▋      | 4/11 [00:26<00:46,  6.57s/it]

10-K filings for the year 2017 have been saved!
Getting 10-K filings for the year 2018...


 45%|████▌     | 5/11 [00:33<00:40,  6.70s/it]

10-K filings for the year 2018 have been saved!
Getting 10-K filings for the year 2019...


 55%|█████▍    | 6/11 [00:40<00:33,  6.74s/it]

10-K filings for the year 2019 have been saved!
Getting 10-K filings for the year 2020...


 64%|██████▎   | 7/11 [00:47<00:27,  6.77s/it]

10-K filings for the year 2020 have been saved!
Getting 10-K filings for the year 2021...


 73%|███████▎  | 8/11 [00:54<00:20,  6.88s/it]

10-K filings for the year 2021 have been saved!
Getting 10-K filings for the year 2022...


 82%|████████▏ | 9/11 [01:01<00:13,  6.99s/it]

10-K filings for the year 2022 have been saved!
Getting 10-K filings for the year 2023...


 91%|█████████ | 10/11 [01:08<00:06,  6.97s/it]

10-K filings for the year 2023 have been saved!
Getting 10-K filings for the year 2024...


100%|██████████| 11/11 [01:15<00:00,  6.88s/it]

10-K filings for the year 2024 have been saved!



