In [1]:
from google.colab import drive
import os

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Change directory to where your notebook is located
notebook_directory = '/content/drive/My Drive/Colab Notebooks/2024 NLP GenAI/NLP/project'
os.chdir(notebook_directory)

# Step 3: Verify the current working directory
print("Current working directory:", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/My Drive/Colab Notebooks/2024 NLP GenAI/NLP/project


In [2]:
import sys
sys.path.append('./finance_llm_data-main')

In [3]:
!pip install numpy==1.26.4



In [4]:
!pip install -q -r finance_llm_data-main/requirements.txt

In [5]:
%%capture
!sudo apt-get install wkhtmltopdf

In [6]:
import pickle
import time
from pathlib import Path

In [7]:
# numpy errors but will go away after restarting the session
from finance_data import get_data
from marker_sec_src import sec_save_pdfs

In [8]:
ticker = 'Dell'
year = '2024'
filing_types = ['10-Q']
include_amends = False

In [9]:
def scrape_sec_data(ticker, year, filing_types, include_amends, output_dir='./output'):

    start_time = time.time()

    # earnings data
    earnings_docs,\
    earnings_call_quarter_vals,\
    speakers_list_1,\
    speakers_list_2,\
    speakers_list_3,\
    speakers_list_4 = get_data(
        ticker=ticker,
        year=year,
        data_source='earnings_calls'
    )
    print(f"length of earnings_docs: {len(earnings_docs)}")

    data_to_save = {
        'earnings_docs': earnings_docs,
        'earnings_call_quarter_vals': earnings_call_quarter_vals,
        'speakers_list_1': speakers_list_1,
        'speakers_list_2': speakers_list_2,
        'speakers_list_3': speakers_list_3,
        'speakers_list_4': speakers_list_4
    }

    file_path = Path(output_dir) / 'earnings_call' / f'{ticker}-{year}' / 'earnings_calls.pkl'
    file_path.parent.mkdir(parents=True, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

    # unstructured sec data
    sec_data,sec_form_names = get_data(
    ticker=ticker,
    year=year,
    data_source='unstructured',
    include_amends=include_amends,
    filing_types=filing_types
    )

    data_to_save = {
        'sec_data': sec_data,
        'sec_form_names': sec_form_names
    }

    file_path = Path(output_dir) / 'unstructured_sec_data' / f'{ticker}-{year}' / 'unstructured_sec_data.pkl'
    file_path.parent.mkdir(parents=True, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f"length of unstructured_sec_data: {len(sec_data)}")

    # sec filing pdf to markdown files (aka marker data)

    html_urls, metadata_json, metadata_file_path, input_ticker_year_path = sec_save_pdfs(
            ticker, year, filing_types, include_amends
        )
    print(f"metadata_file_path: {metadata_file_path}")
    print(f"input_ticker_year_path: {input_ticker_year_path}")

    get_data(ticker=ticker,
         year=year,
         data_source='marker_pdf',
         batch_processing=False,
         batch_multiplier=1)

    end_time = time.time()
    execution_time = end_time - start_time

    print(f"Execution time: {round(execution_time/60, 2)} minutes")

In [None]:
scrape_sec_data(ticker, year, filing_types, include_amends, output_dir='./output')

Earnings Call Q1
Earnings Call Q2
Earnings Call Q3
Earnings Call Q4
length of earnings_docs: 265
Started Scraping
Scraped
Started Extracting
Extracted
length of unstructured_sec_data: 0
metadata_file_path: output/SEC_EDGAR_FILINGS/Dell-2024/metadata.json
input_ticker_year_path: output/SEC_EDGAR_FILINGS/Dell-2024
Loaded detection model vikp/surya_det2 on device cuda with dtype torch.float16
Loaded detection model vikp/surya_layout2 on device cuda with dtype torch.float16
Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec on device cuda with dtype torch.float16
Loaded texify model to cuda with torch.float16 dtype


  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 21/21 [00:24<00:00,  1.17s/it]
Detecting bboxes: 100%|██████████| 14/14 [00:24<00:00,  1.74s/it]
Finding reading order: 100%|██████████| 14/14 [00:15<00:00,  1.09s/it]


Saved markdown to the output/SEC_EDGAR_FILINGS_MD/Dell-2024/dell-20240503-10-Q2 folder


  self.pid = os.fork()
  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 42/42 [00:47<00:00,  1.13s/it]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]
Detecting bboxes:  93%|█████████▎| 26/28 [00:48<00:04,  2.27s/it]

In [8]:
import itertools

ticker = ['INTC', 'QCOM', 'AVGO', 'DELL', 'HPE']
year = ['2024']
filing_types = [['10-K','10-Q']]
include_amends = [False]

combinations = list(itertools.product(ticker, year, filing_types, include_amends))

# Print the combinations
for combo in combinations:
    print(combo)

('INTC', '2024', ['10-K', '10-Q'], False)
('QCOM', '2024', ['10-K', '10-Q'], False)
('AVGO', '2024', ['10-K', '10-Q'], False)
('DELL', '2024', ['10-K', '10-Q'], False)
('HPE', '2024', ['10-K', '10-Q'], False)


In [None]:
for ticker, year, filing_types, include_amends in combinations:
    print(f"ticker: {ticker}, year: {year}, filing_types: {filing_types}, include_amends: {include_amends}")
    scrape_sec_data(ticker, year, filing_types, include_amends, output_dir='./output')

ticker: INTC, year: 2024, filing_types: ['10-K', '10-Q'], include_amends: False
Earnings Call Q1
Earnings Call Q2
Don't have the data for Q2
Earnings Call Q3
Don't have the data for Q3
Earnings Call Q4
Don't have the data for Q4
length of earnings_docs: 60
Started Scraping
Scraped
Started Extracting
Extracted
length of unstructured_sec_data: 0
metadata_file_path: output/SEC_EDGAR_FILINGS/INTC-2024/metadata.json
input_ticker_year_path: output/SEC_EDGAR_FILINGS/INTC-2024


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

Loaded detection model vikp/surya_det2 on device cuda with dtype torch.float16


preprocessor_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

Loaded detection model vikp/surya_layout2 on device cuda with dtype torch.float16


preprocessor_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/550M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16


preprocessor_config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.91k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Loaded recognition model vikp/surya_rec on device cuda with dtype torch.float16


preprocessor_config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/625M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Loaded texify model to cuda with torch.float16 dtype


preprocessor_config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 12/12 [00:14<00:00,  1.17s/it]
Detecting bboxes: 100%|██████████| 8/8 [00:14<00:00,  1.85s/it]
Finding reading order: 100%|██████████| 8/8 [00:09<00:00,  1.23s/it]


Saved markdown to the output/SEC_EDGAR_FILINGS_MD/INTC-2024/intc-20240330-10-Q1 folder
Files have been saved successfully. Check in the folder output/SEC_EDGAR_FILINGS_MD/INTC-2024
Execution time: 3.9 minutes
ticker: QCOM, year: 2024, filing_types: ['10-K', '10-Q'], include_amends: False
Earnings Call Q1
Earnings Call Q2
Earnings Call Q3
Don't have the data for Q3
Earnings Call Q4
Don't have the data for Q4
length of earnings_docs: 100
Started Scraping
Scraped
Started Extracting
Extracted
length of unstructured_sec_data: 0
metadata_file_path: output/SEC_EDGAR_FILINGS/QCOM-2024/metadata.json
input_ticker_year_path: output/SEC_EDGAR_FILINGS/QCOM-2024
Loaded detection model vikp/surya_det2 on device cuda with dtype torch.float16
Loaded detection model vikp/surya_layout2 on device cuda with dtype torch.float16
Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec on device cuda with dtype torch.float16
Loaded texify mode

  self.pid = os.fork()
  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 13/13 [00:14<00:00,  1.09s/it]
Detecting bboxes: 100%|██████████| 9/9 [00:15<00:00,  1.70s/it]
Finding reading order: 100%|██████████| 9/9 [00:09<00:00,  1.01s/it]


Saved markdown to the output/SEC_EDGAR_FILINGS_MD/QCOM-2024/qcom-20240324-10-Q1 folder
Files have been saved successfully. Check in the folder output/SEC_EDGAR_FILINGS_MD/QCOM-2024
Execution time: 3.7 minutes
ticker: AVGO, year: 2024, filing_types: ['10-K', '10-Q'], include_amends: False
Earnings Call Q1
Earnings Call Q2
Earnings Call Q3
Don't have the data for Q3
Earnings Call Q4
Don't have the data for Q4
length of earnings_docs: 132
Started Scraping
Scraped
Started Extracting
Extracted
length of unstructured_sec_data: 0
metadata_file_path: output/SEC_EDGAR_FILINGS/AVGO-2024/metadata.json
input_ticker_year_path: output/SEC_EDGAR_FILINGS/AVGO-2024
Loaded detection model vikp/surya_det2 on device cuda with dtype torch.float16
Loaded detection model vikp/surya_layout2 on device cuda with dtype torch.float16
Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec on device cuda with dtype torch.float16
Loaded texify mode

  self.pid = os.fork()
  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 16/16 [00:19<00:00,  1.21s/it]
Recognizing Text: 100%|██████████| 1/1 [00:02<00:00,  2.60s/it]
Detecting bboxes: 100%|██████████| 11/11 [00:18<00:00,  1.69s/it]
Finding reading order: 100%|██████████| 11/11 [00:11<00:00,  1.01s/it]


Saved markdown to the output/SEC_EDGAR_FILINGS_MD/AVGO-2024/avgo-20240505-10-Q2 folder


  self.pid = os.fork()
  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 16/16 [00:16<00:00,  1.06s/it]
Recognizing Text: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it]
Detecting bboxes: 100%|██████████| 11/11 [00:17<00:00,  1.56s/it]
Finding reading order: 100%|██████████| 11/11 [00:10<00:00,  1.03it/s]


Saved markdown to the output/SEC_EDGAR_FILINGS_MD/AVGO-2024/avgo-20240204-10-Q1 folder
Files have been saved successfully. Check in the folder output/SEC_EDGAR_FILINGS_MD/AVGO-2024
Execution time: 6.97 minutes
ticker: DELL, year: 2024, filing_types: ['10-K', '10-Q'], include_amends: False
Earnings Call Q1
Earnings Call Q2
Earnings Call Q3
Earnings Call Q4
length of earnings_docs: 265
Started Scraping
Scraped
Started Extracting
Extracted
length of unstructured_sec_data: 0
metadata_file_path: output/SEC_EDGAR_FILINGS/DELL-2024/metadata.json
input_ticker_year_path: output/SEC_EDGAR_FILINGS/DELL-2024
Loaded detection model vikp/surya_det2 on device cuda with dtype torch.float16
Loaded detection model vikp/surya_layout2 on device cuda with dtype torch.float16
Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec on device cuda with dtype torch.float16
Loaded texify model to cuda with torch.float16 dtype


  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 21/21 [00:23<00:00,  1.10s/it]
Detecting bboxes: 100%|██████████| 14/14 [00:23<00:00,  1.69s/it]
Finding reading order: 100%|██████████| 14/14 [00:13<00:00,  1.00it/s]


Saved markdown to the output/SEC_EDGAR_FILINGS_MD/DELL-2024/dell-20240503-10-Q2 folder


  self.pid = os.fork()
Detecting bboxes: 100%|██████████| 42/42 [00:46<00:00,  1.10s/it]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]
Detecting bboxes:  89%|████████▉ | 25/28 [00:46<00:06,  2.27s/it]

Dell 10-K errored out, manually restarted for this one, still maxed out RAM. Let it be.