# Imports and getting API keys

In [1]:
# Enable the autoreload extension in mode 2 so that edits to imported local
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from openai import OpenAI
import json
import pydantic
import requests
from tqdm import tqdm
from dotenv import load_dotenv
from collections import Counter
from typing import Optional

# Load settings from the local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# The key is not passed explicitly; presumably the client reads it from the
# environment on its own -- confirm against the OpenAI SDK.
openai = OpenAI()

# Smoke test: one tiny chat completion to confirm the client is usable.
smoke_test_messages = [
    {"role":"system","content":"You are a helpful assistant"},
    {"role":"user","content":"Hello"},
]
try:
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=smoke_test_messages,
        max_tokens=5,
    )
except Exception as e:
    print(f"❌ Some error occured: {e}")
else:
    print("✅ OpenAI API loaded in successfully")


✅ OpenAI API loaded in successfully


# Logging in to HuggingFace and loading the dataset

In [3]:
from huggingface_hub import login
from datasets import load_dataset

# Step 1: authenticate against the HuggingFace Hub with the token taken from
# the environment. A failure is reported but does not stop the cell.
HF_TOKEN = os.getenv("HF_TOKEN")
try:
    login(token=HF_TOKEN)
except Exception as e:
    print(f"❌ Some error occured: {e}")
else:
    print("✅ Successfully logged in to HuggingFace")

# Step 2: load the "train" split of the EDGAR filings index into `dataset`.
try:
    dataset = load_dataset(
        "alea-institute/kl3m-index-edgar-filings",
        split="train",
    )
except Exception as e:
    print(f"❌ Some error occured: {str(e)}")
else:
    print("✅ Successfully loaded in the dataset")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


✅ Successfully logged in to HuggingFace
✅ Successfully loaded in the dataset


In [4]:
# Report the size of the raw dataset before any curation.
print(f"We have loaded in a dataset with a grand total of {len(dataset)} datapoints")

# ^ Plan: curate this dataset and then store it in a vector store
# ^ Only the 10-K, 10-Q and 8-K forms will be kept
# ^ Before filtering, look at the dataset summary, its column names and one example record
for preview in (dataset, dataset.column_names, dataset[0]):
    print(preview)

We have loaded in a dataset with a grand total of 19953313 datapoints
Dataset({
    features: ['kl3m_id', 'cik', 'name', 'filingDate', 'form', 'accessionNumber', 'core_type', 'primaryDocDescription', 'items', 'entityType', 'ownerOrg', 'ein', 'description', 'website', 'investorWebsite', 'phone', 'tickers', 'exchanges', 'sic', 'sicDescription', 'category', 'fiscalYearEnd', 'stateOfIncorporation', 'formerNames', 'reportDate', 'acceptanceDateTime', 'act', 'fileNumber', 'filmNumber', 'size', 'isXBRL', 'isInlineXBRL', 'primaryDocument'],
    num_rows: 19953313
})
['kl3m_id', 'cik', 'name', 'filingDate', 'form', 'accessionNumber', 'core_type', 'primaryDocDescription', 'items', 'entityType', 'ownerOrg', 'ein', 'description', 'website', 'investorWebsite', 'phone', 'tickers', 'exchanges', 'sic', 'sicDescription', 'category', 'fiscalYearEnd', 'stateOfIncorporation', 'formerNames', 'reportDate', 'acceptanceDateTime', 'act', 'fileNumber', 'filmNumber', 'size', 'isXBRL', 'isInlineXBRL', 'primaryDocu

In [5]:
# Index the row first, then the column: dataset["sicDescription"][5] would
# read the whole column (one value per row of the full dataset) just to
# display a single entry.
dataset[5]["sicDescription"]

'Gold and Silver Ores'

# Curating the dataset
## Getting only the important forms

In [6]:
# Keep only the 10-K, 10-Q and 8-K filings.
# `input_columns` hands just the "form" value to the predicate instead of a
# full row dict, so the other columns are not materialised for every row.
dataset = dataset.filter(
    lambda form: form in {"10-K", "10-Q", "8-K"},
    input_columns=["form"],
)

In [7]:
# Number of rows left after keeping only the selected form types.
datapoint_count = len(dataset)
print(f"Now we have {datapoint_count} datapoints")

Now we have 1784393 datapoints


In [8]:

# ^ Tally how many filings of each form type remain after filtering
FORMS = ["10-Q", "10-K", "8-K"]
counter = Counter(dataset["form"])

# Report the counts in the fixed order given by FORMS.
for form_type in FORMS:
    print(form_type, counter[form_type], sep=": ")



10-Q: 331958
10-K: 115277
8-K: 1337158


In [9]:
# ? Drop amended filings (form types ending in "/A") and all 8-K filings,
# ? so that only the 10-K and 10-Q reports kept by the earlier form filter remain.
# TODO: restricting to the last 10 years is NOT done here -- this filter only
# looks at the "form" column and never consults a date column.
# `input_columns` passes just the "form" value to the predicate rather than
# building a full row dict for every record.
final_dataset = dataset.filter(
    lambda form: not form.endswith("/A") and form != "8-K",
    input_columns=["form"],
)
print(f"The dataset now contains {len(final_dataset)} datapoints")

The dataset now contains 447235 datapoints


Let's look at how many datapoints there are in each year.

In [None]:
import matplotlib as plt
%matplotlib inline

## Let's filter out the datapoints that do not have any .txt files at their JSON URL

In [10]:
from classes.contentClass import Content
# Wrap every filing of final_dataset in a project-local Content object.
# NOTE(review): the recorded run of this cell progressed at roughly one
# datapoint per second (tqdm estimated ~120 h for 447,235 rows) and was stopped
# with a KeyboardInterrupt. Content(...) presumably does slow per-datapoint
# work (e.g. network requests) -- confirm, and consider running on a sample
# or caching the results.
dataset_obj = []
for datapoint in tqdm(final_dataset):
    # Appending one object at a time keeps the partial results in dataset_obj
    # if the loop is interrupted.
    dataset_obj.append(Content(datapoint))

# Number of Content objects that were built.
print(len(dataset_obj))

  0%|          | 865/447235 [13:57<120:03:38,  1.03it/s]


KeyboardInterrupt: 

## Getting text from the pdfs
Since this dataset contains neither descriptions nor the scraped content of the documents themselves, we have to build a URL for each datapoint from its CIK and accession number. This URL points to the filing's `index.json` file on SEC EDGAR. Using that JSON file we can locate the PDFs for the filing on the website and then extract the content of each PDF with a dedicated Python library.

In [None]:



# Build a Content object for one sample filing and show the text it holds.
# `Content` is the class imported from classes.contentClass; the previous
# `content_class.Content` referred to a name that is never defined in this
# notebook. The attribute is read from the instance, not from the class, so
# the output reflects this particular filing.
content = Content(dataset[6])
print(content.text_list)

[]


In [None]:
def get_text(object) -> str:
    """
    Build a text string from a single filing datapoint.

    The following fields are concatenated, in this order, each prefixed with a
    short label; fields that are missing, ``None`` or blank are skipped:
    - description
    - sicDescription
    - primaryDocDescription
    - form, filingDate and reportDate (form type and key dates)

    The resulting text is later used for creating embeddings.

    :param object: A single record from the dataset (dictionary-like object
        supporting ``.get``). The parameter name shadows the builtin but is
        kept unchanged for backward compatibility with existing callers.
    :type object: dict
    :return: Concatenated text string for the datapoint; an empty string when
        none of the fields carries a value.
    """
    datapoint = object
    labelled_fields = (
        ("Description", "description"),
        ("Industry", "sicDescription"),
        ("Document", "primaryDocDescription"),
        ("Form", "form"),
        ("Filing date", "filingDate"),
        ("Report date", "reportDate"),
    )

    parts = []
    for label, key in labelled_fields:
        value = datapoint.get(key)
        if value is None:
            continue
        # Values are stringified so that non-string fields can be embedded too.
        value = str(value).strip()
        if value:
            parts.append(f"{label}: {value}")

    return ". ".join(parts)


    
    

<class 'dict'>


In [None]:
import chromadb
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to turn each filing's text into a vector.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Directory name intended for the persisted vector store.
persist_dir = "filings_vectorstore"
# Parallel lists filled by the batching loop: one id and one metadata dict per filing.
ids = []
metadatas = []
# Columns copied into each filing's metadata. The names must match the dataset
# columns exactly ("tickers" and "filingDate", as listed by dataset.column_names);
# a misspelt name would silently yield an empty value through row.get(name, "").
# NOTE(review): if "tickers" holds a list, it may need flattening before it is
# stored as vector-store metadata -- confirm.
fields = ["tickers", "form", "filingDate", "cik", "category", "website"]
 # ^ lets create the embeddings, metadatas, ids in batches
for i in tqdm(range(0, len(final_dataset), 1000)):
    # Slicing a Dataset returns a dict that maps each column name to a list of
    # values, so iterating over the slice directly would yield column names
    # rather than rows.
    batch = final_dataset[i:i + 1000]
    # Rebuild one dict per row from the column lists.
    rows = [dict(zip(batch, values)) for values in zip(*batch.values())]
    for j, row in enumerate(rows):
        ids.append(f"id: {i + j}")
        # One metadata entry per configured field; the comprehension variable
        # is local, so no leftover global from another cell is needed.
        metadatas.append({f: row.get(f, "") for f in fields})
        embeddings = model.encode(sentences = get_text(row), normalize_embeddings=True)