In [5]:
import pandas as pd
import json
import gcsfs
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
import os
import warnings

In [6]:
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Point Google client libraries at the service-account key file on the data disk.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS="/mnt/disks/data/diss_bucket_key.json",
)

In [None]:
# ==========================
# STEP 2: Set GCS Path
# ==========================
# Bucket holding the extracted filings, and the sub-folder with the 10-Q JSON files.
BUCKET_NAME = "diss_market_data"
FOLDER_PATH = "EXTRACTED_FILINGS/10-Q/"

# Bucket-qualified prefix ("<bucket>/<folder>/") used when listing objects.
GCS_PATH = BUCKET_NAME + "/" + FOLDER_PATH

In [None]:
# ==========================
# STEP 3: Authenticate (if needed)
# ==========================
# Interactive Colab sign-in is only possible (and only needed) when running on
# Google Colab. Everywhere else the import does not exist, so skip it instead
# of failing the cell; credentials then come from GOOGLE_APPLICATION_CREDENTIALS.
try:
    from google.colab import auth
except ImportError:
    auth = None

if auth is not None:
    auth.authenticate_user()
else:
    print("google.colab not available; skipping Colab authentication.")

In [None]:
# ==========================
# STEP 4: Read Files from GCS
# ==========================
# NOTE(review): the project id below is still the template placeholder.
GCP_PROJECT_ID = 'your-gcp-project-id'  # Replace with your GCP project ID

# Filesystem handle shared by every listing/reading cell that follows.
fs = gcsfs.GCSFileSystem(project=GCP_PROJECT_ID)

In [None]:
# Enumerate the 10-Q folder and keep only the objects whose name ends in ".json".
file_list = fs.ls(GCS_PATH)
json_files = list(filter(lambda path: path.endswith('.json'), file_list))

In [None]:
# ==========================
# STEP 5: Parallel File Reading Function
# ==========================
def load_json_file(file_path):
    """Read one JSON document from GCS and return it parsed.

    Uses the module-level ``fs`` filesystem handle.

    Args:
        file_path: Bucket-qualified object path (e.g. an entry returned by ``fs.ls``).

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened or
        parsed. Failures are printed rather than raised so that one bad file
        does not abort a bulk load.
    """
    try:
        # Read raw bytes: json.load then detects UTF-8/16/32 (including a BOM)
        # itself instead of depending on the machine's locale default encoding.
        with fs.open(file_path, 'rb') as f:
            return json.load(f)
    except Exception as e:
        print(f"Failed to load {file_path}: {e}")
        return None

In [None]:
# Fetch and parse the 10-Q files on 20 worker threads. Files that failed to
# load come back as None from load_json_file and are dropped here.
with ThreadPoolExecutor(max_workers=20) as pool:
    loaded_docs = tqdm(
        pool.map(load_json_file, json_files),
        total=len(json_files),
        desc="Reading 10-Q files",
    )
    data_rows = [doc for doc in loaded_docs if doc is not None]

# One row per successfully loaded document.
df = pd.DataFrame(data_rows)

Reading 10-Q files:   0%|          | 0/7235 [00:00<?, ?it/s]

In [None]:
# ==========================
# STEP 6: Show & Save
# ==========================
# Report how many documents made it into the frame, then echo the count as the cell result.
print("\nLoaded {} documents from 10-Q folder.".format(len(df)))
len(df)


Loaded 7235 documents from 10-Q folder.


7235

In [None]:
# Optional: write the combined 10-Q frame to a local CSV (row index omitted).
OUTPUT_CSV_10Q = "10Q_combined_dataset.csv"
df.to_csv(OUTPUT_CSV_10Q, index=False)

## 8-K

In [None]:
# Sub-folder with the extracted 8-K JSON files, and its bucket-qualified prefix.
FOLDER_PATH_8k = "EXTRACTED_FILINGS/8-K/"
GCS_PATH_8k = "/".join([BUCKET_NAME, FOLDER_PATH_8k])

In [None]:
# Enumerate the 8-K folder and keep only the objects whose name ends in ".json".
file_list_8k = fs.ls(GCS_PATH_8k)
json_files_8k = list(filter(lambda path: path.endswith('.json'), file_list_8k))

In [None]:
# Fetch and parse the 8-K files on 10 worker threads. Files that failed to
# load come back as None from load_json_file and are dropped here.
with ThreadPoolExecutor(max_workers=10) as pool:
    loaded_docs_8k = tqdm(
        pool.map(load_json_file, json_files_8k),
        total=len(json_files_8k),
        desc="Reading 8-K files",
    )
    data_rows_8k = [doc for doc in loaded_docs_8k if doc is not None]

# One row per successfully loaded document.
df_8k = pd.DataFrame(data_rows_8k)

Reading 8-K files:   0%|          | 0/39121 [00:00<?, ?it/s]

In [None]:
# ==========================
# STEP 6: Show & Save
# ==========================
# Report how many 8-K documents were loaded, then preview the first rows.
print("\nLoaded {} documents from 8-K folder.".format(len(df_8k)))
df_8k.head()


Loaded 39121 documents from 8-K folder.


Unnamed: 0,cik,company,filing_type,filing_date,period_of_report,sic,state_of_inc,state_location,fiscal_year_end,filing_html_index,...,item_5.07,item_5.08,item_6.01,item_6.02,item_6.03,item_6.04,item_6.05,item_7.01,item_8.01,item_9.01
0,1000045,NICHOLAS FINANCIAL INC,8-K,2007-01-29,2007-01-29,6153,FL,FL,331,https://www.sec.gov/Archives/edgar/data/100004...,...,,,,,,,,,,Item 9.01 Financial Statements and Exhibits\nE...
1,1000180,SANDISK CORP,8-K,2007-02-21,2007-02-15,3572,DE,CA,1231,https://www.sec.gov/Archives/edgar/data/100018...,...,,,,,,,,,,
2,1000180,SANDISK CORP,8-K,2007-01-30,2007-01-30,3572,DE,CA,1231,https://www.sec.gov/Archives/edgar/data/100018...,...,,,,,,,,,,Item 9.01 Financial Statements and Exhibits\n(...
3,1000209,MEDALLION FINANCIAL CORP,8-K,2007-01-17,2007-01-16,6199,DE,NY,1231,https://www.sec.gov/Archives/edgar/data/100020...,...,,,,,,,,,,ITEM 9.01. FINANCIAL STATEMENTS AND EXHIBITS.\...
4,1000209,MEDALLION FINANCIAL CORP,8-K,2007-03-19,2007-03-19,6199,DE,NY,1231,https://www.sec.gov/Archives/edgar/data/100020...,...,,,,,,,,,ITEM 8.01\nOTHER EVENTS\nThe Company has adopt...,ITEM 9.01\nFINANCIAL STATEMENTS AND EXHIBITS.\...


# Optional: write the combined 8-K frame to a local CSV (row index omitted).
OUTPUT_CSV_8K = "8K_combined_dataset.csv"
df_8k.to_csv(OUTPUT_CSV_8K, index=False)

In [None]:
# Upload the local 10-Q CSV to the root of the diss_market_data bucket
# (shell command run through IPython's `!`; -m is gsutil's parallel-operation flag).
!gsutil -m cp '10Q_combined_dataset.csv' 'gs://diss_market_data/10Q_combined_dataset.csv'

Copying file://10Q_combined_dataset.csv [Content-Type=text/csv]...
/ [0/1 files][    0.0 B/  1.8 GiB]   0% Done                                    ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Resuming upload for file://10Q_combined_dataset.csv
-
Operation completed over 1 objects/1.8 GiB.                                      


In [None]:
# Upload the local 8-K CSV to the root of the diss_market_data bucket
# (shell command run through IPython's `!`; -m is gsutil's parallel-operation flag).
!gsutil -m cp '8K_combined_dataset.csv' 'gs://diss_market_data/8K_combined_dataset.csv'

Copying file://8K_combined_dataset.csv [Content-Type=text/csv]...
| [1/1 files][108.6 MiB/108.6 MiB] 100% Done                                    
Operation completed over 1 objects/108.6 MiB.                                    


## 10-K

In [1]:
import pandas as pd
import json
import gcsfs
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor

In [7]:
# ==========================
# STEP 2: Set GCS Path
# ==========================
# Bucket holding the extracted filings, and the sub-folder with the 10-K JSON files.
BUCKET_NAME = "diss_market_data"
FOLDER_PATH = "EXTRACTED_FILINGS/10-K/"

# Bucket-qualified prefix ("<bucket>/<folder>/") used when listing objects.
GCS_PATH = BUCKET_NAME + "/" + FOLDER_PATH

In [8]:
# ==========================
# STEP 4: Read Files from GCS
# ==========================
# NOTE(review): the project id below is still the template placeholder.
GCP_PROJECT_ID = 'your-gcp-project-id'  # Replace with your GCP project ID

# Filesystem handle shared by every listing/reading cell that follows.
fs = gcsfs.GCSFileSystem(project=GCP_PROJECT_ID)

In [9]:
# Enumerate the 10-K folder and keep only the objects whose name ends in ".json".
file_list = fs.ls(GCS_PATH)
json_files = list(filter(lambda path: path.endswith('.json'), file_list))

In [10]:
# ==========================
# STEP 5: Parallel File Reading Function
# ==========================
def load_json_file(file_path):
    """Read one JSON document from GCS and return it parsed.

    Uses the module-level ``fs`` filesystem handle.

    Args:
        file_path: Bucket-qualified object path (e.g. an entry returned by ``fs.ls``).

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened or
        parsed. Failures are printed rather than raised so that one bad file
        does not abort a bulk load.
    """
    try:
        # Read raw bytes: json.load then detects UTF-8/16/32 (including a BOM)
        # itself instead of depending on the machine's locale default encoding.
        with fs.open(file_path, 'rb') as f:
            return json.load(f)
    except Exception as e:
        print(f"Failed to load {file_path}: {e}")
        return None

In [11]:
# Fetch and parse the 10-K files on 20 worker threads. load_json_file returns
# None for a file it could not read, so those entries are skipped.
data_rows = []

with ThreadPoolExecutor(max_workers=20) as executor:
    # The progress label names the 10-K folder that GCS_PATH points at in this section.
    for result in tqdm(executor.map(load_json_file, json_files), total=len(json_files), desc="Reading 10-K files"):
        if result is not None:
            data_rows.append(result)

# One row per successfully loaded document.
df = pd.DataFrame(data_rows)

Reading 10-Q files:   0%|          | 0/2341 [00:00<?, ?it/s]

In [12]:
# ==========================
# STEP 6: Show & Save
# ==========================
# Report how many 10-K documents were loaded (this section reads the 10-K
# folder), then echo the count as the cell result.
print(f"\nLoaded {len(df)} documents from 10-K folder.")
len(df)


Loaded 2341 documents from 10-Q folder.


2341

In [13]:
# Optional: write the combined 10-K frame to a local CSV (row index omitted).
# The Data/ directory must already exist.
OUTPUT_CSV_10K = "Data/10K_combined_dataset.csv"
df.to_csv(OUTPUT_CSV_10K, index=False)

In [14]:
# Read the saved 10-K CSV back into a separate frame.
load_df = pd.read_csv(filepath_or_buffer="Data/10K_combined_dataset.csv")

In [15]:
# Preview the first five rows of the reloaded frame.
load_df.head(n=5)

Unnamed: 0,cik,company,filing_type,filing_date,period_of_report,sic,state_of_inc,state_location,fiscal_year_end,filing_html_index,...,item_9A,item_9B,item_9C,item_10,item_11,item_12,item_13,item_14,item_15,item_16
0,1000697,WATERS CORP /DE/,10-K,2007-03-01,2006-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9a:\nControls and Procedures\n(a)\nEvalua...,Item 9b:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,Item 15:\nExhibits and Financial Statement Sch...,
1,1000697,WATERS CORP /DE/,10-K,2008-02-29,2007-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,Item 15:\nExhibits and Financial Statement Sch...,
2,1000697,WATERS CORP /DE/,10-K,2009-02-27,2008-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,Item 15:\nExhibits and Financial Statement Sch...,
3,1000697,WATERS CORP /DE/,10-K,2010-02-26,2009-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,"Item 15:\nExhibits, Financial Statement Schedu...",
4,1000697,WATERS CORP /DE/,10-K,2011-02-25,2010-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,"Item 15:\nExhibits, Financial Statement Schedu...",


In [17]:
# Show the column labels of the reloaded frame.
load_df.columns

Index(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report',
       'sic', 'state_of_inc', 'state_location', 'fiscal_year_end',
       'filing_html_index', 'htm_filing_link', 'complete_text_filing_link',
       'filename', 'item_1', 'item_1A', 'item_1B', 'item_1C', 'item_2',
       'item_3', 'item_4', 'item_5', 'item_6', 'item_7', 'item_7A', 'item_8',
       'item_9', 'item_9A', 'item_9B', 'item_9C', 'item_10', 'item_11',
       'item_12', 'item_13', 'item_14', 'item_15', 'item_16'],
      dtype='object')

In [16]:
# Upload the local 10-K CSV to the root of the diss_market_data bucket
# (shell command run through IPython's `!`; -m is gsutil's parallel-operation flag).
# NOTE(review): the recorded output below shows this run failed with
# "403 Provided scope(s) are not authorized", so the upload did not complete.
!gsutil -m cp 'Data/10K_combined_dataset.csv' 'gs://diss_market_data/10K_combined_dataset.csv'

Copying file://Data/10K_combined_dataset.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

ResumableUploadAbortException: 403 Provided scope(s) are not authorized
CommandException: 1 file/object could not be transferred.
