In [None]:
import pandas as pd
import json
import gcsfs
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
import os
import warnings

In [None]:
# Point the Google client libraries at the service-account key file.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="/mnt/disks/data/diss_bucket_key.json")
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# ==========================
# STEP 2: Set GCS Path
# ==========================
# Bucket name and folder prefix for the 10-Q JSON files.
BUCKET_NAME = "diss_market_data"
FOLDER_PATH = "EXTRACTED_FILINGS/10-Q/"
# Bucket-qualified path of the form "<bucket>/<folder>".
GCS_PATH = "/".join((BUCKET_NAME, FOLDER_PATH))

In [None]:
# ==========================
# STEP 3: Authenticate (if needed)
# ==========================
# The interactive sign-in only exists inside Google Colab. Anywhere else the
# `google.colab` package is not importable, so skip the step instead of
# aborting the whole notebook run.
try:
    from google.colab import auth
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    auth.authenticate_user()

In [None]:
# ==========================
# STEP 4: Read Files from GCS
# ==========================
# Placeholder value: replace with your GCP project ID before running.
GCP_PROJECT_ID = 'your-gcp-project-id'
fs = gcsfs.GCSFileSystem(project=GCP_PROJECT_ID)

In [None]:
# Enumerate everything under the 10-Q folder, then keep only the *.json entries.
file_list = fs.ls(GCS_PATH)
json_files = list(filter(lambda path: path.endswith('.json'), file_list))

In [None]:
# ==========================
# STEP 5: Parallel File Reading Function
# ==========================
def load_json_file(file_path):
    """Read one JSON document through the module-level ``fs`` and parse it.

    Args:
        file_path: Path of the file, passed unchanged to ``fs.open``.

    Returns:
        The decoded JSON value, or ``None`` when opening or parsing fails.
        Failures are printed rather than raised, so a single bad file does
        not abort a bulk load.
    """
    try:
        # Pin UTF-8 explicitly; otherwise text mode falls back to the
        # locale-dependent default encoding.
        with fs.open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Failed to load {file_path}: {e}")
        return None

In [None]:
# Fetch the 10-Q files on a 20-thread pool; failed loads come back as None
# and are left out of the collected rows.
with ThreadPoolExecutor(max_workers=20) as executor:
    loaded = tqdm(
        executor.map(load_json_file, json_files),
        total=len(json_files),
        desc="Reading 10-Q files",
    )
    data_rows = [doc for doc in loaded if doc is not None]

# One DataFrame row per successfully loaded document.
df = pd.DataFrame(data_rows)

In [None]:
# ==========================
# STEP 6: Show & Save
# ==========================
# Report the row count, then leave it as the cell's displayed value.
n_docs = len(df)
print(f"\nLoaded {n_docs} documents from 10-Q folder.")
n_docs

In [None]:
# Optional: write the combined 10-Q frame to a CSV in the working directory,
# without the index column.
CSV_PATH_10Q = "10Q_combined_dataset.csv"
df.to_csv(CSV_PATH_10Q, index=False)

## 8-K

In [None]:
# Folder prefix of the 8-K JSON files and its bucket-qualified path.
FOLDER_PATH_8k = "EXTRACTED_FILINGS/8-K/"
GCS_PATH_8k = "/".join((BUCKET_NAME, FOLDER_PATH_8k))

In [None]:
# Enumerate everything under the 8-K folder, then keep only the *.json entries.
file_list_8k = fs.ls(GCS_PATH_8k)
json_files_8k = list(filter(lambda path: path.endswith('.json'), file_list_8k))

In [None]:
# Fetch the 8-K files on a 10-thread pool; failed loads come back as None
# and are left out of the collected rows.
with ThreadPoolExecutor(max_workers=10) as executor:
    loaded_8k = tqdm(
        executor.map(load_json_file, json_files_8k),
        total=len(json_files_8k),
        desc="Reading 8-K files",
    )
    data_rows_8k = [doc for doc in loaded_8k if doc is not None]

# One DataFrame row per successfully loaded document.
df_8k = pd.DataFrame(data_rows_8k)

In [None]:
# ==========================
# STEP 6: Show & Save
# ==========================
print(f"\nLoaded {len(df_8k)} documents from 8-K folder.")

# Optional: Save to CSV
df_8k.to_csv("8K_combined_dataset.csv", index=False)

# The preview goes last: a notebook cell renders only its final expression,
# so a bare `.head()` earlier in the cell is silently discarded.
df_8k.head()

In [None]:
# Shell command: copy the local 10-Q CSV to the root of the diss_market_data bucket.
!gsutil -m cp '10Q_combined_dataset.csv' 'gs://diss_market_data/10Q_combined_dataset.csv'

In [None]:
# Shell command: copy the local 8-K CSV to the root of the diss_market_data bucket.
!gsutil -m cp '8K_combined_dataset.csv' 'gs://diss_market_data/8K_combined_dataset.csv'

## 10-K

In [None]:
import pandas as pd
import json
import gcsfs
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:
# ==========================
# STEP 2: Set GCS Path
# ==========================
# Bucket name and folder prefix for the 10-K JSON files.
BUCKET_NAME = "diss_market_data"
FOLDER_PATH = "EXTRACTED_FILINGS/10-K/"
# Bucket-qualified path of the form "<bucket>/<folder>".
GCS_PATH = "/".join((BUCKET_NAME, FOLDER_PATH))

In [None]:
# ==========================
# STEP 4: Read Files from GCS
# ==========================
# Placeholder value: replace with your GCP project ID before running.
GCP_PROJECT_ID = 'your-gcp-project-id'
fs = gcsfs.GCSFileSystem(project=GCP_PROJECT_ID)

In [None]:
# Enumerate everything under the 10-K folder, then keep only the *.json entries.
file_list = fs.ls(GCS_PATH)
json_files = list(filter(lambda path: path.endswith('.json'), file_list))

In [None]:
# ==========================
# STEP 5: Parallel File Reading Function
# ==========================
def load_json_file(file_path):
    """Read one JSON document through the module-level ``fs`` and parse it.

    Args:
        file_path: Path of the file, passed unchanged to ``fs.open``.

    Returns:
        The decoded JSON value, or ``None`` when opening or parsing fails.
        Failures are printed rather than raised, so a single bad file does
        not abort a bulk load.
    """
    try:
        # Pin UTF-8 explicitly; otherwise text mode falls back to the
        # locale-dependent default encoding.
        with fs.open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Failed to load {file_path}: {e}")
        return None

In [None]:
# Rows collected from the 10-K files; failed loads (None) are skipped.
data_rows = []

with ThreadPoolExecutor(max_workers=20) as executor:
    # The progress label names the 10-K folder that is actually being read.
    for result in tqdm(executor.map(load_json_file, json_files), total=len(json_files), desc="Reading 10-K files"):
        if result is not None:
            data_rows.append(result)

# One DataFrame row per successfully loaded document.
df = pd.DataFrame(data_rows)

In [None]:
# ==========================
# STEP 6: Show & Save
# ==========================
# The message names the 10-K folder, which is what `df` holds in this section.
print(f"\nLoaded {len(df)} documents from 10-K folder.")
len(df)

In [None]:
# Optional: Save to CSV
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs("Data", exist_ok=True)
df.to_csv("Data/10K_combined_dataset.csv", index=False)

In [None]:
# Read the saved 10-K CSV back from disk.
CSV_PATH_10K = "Data/10K_combined_dataset.csv"
load_df = pd.read_csv(CSV_PATH_10K)

In [None]:
# Preview the first five rows of the reloaded frame.
load_df.head(n=5)

In [None]:
# Show the column labels of the reloaded frame.
load_df.columns

In [None]:
# Shell command: copy the local 10-K CSV to the root of the diss_market_data bucket.
!gsutil -m cp 'Data/10K_combined_dataset.csv' 'gs://diss_market_data/10K_combined_dataset.csv'