In [1]:
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Settings ===
# NOTE(review): absolute, machine-specific path; kept as-is so existing runs still work.
input_dir = Path(r"D:\vc-research\reese data\downloaded_batches")

output_csv = input_dir / "combined_extracted_data.csv"

# === Collect flattened results ===
# One flat dict per successfully parsed JSONL line; turned into a DataFrame once at the end.
records = []

# === Iterate through all JSONL files ===
# Path.glob yields files in arbitrary order. Sorting makes the row order (and the
# first-seen column order of the resulting CSV) reproducible between runs, and
# materialising the list lets tqdm show a total.
jsonl_files = sorted(input_dir.glob("*.jsonl"))

for file in tqdm(jsonl_files, desc="Processing JSONL files"):
    with open(file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline) are not records; skip them
            # instead of reporting a spurious JSON error.
            if not line.strip():
                continue

            try:
                row = json.loads(line)
                # The message content is itself a JSON document stored as a string.
                # Presumably these are batch chat-completion results; confirm against the producer.
                content_str = row['response']['body']['choices'][0]['message']['content']
                content_dict = json.loads(content_str)

                flat_record = {"custom_id": row.get("custom_id", "")}

                for k, v in content_dict.items():
                    if k == "preferred_stock_terms":
                        if isinstance(v, dict):
                            # Flatten preferred_stock_terms dict into prefixed keys
                            for sub_k, sub_v in v.items():
                                flat_record[f"preferred_stock_terms_{sub_k}"] = sub_v
                        else:
                            flat_record["preferred_stock_terms"] = v
                    elif not isinstance(v, dict):
                        # Any other dict-valued top-level key is intentionally left out of the flat record.
                        flat_record[k] = v

                records.append(flat_record)

            except Exception as e:
                # Best effort: report the bad line (with enough context to find it) and keep going.
                print(f"Error in {file.name} (line {line_no}): {type(e).__name__}: {e}")

# === Save to CSV ===
df = pd.DataFrame(records)

df.to_csv(output_csv, encoding='utf-8', index=False)
print(f"Saved CSV to: {output_csv}")


Processing JSONL files: 80it [00:00, 121.99it/s]

Saved CSV to: D:\vc-research\reese data\downloaded_batches\combined_extracted_data.csv





In [2]:
import pandas as pd
import json
from pandas import json_normalize

def expand_all_preferred_stock_columns(df):
    """Expand every ``preferred_stock_terms_*`` column into presence and detail columns.

    For each column whose name starts with ``"preferred_stock_terms_"`` this adds:

    * ``<col>_present`` -- True where the cell is truthy and its string form is not ``"nan"``;
    * ``<col>_<key>`` -- one column per (possibly dotted, nested) key found by
      ``json_normalize`` in the present cells. Rows without a value get NaN.

    Cells may be dicts or strings holding either JSON or a Python dict repr.
    Cells that cannot be interpreted as a dict contribute no detail values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed, per target column in
        order, by its presence column and its detail columns.
    """
    import ast  # local import: only needed for the Python-repr fallback below

    def _is_present(value):
        # bool(NaN) is True, so NaN has to be excluded through its string form.
        return bool(value) and str(value).strip().lower() != 'nan'

    def _parse_terms(value):
        """Return ``value`` as a dict, or ``{}`` when it cannot be read as one."""
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except json.JSONDecodeError:
                try:
                    # Handles dict reprs such as "{'a': True, 'b': None}" without
                    # corrupting apostrophes inside values.
                    value = ast.literal_eval(value)
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    try:
                        # Legacy fallback: JSON written with single quotes.
                        value = json.loads(value.replace("'", '"'))
                    except json.JSONDecodeError:
                        return {}
        # json_normalize expects one mapping per row.
        return value if isinstance(value, dict) else {}

    # Step 1: Identify all matching columns (fixed up front, so columns added below are not revisited)
    target_cols = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith("preferred_stock_terms_")
    ]

    # Pieces are gathered and concatenated once at the end, so the caller's
    # frame is never written to and the frame is not re-copied per column.
    pieces = [df]

    for col in target_cols:
        # Step 2: Create presence column
        presence_col = f"{col}_present"
        print(presence_col)
        is_present = df[col].apply(_is_present).astype(bool)

        # Step 3: Safely parse the present cells into dicts
        parsed_series = df.loc[is_present, col].apply(_parse_terms)

        # Step 4: Flatten and normalize
        flat_df = json_normalize(parsed_series.tolist())
        flat_df.columns = [f"{col}_{c}" for c in flat_df.columns]
        # Re-attach the original row labels so the final concat aligns rows correctly.
        flat_df.index = parsed_series.index

        pieces.append(is_present.rename(presence_col))
        pieces.append(flat_df)

    # Step 5: Join everything back onto the original columns
    return pd.concat(pieces, axis=1)


In [3]:

# Widen the table: add presence flags and per-key detail columns for every
# preferred_stock_terms_* column. `df` and `input_dir` come from the first cell.
df = expand_all_preferred_stock_columns(df)

# Write the widened table next to the input batches (row index omitted).
output_csv = input_dir.joinpath("combined_extracted_datafuck.csv")
df.to_csv(path_or_buf=output_csv, index=False)
print(f"Saved CSV to: {output_csv}")


preferred_stock_terms_Series A Preferred Stock_present
preferred_stock_terms_Series B Preferred Stock_present
preferred_stock_terms_Series C Preferred Stock_present
preferred_stock_terms_Series A Preferred_present
preferred_stock_terms_Series A-1 Convertible Preferred Stock_present
preferred_stock_terms_Series C-1 Convertible Preferred Stock_present
preferred_stock_terms_Series D-1 Convertible Preferred Stock_present
preferred_stock_terms_Series D-2 Convertible Preferred Stock_present
preferred_stock_terms_Series A_present
preferred_stock_terms_Series B_present
preferred_stock_terms_Series C_present
preferred_stock_terms_Series D_present
preferred_stock_terms_Series X Convertible Preferred Stock_present
preferred_stock_terms_Series AA Convertible Preferred Stock_present
preferred_stock_terms_Series B Preferred_present
preferred_stock_terms_Series C Preferred_present
preferred_stock_terms_Series D Preferred_present
preferred_stock_terms_Series 1 Preferred_present
preferred_stock_terms_S