In [1]:
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Settings ===
# NOTE(review): machine-specific absolute path -- point this at the local batch folder before running.
input_dir = Path(r"D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches")

# The combined table is written into the same folder as the batch files.
output_csv = input_dir / "combined_extracted_data.csv"

# === Collect flattened results ===
# One flat dict per successfully parsed JSONL line; converted to a DataFrame in one step below.
records = []

# === Iterate through all JSONL files ===
# Materialise and sort the matches so tqdm can show a total and the row order
# of the output does not depend on directory-listing order.
jsonl_files = sorted(input_dir.glob("*.jsonl"))

for file in tqdm(jsonl_files, desc="Processing JSONL files"):
    with open(file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines are not records; skip them instead of reporting a parse error.
            if not line.strip():
                continue

            try:
                row = json.loads(line)
                # The message content is itself a JSON document stored as a string.
                content_str = row['response']['body']['choices'][0]['message']['content']
                content_dict = json.loads(content_str)

                flat_record = {"custom_id": row.get("custom_id", "")}

                for k, v in content_dict.items():
                    if k == "preferred_stock_terms":
                        if isinstance(v, dict):
                            # Flatten preferred_stock_terms dict into prefixed keys
                            for sub_k, sub_v in v.items():
                                # Normalise the series name so spelling variants share a column:
                                # drop hyphens and the (case-sensitive) words
                                # "Stock", "Preferred" and "Convertible".
                                clean_sub_k = (
                                    sub_k.replace("-", "")
                                         .replace("Stock", "")
                                         .replace("Preferred", "")
                                         .replace("Convertible", "")
                                         .strip()  # only leading/trailing whitespace is removed
                                )
                                flat_record[f"preferred_stock_terms_{clean_sub_k}"] = sub_v

                        else:
                            # Non-dict value: keep it as a single unexpanded column.
                            flat_record["preferred_stock_terms"] = v
                    elif not isinstance(v, dict):
                        # Any other dict-valued field is dropped; everything else is copied as-is.
                        flat_record[k] = v

                records.append(flat_record)

            except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                # ValueError covers json.JSONDecodeError (malformed line or content);
                # KeyError/IndexError/TypeError cover missing or null response fields;
                # AttributeError covers content that is valid JSON but not an object.
                # The bad line is reported with its position and skipped so the run continues.
                print(f"Error in {file.name} (line {line_no}): {type(e).__name__}: {e}")

# === Save to CSV ===
df = pd.DataFrame(records)

df.to_csv(output_csv, encoding='utf-8', index=False)
print(f"Saved CSV to: {output_csv}")



Processing JSONL files: 1147it [00:15, 69.55it/s]

Error in batch_430_G3_out.jsonl: Unterminated string starting at: line 51 column 38 (char 3529)


Processing JSONL files: 7853it [01:47, 73.21it/s]


Saved CSV to: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\combined_extracted_data.csv


In [4]:
import json
import pandas as pd
from pandas import json_normalize

def expand_all_preferred_stock_columns(df, verbose=True):
    """Expand every ``preferred_stock_terms_*`` column of dicts into flat columns.

    For each selected column ``C`` the result gains:

    * ``C_present`` -- boolean flag, True where the cell is truthy and is not
      the text/float ``nan``;
    * ``C_<key>`` -- one column per key found in the cell dicts (nested dicts
      are flattened by ``json_normalize``, which joins levels with ``.``).
      Rows without a usable dict are left as NaN.

    Cells may be dicts, JSON strings, or Python-repr dict strings; anything
    else that is present contributes no keys.

    The input frame is never modified. Columns produced by an earlier call
    (a column whose ``_present`` flag already exists, and every column named
    ``<that column>_...``) are skipped, so calling the function again on its
    own output returns the same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing zero or more ``preferred_stock_terms_*`` columns.
    verbose : bool, default True
        Print the name of each presence column as it is created.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns followed, per processed column, by
        its presence flag and its expanded key columns.
    """
    import ast  # local import keeps this cell runnable on its own

    prefix = "preferred_stock_terms_"
    suffix = "_present"

    existing = set(df.columns)
    # A column that already has a presence flag was expanded by a previous call.
    expanded_sources = {
        c for c in df.columns if c.startswith(prefix) and f"{c}{suffix}" in existing
    }
    # Everything named "<expanded source>_..." is output of that previous call.
    derived_prefixes = tuple(f"{c}_" for c in expanded_sources)

    target_cols = [
        c for c in df.columns
        if c.startswith(prefix)
        and c not in expanded_sources
        and not c.startswith(derived_prefixes)
    ]

    def is_present(x):
        try:
            truthy = bool(x)
        except TypeError:
            # pd.NA has no truth value; treat it as missing.
            return False
        # bool(float("nan")) is True, so NaN is filtered by its text form.
        return truthy and str(x).strip().lower() != "nan"

    def try_parse_json(x):
        if isinstance(x, dict):
            return x
        if not isinstance(x, str):
            return {}
        parsers = (
            json.loads,        # real JSON first, so apostrophes inside strings survive
            ast.literal_eval,  # Python repr of a dict (single quotes, None/True/False)
            lambda s: json.loads(s.replace("'", '"')),  # legacy quote-swap fallback
        )
        for parse in parsers:
            try:
                parsed = parse(x)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            # Lists/numbers/strings are valid JSON but carry no keys to expand.
            if isinstance(parsed, dict):
                return parsed
        return {}

    # Gather all new columns and concatenate once, instead of copying the
    # whole (growing) frame on every iteration.
    pieces = [df]

    for col in target_cols:
        presence_col = f"{col}{suffix}"
        if verbose:
            print(presence_col)

        present = df[col].apply(is_present).astype(bool).rename(presence_col)
        pieces.append(present)

        parsed_series = df.loc[present, col].apply(try_parse_json)

        # if nothing valid, skip
        if parsed_series.empty:
            continue

        flat_df = json_normalize(parsed_series.tolist())
        if flat_df.shape[1] == 0:
            # No dict had any key: nothing to add beyond the presence flag.
            continue

        flat_df.columns = [f"{col}_{c}" for c in flat_df.columns]
        # Re-attach the original row labels so concat aligns on them.
        flat_df.index = parsed_series.index
        pieces.append(flat_df)

    return pd.concat(pieces, axis=1)


In [5]:

# Widen the table: every preferred_stock_terms_* column gets its expanded columns.
# NOTE: `df` is rebound here, so re-running this cell passes the already
# expanded frame back into the function.
df = expand_all_preferred_stock_columns(df)

# Write the expanded table into the batch folder under a separate file name.
output_csv = input_dir.joinpath("combined_extracted_dataAICoded.csv")
df.to_csv(path_or_buf=output_csv, index=False)
print("Saved CSV to: " + str(output_csv))


preferred_stock_terms_Series 1A_present
preferred_stock_terms_Series 1B_present
preferred_stock_terms_Series 2A_present
preferred_stock_terms_Series 2B_present
preferred_stock_terms_Series 3_present
preferred_stock_terms_Series 4_present
preferred_stock_terms_Series A_present
preferred_stock_terms_Series B_present
preferred_stock_terms_Series C_present
preferred_stock_terms_Series 1_present
preferred_stock_terms_Series 2_present
preferred_stock_terms_Series D_present
preferred_stock_terms_Series A1_present
preferred_stock_terms_Series A2_present
preferred_stock_terms_Series A3_present
preferred_stock_terms_Series C1_present
preferred_stock_terms_Series AA_present
preferred_stock_terms_Series A1A_present
preferred_stock_terms_Series 1 Senior_present
preferred_stock_terms_Series B2_present
preferred_stock_terms_Series B1_present
preferred_stock_terms_Series B3_present
preferred_stock_terms_Series CPrime_present
preferred_stock_terms_Series AA1_present
preferred_stock_terms_Series AA2_pre