Notebook Step 3 - Save the use cases to JSON

In [11]:
import pandas as pd
import json
import re
import os

# Location of the workbook with the GPT-formatted use cases, resolved relative
# to the parent of the current working directory.
file_path = os.path.join(
    "..",
    "datasets/Use Cases/3. All GPT Formatted Final Use Cases.xlsx",
)
df = pd.read_excel(file_path)

# Columns are addressed by position, not by header text:
# the first column (A) supplies the company name, the fifth (E) the use case text.
company_name_col, use_case_text_col = df.columns[0], df.columns[4]

# Pattern for one entry of the form
#   "AI Use Case: <name> Use Case Description: <description>".
# re.DOTALL lets both captures span line breaks; the lookahead ends a description
# at the next "AI Use Case:" marker or at the end of the cell text.
# Compiled once here instead of being re-specified on every row.
USE_CASE_PATTERN = re.compile(
    r"AI Use Case:\s*(.*?)\s*Use Case Description:\s*(.*?)(?=AI Use Case:|$)",
    re.DOTALL,
)

# Initialize containers
companies = []         # one {"company_name": ..., "use_cases": [...]} dict per parsed row
skipped_empty = []     # (excel_row, company_name) for rows whose use case cell is blank
skipped_no_match = []  # (excel_row, company_name) for rows the pattern did not match

# Iterate over rows
for idx, row in df.iterrows():
    # Spreadsheet rows are 1-based and row 1 is the header, hence +2.
    # Assumes df still carries the default 0-based index from read_excel.
    excel_row = idx + 2

    # A blank name cell is read as NaN, and str(NaN) is the literal "nan";
    # use an empty name rather than writing that bogus value into the JSON.
    raw_company_name = row[company_name_col]
    company_name = "" if pd.isna(raw_company_name) else str(raw_company_name).strip()
    use_case_text = row[use_case_text_col]

    # Skip if use case cell is empty or NaN
    if pd.isna(use_case_text):
        skipped_empty.append((excel_row, company_name))
        continue
    use_case_text = str(use_case_text)
    if not use_case_text.strip():
        skipped_empty.append((excel_row, company_name))
        continue

    # Extract (name, description) pairs; an empty list means the cell text
    # does not follow the expected "AI Use Case / Use Case Description" layout.
    use_cases = USE_CASE_PATTERN.findall(use_case_text)

    if not use_cases:
        skipped_no_match.append((excel_row, company_name))
        continue

    # Build structured company data
    company_data = {
        "company_name": company_name,
        "use_cases": [
            {
                "use_case_name": name.strip(),
                "use_case_description": description.strip()
            }
            for name, description in use_cases
        ]
    }

    companies.append(company_data)

# Save to JSON
# Single source of truth for the output file, so the file that is written and
# the name reported in the success message cannot drift apart.
OUTPUT_PATH = "use_cases.json"

# ensure_ascii=False keeps non-ASCII characters readable in the file, which is
# why the file is opened explicitly as UTF-8.
with open(OUTPUT_PATH, "w", encoding="utf-8") as json_file:
    json.dump({"companies": companies}, json_file, indent=4, ensure_ascii=False)

# Run summary: how many rows were parsed and how many were skipped, by reason.
print(f"\nJSON file '{OUTPUT_PATH}' created successfully.")
print(f"Extracted use cases for {len(companies)} companies.")
print(f"Skipped {len(skipped_empty)} rows due to empty use case column.")
print(f"Skipped {len(skipped_no_match)} rows due to unrecognized formatting.\n")

# List every skipped row (spreadsheet row number, reason, company) so the
# source workbook can be fixed; empty-cell rows come first, then format mismatches.
skip_report = [
    f"Row {row_num} (EMPTY) - {company}"
    for row_num, company in skipped_empty
]
skip_report += [
    f"Row {row_num} (FORMAT MISMATCH) - {company}"
    for row_num, company in skipped_no_match
]

# Nothing is printed, not even the heading, when no rows were skipped.
if skip_report:
    print("Skipped Rows:")
    print("\n".join(skip_report))


JSON file 'use_cases.json' created successfully.
Extracted use cases for 643 companies.
Skipped 0 rows due to empty use case column.
Skipped 0 rows due to unrecognized formatting.

