#### Data Pre-processing

Load raw trial data

In [14]:
import pandas as pd 

# Load the raw trial export; the following cells refer to this frame as `data`.
data = pd.read_csv("data/raw_trials.csv")

In [15]:
# List the column names available in the raw export.
print(data.columns)

Index(['title', 'objective', 'outcome_details', 'phase',
       'primary_completion_date', 'primary_endpoints_reported_date',
       'prior_concurrent_therapy', 'start_date', 'study_design',
       'treatment_plan', 'record_type', 'patients_per_site_per_month',
       'primary_endpoint_json', 'other_endpoint_json', 'associated_cro_json',
       'notes_json', 'outcomes_json', 'patient_dispositions_json',
       'results_json', 'study_keywords_json', 'tags_json',
       'primary_drugs_tested_json', 'other_drugs_tested_json',
       'therapeutic_areas_json', 'bmt_other_drugs_tested_json',
       'bmt_primary_drugs_tested_json', 'ct_gov_listed_locations_json',
       'ct_gov_mesh_terms_json'],
      dtype='object')


In [10]:
# Per-column count of missing values, rendered as a markdown table,
# followed by the frame's shape.
print(data.isna().sum().to_markdown())
print("Shape:", data.shape)

|                                 |   0 |
|:--------------------------------|----:|
| title                           |   0 |
| objective                       |   3 |
| outcome_details                 | 146 |
| phase                           |   0 |
| primary_completion_date         |  61 |
| primary_endpoints_reported_date | 161 |
| prior_concurrent_therapy        | 184 |
| start_date                      |  45 |
| study_design                    |  16 |
| treatment_plan                  |   1 |
| record_type                     |   0 |
| patients_per_site_per_month     | 119 |
| primary_endpoint_json           |   0 |
| other_endpoint_json             |   0 |
| associated_cro_json             |   0 |
| notes_json                      |   0 |
| outcomes_json                   |   0 |
| patient_dispositions_json       |   0 |
| results_json                    |   0 |
| study_keywords_json             |   0 |
| tags_json                       |   0 |
| primary_drugs_tested_json       

Generate a unique hash per trial, since the raw data has no trial ID
- e.g. "tid_0e8fa21079f928135dfc6164a15285f8"

In [11]:
import hashlib
import json
from pathlib import Path

OUTPUT_PATH = Path("cache/raw_trials_with_hash.csv")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)


def make_trial_hash(row):
    """
    Deterministic hash for a trial based on stable fields.
    You can add/remove fields if needed.

    Args:
        row: A mapping-like row (e.g. one DataFrame row) exposing ``.get``.

    Returns:
        "tid_" followed by the MD5 hex digest of the JSON-serialised
        title / start_date / phase payload.

    Note:
        MD5 is used only as a fingerprint here, not for security. Changing
        the field list changes every hash, so anything already stored under
        the old hashes will no longer match.
    """
    payload = {
        "title": row.get("title", ""),
        "start_date": row.get("start_date", ""),
        "phase": row.get("phase", ""),
    }
    raw = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    return "tid_" + hashlib.md5(raw.encode("utf-8")).hexdigest()


# ---------------------------------------------------------
# If file already exists → skip generation
# ---------------------------------------------------------
if OUTPUT_PATH.exists():
    print(f"⚠️ {OUTPUT_PATH} already exists — skipping hash generation.")
else:
    print("Generating raw_trials_with_hash.csv ...")

    # Compute the hashes first so `data` is left untouched if validation fails.
    trial_hashes = data.apply(make_trial_hash, axis=1)

    # The hash is used as the trial's identifier, so two rows sharing the same
    # title / start_date / phase would collide. Fail loudly instead of writing
    # a file with ambiguous IDs.
    duplicated_hashes = trial_hashes[trial_hashes.duplicated(keep=False)]
    if not duplicated_hashes.empty:
        raise ValueError(
            f"trial_hash is not unique: {duplicated_hashes.nunique()} hash value(s) "
            f"are shared by {len(duplicated_hashes)} rows "
            f"(row labels: {duplicated_hashes.index.tolist()[:20]}). "
            "Add more fields to make_trial_hash or de-duplicate the raw data."
        )

    # Create trial_hash column
    data["trial_hash"] = trial_hashes

    # Move trial_hash to first column
    cols = ["trial_hash"] + [c for c in data.columns if c != "trial_hash"]
    data = data[cols]

    print(data.columns)
    print(data.shape)

    # Export
    data.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Saved to {OUTPUT_PATH}")

⚠️ cache/raw_trials_with_hash.csv already exists — skipping hash generation.


#### Task 1

Using a chatbot, identify all interventions from each trial. For each intervention...
- label as the investigational product, active comparator, or placebo
- list all of the alternative names
- identify the molecular target 
- identify the mechanism of action

In [16]:
import re
import json
import time
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from services.openai_wrapper import OpenAIWrapper

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
# Root folder for every cached artefact produced by this cell.
BASE_DIR = Path("cache")

# Input: raw trials with the generated trial_hash column.
TRIALS_WITH_HASH_CSV = BASE_DIR / "raw_trials_with_hash.csv"

# One JSON file per trial with the parsed drug-role mapping.
DRUG_ROLE_DIR = BASE_DIR / "trial_drug_roles"
DRUG_ROLE_DIR.mkdir(parents=True, exist_ok=True)

# One JSON file per trial with prompt, raw response, cost and timing.
DRUG_ROLE_LOG_DIR = BASE_DIR / "trial_drug_roles_log"
DRUG_ROLE_LOG_DIR.mkdir(parents=True, exist_ok=True)

# Single JSON file aggregating the per-trial mappings.
MASTER_ROLES_PATH = BASE_DIR / "trial_drug_roles_master.json"

MODEL = "gpt-5"
client = OpenAIWrapper()

# Number of trials processed in parallel by the thread pool.
MAX_WORKERS = 8

# Columns to feed into the chatbot.
# `study_design` and `treatment_plan` are included because the prompt tells
# the model it is given "Study design and treatment plan".
RELEVANT_COLS = [
    "title",
    "objective",
    "study_design",
    "treatment_plan",
    "outcome_details",
    "notes_json",
    "results_json",
    "primary_drugs_tested_json",
    "other_drugs_tested_json",
    "therapeutic_areas_json",
    "bmt_other_drugs_tested_json",
    "bmt_primary_drugs_tested_json",
    "ct_gov_mesh_terms_json",
]

# -------------------------------------------------
# Helpers
# -------------------------------------------------
def extract_json_object(text: str) -> dict:
    """
    Extract the first valid JSON object from model output.

    The whole (stripped) text is tried first. If that does not yield a dict,
    the text is scanned left to right and a JSON value is decoded starting at
    each candidate ``{``; the first one that decodes wins. This copes with
    prose or code fences around the JSON, brace placeholders before it, and
    additional objects or stray braces after it.

    After a failed decode the scan resumes at the position where the decoder
    gave up, not at the next ``{``. A truncated outer object therefore yields
    {} rather than one of its nested objects.

    Args:
        text: Raw model output. Anything that is not a string gives {}.

    Returns:
        The decoded dict, or {} when no JSON object can be found.
    """
    if not isinstance(text, str):
        return {}
    text = text.strip()
    if not text:
        return {}

    # Direct parse first
    try:
        obj = json.loads(text)
        if isinstance(obj, dict):
            return obj
    except (ValueError, RecursionError):
        pass

    # Fallback: decode starting at each candidate opening brace
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError as err:
            # Skip the part the decoder already consumed before failing.
            resume = max(err.pos, start + 1)
        except RecursionError:
            resume = start + 1
        else:
            if isinstance(obj, dict):
                return obj
            resume = start + 1
        start = text.find("{", resume)

    return {}


def build_prompt(trial_payload: dict) -> str:
    """
    Build prompt asking the LLM to:
    - Extract drug names
    - Canonicalize names by removing company/manufacturer/location qualifiers
    - Deduplicate synonymous names
    - For each canonical drug, return a dict with:
        * role (Investigational Product / Placebo / Active Comparator / Standard of Care)
        * alternative_names (list)
        * molecular_target
        * mechanism

    Args:
        trial_payload: Trial fields to show the model. It is serialised with
            ``json.dumps`` (indent=2, non-ASCII kept) and embedded in the
            prompt under "Input JSON:".

    Returns:
        The full prompt text with leading/trailing whitespace stripped.
    """
    # NOTE(review): the prompt text below tells the model it receives
    # "Study design and treatment plan" -- confirm the payload passed in by the
    # caller actually carries `study_design` / `treatment_plan`.
    payload_json = json.dumps(trial_payload, ensure_ascii=False, indent=2)

    # Doubled braces ({{ }}) in the example output are literal braces in the f-string.
    return f"""
You are a clinical trial design and interpretation expert.

You are given structured information about a clinical trial, including:
- Title and objective
- Study design and treatment plan
- JSON fields listing drugs tested in the study:
  - primary_drugs_tested_json
  - other_drugs_tested_json
  - bmt_other_drugs_tested_json
  - bmt_primary_drugs_tested_json
- These JSON fields may also contain metadata such as
  drugApprovalStatus (Approved / Unapproved), mechanisms, etc.

Your tasks:

1. Identify all DISTINCT physical drug entities explicitly used in the study.
   - Strings in the *_drugs_tested_json fields are drug-name candidates.
   - If these fields contain structured JSON, infer names from keys such as
     "name", "drug_name", "preferred_name", "label", etc.

2. Canonicalize each drug name:
   Remove company names, manufacturer qualifiers, geographic qualifiers,
   dosage-form qualifiers, or parenthetical descriptors that do NOT change
   the name of the underlying drug.
   Examples of correct canonicalization:
   - "AlphaBlocker (CompanyX)" → "AlphaBlocker"
   - "Recombinant Growth Factor (rgf)" → "Recombinant Growth Factor"
   - "DrugX citrate (RegionY)" → "DrugX citrate"
   - "BrandName (compound-42, MakerCorp)" → "BrandName"

   Keep only the essential drug or brand name as the canonical key.

3. Deduplicate synonymous names referring to the SAME drug.
   - If multiple variations refer to one physical drug, keep ONE canonical key.
   - Prefer the simplest, clean name.
   - Collect all other variations in alternative_names.

4. For EACH distinct drug, build an object with FOUR fields:

   - "role": one of:
       * "Investigational Product"
       * "Placebo"
       * "Active Comparator"
       * "Standard of Care"

   ROLE ASSIGNMENT GUIDANCE:

   A. "Investigational Product"
      - Use ONLY for the sponsor's proprietary or novel product.
      - Clues: unapproved, new mechanism, highlighted in title/objective.
      - Do NOT label common chemotherapy or widely used drugs this way.

   B. "Standard of Care"
      - Use for established backbone therapies, such as common chemotherapies
        or widely used drugs in the disease area.
      - Examples (fictional): DrugX, Chemo-A, Cytotoxin-7, etc.

   C. "Active Comparator"
      - Use when a non-placebo drug is explicitly the control arm.
      - Clues: terms like "versus", "comparator", "control regimen".

   D. "Placebo"
      - Use for inert or sham treatments.

   SUMMARY:
   - Proprietary or novel study drug → "Investigational Product".
   - Classical or widely used therapy → "Standard of Care".
   - Control regimen (non-placebo) → "Active Comparator".
   - Inert control → "Placebo".

   - "alternative_names": list of synonymous or variant names.
     Examples:
     * ABC-123 → ["Compound-ABC", "ABC123"]
     * BrandX → ["generic compound name"]

   - "molecular_target": e.g., "CD20", "Kinase-A", "Receptor-Z".
     If unknown, use "".

   - "mechanism": e.g., "monoclonal antibody", "kinase inhibitor",
     "fusion protein", etc.
     If not inferable, use "".

Important rules:
- "role" MUST use only the allowed strings.
- No invented drugs.
- Combination therapies: classify EACH component using the rules above.

Input JSON:
{payload_json}

Output format (IMPORTANT):
- Return ONLY a valid JSON object with:
    - keys   = canonical drug names
    - values = objects with EXACTLY:
        * "role"
        * "alternative_names"
        * "molecular_target"
        * "mechanism"

Example output:
{{
  "ABC-123": {{
    "role": "Investigational Product",
    "alternative_names": ["ABC123", "Compound-ABC"],
    "molecular_target": "Receptor-Z",
    "mechanism": "Bispecific antibody"
  }},
  "DrugX": {{
    "role": "Standard of Care",
    "alternative_names": ["GenericX", "ChemX"],
    "molecular_target": "Enzyme-A",
    "mechanism": "Antimetabolite"
  }},
  "ControlDrug": {{
    "role": "Active Comparator",
    "alternative_names": ["CD-01"],
    "molecular_target": "",
    "mechanism": ""
  }},
  "Placebo": {{
    "role": "Placebo",
    "alternative_names": [],
    "molecular_target": "",
    "mechanism": "Inert comparator"
  }}
}}

Before returning JSON:
- Ensure no drug key contains manufacturer qualifiers.
- Ensure all four fields exist for every drug.
- Ensure classical backbone therapies are NOT labeled “Investigational Product”.
""".strip()


# Shared counters & master mapping
# Tallies updated by the worker threads; guard every update with counter_lock.
counter = {
    "processed": 0,
    "skipped_existing": 0,
    "llm_error": 0,
    "parse_error": 0,
}
counter_lock = threading.Lock()


def _load_master_roles() -> dict:
    """
    Load the master trial -> roles mapping written by an earlier run.

    Starting from the file on disk keeps earlier results in the master JSON
    when a later run only processes the trials that are still missing;
    starting from {} would rewrite the file with the new trials only.

    Returns:
        The stored mapping, or {} when the file is absent, unreadable, or
        does not contain a JSON object.
    """
    if not MASTER_ROLES_PATH.exists():
        return {}
    try:
        loaded = json.loads(MASTER_ROLES_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        print(f"⚠️ Could not load {MASTER_ROLES_PATH}: {e} — starting from an empty master mapping")
        return {}
    if not isinstance(loaded, dict):
        print(f"⚠️ {MASTER_ROLES_PATH} does not contain a JSON object — starting from an empty master mapping")
        return {}
    return loaded


# trial_hash -> saved per-trial mapping; guard reads/writes with master_lock.
master_roles: dict[str, dict] = _load_master_roles()
master_lock = threading.Lock()


def process_trial(row: dict, idx: int, total: int) -> None:
    """
    Run the drug-role extraction for a single trial record.

    Rows without a ``trial_hash`` and trials whose per-trial output file is
    already on disk are skipped. Otherwise the prompt is built from
    ``RELEVANT_COLS``, the model is queried, and -- only when the reply parses
    into a non-empty JSON object -- the per-trial result, a log entry and the
    refreshed master mapping are written. LLM call failures and unparseable
    replies are printed and tallied in ``counter``.

    Args:
        row: One trial as a column -> value mapping.
        idx: Position of the trial in the run; used only in messages.
        total: Number of trials in the run; used only in messages.
    """
    trial_hash = str(row.get("trial_hash", "")).strip()
    if trial_hash == "":
        print(f"⚠️ [{idx}/{total}] Missing trial_hash, skipping")
        return

    # An existing output file means this trial was handled by an earlier run.
    out_fp = DRUG_ROLE_DIR / f"{trial_hash}.json"
    if out_fp.exists():
        with counter_lock:
            counter["skipped_existing"] += 1
        return

    # Payload = trial id + the selected columns
    trial_payload = {
        "trial_hash": trial_hash,
        **{col: row.get(col, "") for col in RELEVANT_COLS},
    }
    prompt = build_prompt(trial_payload)

    # Query the model; any failure here (including a malformed result dict)
    # counts as an LLM error and nothing is written.
    try:
        started = time.perf_counter()
        res = client.query(prompt=prompt, model=MODEL)
        elapsed = round(time.perf_counter() - started, 2)

        text_response = (res.get("text_response") or "").strip()
        raw_response = res.get("raw_response")
        total_cost = float(res.get("cost") or 0.0)
    except Exception as e:
        print(f"⚠️ [{idx}/{total}] LLM error for trial_hash={trial_hash}: {e}")
        with counter_lock:
            counter["llm_error"] += 1
        return

    # A non-dict or empty result is invalid → do NOT save anything
    drug_roles = extract_json_object(text_response)
    if not (isinstance(drug_roles, dict) and drug_roles):
        print(f"⚠️ [{idx}/{total}] JSON parse/validity error trial_hash={trial_hash}, raw={text_response!r}")
        with counter_lock:
            counter["parse_error"] += 1
        return

    mapped = {
        "trial_hash": trial_hash,
        "title": row.get("title"),
        "drug_roles": drug_roles,
        "source": "llm",
    }
    # Serialised once; reused for the per-trial file and the log entry.
    mapped_json = json.dumps(mapped, ensure_ascii=False, indent=2)

    # Per-trial roles JSON
    out_fp.write_text(mapped_json, encoding="utf-8")

    # Log entry (prompt, responses, cost, timing)
    log_fp = DRUG_ROLE_LOG_DIR / f"{trial_hash}.json"
    log_payload = {
        "token": trial_hash,
        "hash_id": trial_hash,
        "model": MODEL,
        "prompt": prompt,
        "structured_response": mapped_json,
        "raw_response": repr(raw_response),
        "total_cost": total_cost,
        "time_elapsed": elapsed,
    }
    log_fp.write_text(
        json.dumps(log_payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Master mapping: update in memory and rewrite the file under the lock
    with master_lock:
        master_roles[trial_hash] = mapped
        master_json = json.dumps(master_roles, ensure_ascii=False, indent=2)
        MASTER_ROLES_PATH.write_text(master_json, encoding="utf-8")

    with counter_lock:
        counter["processed"] += 1
        done = counter["processed"]
        if done % 50 == 0:
            print(f"Progress: processed {done} trials...")

# -------------------------------------------------
# RUN CONCURRENTLY
# -------------------------------------------------
# Every column is read as text and missing cells become "".
df_trials = pd.read_csv(TRIALS_WITH_HASH_CSV, dtype=str).fillna("")
rows = df_trials.to_dict(orient="records")
total_trials = len(rows)
print(f"Loaded {total_trials} trials from {TRIALS_WITH_HASH_CSV}")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    # future -> trial_hash, so a failing worker can be reported by trial
    futures = {}
    for idx, row in enumerate(rows, start=1):
        fut = ex.submit(process_trial, row, idx, total_trials)
        futures[fut] = row.get("trial_hash")

    for fut in as_completed(futures):
        th = futures[fut]
        try:
            fut.result()
        except Exception as e:
            print(f"⚠️ Worker error trial_hash={th}: {e}")

# Final tally of what happened to each trial
summary_parts = [
    f"processed={counter['processed']}",
    f"skipped={counter['skipped_existing']}",
    f"llm_error={counter['llm_error']}",
    f"parse_error={counter['parse_error']}",
]
print("✅ Trial drug-role mapping complete. " + ", ".join(summary_parts))
print(f"Roles directory: {DRUG_ROLE_DIR}")
print(f"Log directory:   {DRUG_ROLE_LOG_DIR}")
print(f"Master roles:    {MASTER_ROLES_PATH}")

Loaded 184 trials from cache/raw_trials_with_hash.csv
Progress: processed 50 trials...
Progress: processed 100 trials...
Progress: processed 150 trials...
✅ Trial drug-role mapping complete. processed=184, skipped=0, llm_error=0, parse_error=0
Roles directory: cache/trial_drug_roles
Log directory:   cache/trial_drug_roles_log
Master roles:    cache/trial_drug_roles_master.json


In [19]:
import json
from pathlib import Path

LOG_DIR = Path("cache/trial_drug_roles_log")

# Running total plus one (file name, cost) pair per readable log file
total_cost = 0.0
costs = []

for fp in LOG_DIR.glob("*.json"):
    try:
        log = json.loads(fp.read_text(encoding="utf-8"))
        c = float(log.get("total_cost") or 0.0)
    except Exception as e:
        # Unreadable or malformed log: report it and leave it out of the totals
        print(f"⚠️ Error reading {fp.name}: {e}")
    else:
        total_cost += c
        costs.append((fp.name, c))

# Every counted log contributed exactly one pair
num_entries = len(costs)

# Most expensive first
costs_sorted = list(costs)
costs_sorted.sort(key=lambda pair: pair[1], reverse=True)

print("========== LLM COST SUMMARY ==========")
print(f"Total LLM cost:             ${total_cost:,.4f}")
print(f"Number of logged trials:     {num_entries}")
if num_entries:
    print(f"Average cost per trial:      ${total_cost / num_entries:,.4f}")
print("")

print("Top 10 most expensive trials:")
for name, c in costs_sorted[:10]:
    print(f"  {name}: ${c:,.4f}")

print("========================================")

Total LLM cost:             $3.8619
Number of logged trials:     184
Average cost per trial:      $0.0210

Top 10 most expensive trials:
  tid_261f0233308ca080d1c60e3fda61ca85.json: $0.0621
  tid_1158b3369546dc4b16dc21c8c026b619.json: $0.0454
  tid_8b4d60a5fddc078962af34399d7e342c.json: $0.0425
  tid_e0a77c4ecf93cf781f04cc467c974511.json: $0.0423
  tid_94883aa2d583afced004e22a7991ef3e.json: $0.0416
  tid_86fc91efde85f9a7a9e0f9786fc67404.json: $0.0410
  tid_a50324f4d36f5cc93b795ec7f8b7005b.json: $0.0407
  tid_43635104c2d64be16c8882a500dd5181.json: $0.0401
  tid_837737698a5271d314ea8208addb2d72.json: $0.0397
  tid_763e3011bc90e46c88c7a2953a39ed2a.json: $0.0390


In [20]:
import json

import pandas as pd

# -------------------------------------------------
# Build trial_product_breakdown.csv
# -------------------------------------------------
OUT_CSV = BASE_DIR / "trial_product_breakdown.csv"

# Lower-cased role label -> column-name prefix in the output CSV.
# Insertion order fixes the order of the column groups.
ROLE_TO_PREFIX = {
    "investigational product": "investigational_products",
    "active comparator": "active_comparators",
    "placebo": "placebos",
    "standard of care": "standard_of_care",
}
# Each role group has four parallel list columns: names, alternative names
# (a list of lists), molecular targets and mechanisms.
FIELD_SUFFIXES = ("", "_alternative_names", "_molecular_target", "_mechanism")
BREAKDOWN_COLUMNS = ["trial_hash"] + [
    f"{prefix}{suffix}"
    for prefix in ROLE_TO_PREFIX.values()
    for suffix in FIELD_SUFFIXES
]

rows = []

for fp in DRUG_ROLE_DIR.glob("*.json"):
    try:
        obj = json.loads(fp.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"⚠️ Error reading {fp.name}: {e}")
        continue

    trial_hash = obj.get("trial_hash")
    if not trial_hash:
        print(f"⚠️ Missing trial_hash in {fp.name}, skipping")
        continue

    drug_roles = obj.get("drug_roles") or {}
    if not isinstance(drug_roles, dict):
        print(f"⚠️ drug_roles not dict in {fp.name}, skipping")
        continue

    # One fresh list per column; trial_hash is then replaced by its value.
    record = {col: [] for col in BREAKDOWN_COLUMNS}
    record["trial_hash"] = trial_hash

    for drug_name, meta in drug_roles.items():
        if not isinstance(meta, dict):
            continue

        role_norm = str(meta.get("role") or "").strip().lower()
        prefix = ROLE_TO_PREFIX.get(role_norm)
        if prefix is None:
            # Previously dropped without a trace; report it so it can be checked.
            print(
                f"⚠️ Unrecognised role {meta.get('role')!r} for {drug_name!r} "
                f"in {fp.name}, drug left out of the breakdown"
            )
            continue

        alt_names = meta.get("alternative_names") or []
        if not isinstance(alt_names, list):
            alt_names = [str(alt_names)]

        record[prefix].append(drug_name)
        record[f"{prefix}_alternative_names"].append(alt_names)
        record[f"{prefix}_molecular_target"].append(meta.get("molecular_target") or "")
        record[f"{prefix}_mechanism"].append(meta.get("mechanism") or "")

    rows.append(record)

# Passing the columns explicitly keeps sort_values working when no file was read.
df_out = pd.DataFrame(rows, columns=BREAKDOWN_COLUMNS).sort_values("trial_hash")

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(OUT_CSV, index=False)

print(f"Saved trial product breakdown to {OUT_CSV}")
print(df_out.head().to_markdown())

Saved trial product breakdown to cache/trial_product_breakdown.csv
|     | trial_hash                           | investigational_products                      | investigational_products_alternative_names                                                                                                                                                                                | investigational_products_molecular_target   | investigational_products_mechanism                                               | active_comparators   | active_comparators_alternative_names                                                                                                                                            | active_comparators_molecular_target   | active_comparators_mechanism                                 | placebos   | placebos_alternative_names   | placebos_molecular_target   | placebos_mechanism   | standard_of_care   | standard_of_care_alternative_names   | standard_of_care_molecular_t

Manually check the rows with no investigational products

In [22]:
import ast
import pandas as pd

# Re-load the breakdown written above. Everything is read as text, so the
# list-valued columns arrive as strings and missing cells as "".
IN_CSV = BASE_DIR / "trial_product_breakdown.csv"

df = pd.read_csv(IN_CSV, dtype=str).fillna("")

def parse_listish(s: str):
    """
    Turn a cell holding a stringified Python list (e.g. "['A', 'B']") back
    into a list.

    Returns:
        - [] for non-string input and for blank cells;
        - the list itself when the text is a list literal ("[]" gives []);
        - [value] when the text is some other Python literal ("42" gives [42]);
        - [text], the stripped text as the only element, when the text is not
          a Python literal at all.
    """
    text = s.strip() if isinstance(s, str) else ""
    if text == "":
        return []

    try:
        parsed = ast.literal_eval(text)
    except Exception:
        # Not a literal: keep the raw text as a single entry
        return [text]

    return parsed if isinstance(parsed, list) else [parsed]

# Turn the stringified investigational_products cells into real lists
df["investigational_products_parsed"] = df["investigational_products"].apply(parse_listish)

# True for trials whose parsed list is empty, i.e. no investigational product
no_inv_mask = df["investigational_products_parsed"].apply(len).eq(0)

num_no_inv = int(no_inv_mask.sum())
total = len(df)

print(f"Rows with NO investigational products: {num_no_inv} / {total}")

# Show up to 20 of the flagged trials
examples = df.loc[no_inv_mask, ["trial_hash", "investigational_products"]].head(20)
print(examples.to_markdown(index=False))

Rows with NO investigational products: 5 / 184
| trial_hash                           | investigational_products   |
|:-------------------------------------|:---------------------------|
| tid_4c45730f6411aa1e5a38bb1223d66988 | []                         |
| tid_67de51bf9728e056a6fb42c76e4b0212 | []                         |
| tid_8cab7b7177fcb0d10255bced8b0633ee | []                         |
| tid_bb1e0571142dde8a49976632c349593c | []                         |
| tid_ed66bb2727de7173d74abf4af19f70e8 | []                         |


Manual checks 
- tid_4c45730f6411aa1e5a38bb1223d66988
    - This trial is combining three standard-of-care agents into a regimen “DCF”
- tid_67de51bf9728e056a6fb42c76e4b0212
    - Even though they administer Yisaipu in a structured way, it is an approved drug and not being tested for regulatory approval.
- tid_8cab7b7177fcb0d10255bced8b0633ee
    - The trial is studying treatment strategies, regimens, algorithms, imaging-guided regimen selection, or dosing, using only approved standard therapies.
- tid_bb1e0571142dde8a49976632c349593c
    - The trial's focus is on optimizing regimen selection (e.g., TIPy or TCbIPy) via imaging, rather than testing a new drug entity.
- tid_ed66bb2727de7173d74abf4af19f70e8
    - evaluates the safety and efficacy of 3SBio's EPIAO (a biosimilar epoetin alfa/recombinant human erythropoietin) for cancer-related anemia

These are all confirmed to be generics or biosimilars.

#### Task 2

Identify whether the drugs are innovative, generic, or biosimilar

In [5]:
import json
import time
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from services.openai_wrapper import OpenAIWrapper
import ast

import pandas as pd

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
# Root folder for cached inputs and outputs.
BASE_DIR = Path("cache")

# Inputs: raw trials with trial_hash, and the per-trial drug-role breakdown.
TRIALS_WITH_HASH_CSV    = BASE_DIR / "raw_trials_with_hash.csv"
PRODUCT_BREAKDOWN_CSV   = BASE_DIR / "trial_product_breakdown.csv"

# Output folder for the classification results (created if missing).
INNOV_DIR = BASE_DIR / "trial_investigational_drugs_classifications"
INNOV_DIR.mkdir(parents=True, exist_ok=True)

# Output folder for the matching logs (created if missing).
INNOV_LOG_DIR = BASE_DIR / "trial_investigational_drugs_classifications_log"
INNOV_LOG_DIR.mkdir(parents=True, exist_ok=True)

# Single JSON file holding the master classification mapping.
MASTER_INNOV_PATH = BASE_DIR / "trial_investigational_drugs_classifications_master.json"

# Trial-level columns copied into the prompt payload.
RELEVANT_COLS = [
    "title",
    "objective",
    "outcome_details",
    "notes_json",
    "results_json",
    "primary_drugs_tested_json",
    "other_drugs_tested_json",
    "therapeutic_areas_json",
    "bmt_other_drugs_tested_json",
    "bmt_primary_drugs_tested_json",
    "ct_gov_mesh_terms_json",
]

# Presumably the thread-pool size for this task -- TODO confirm where it is used.
MAX_WORKERS_INNOV = 8

MODEL = "gpt-5"
client = OpenAIWrapper()

# -------------------------------------------------
# Helpers
# -------------------------------------------------
def load_master_innov() -> dict:
    """
    Load the master classification mapping stored at ``MASTER_INNOV_PATH``.

    Returns:
        The stored mapping, or {} when the file is absent, unreadable, or
        does not contain a JSON object. The latter two cases print a warning
        instead of failing silently, so a damaged master file is noticed
        before it is written over.
    """
    if not MASTER_INNOV_PATH.exists():
        return {}
    try:
        loaded = json.loads(MASTER_INNOV_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"⚠️ Could not load {MASTER_INNOV_PATH}: {e} — starting from an empty master mapping")
        return {}
    if not isinstance(loaded, dict):
        print(f"⚠️ {MASTER_INNOV_PATH} does not contain a JSON object — starting from an empty master mapping")
        return {}
    return loaded

def extract_json_object(text: str) -> dict:
    """
    Extract the first valid JSON object from model output.

    The whole (stripped) text is tried first. If that does not yield a dict,
    the text is scanned left to right and a JSON value is decoded starting at
    each candidate ``{``; the first one that decodes wins. This copes with
    prose or code fences around the JSON, brace placeholders before it, and
    additional objects or stray braces after it. Only the ``json`` module is
    needed.

    After a failed decode the scan resumes at the position where the decoder
    gave up, not at the next ``{``. A truncated outer object therefore yields
    {} rather than one of its nested objects.

    Args:
        text: Raw model output. Anything that is not a string gives {}.

    Returns:
        The decoded dict, or {} when no JSON object can be found.
    """
    if not isinstance(text, str):
        return {}
    text = text.strip()
    if not text:
        return {}

    # Direct parse first
    try:
        obj = json.loads(text)
        if isinstance(obj, dict):
            return obj
    except (ValueError, RecursionError):
        pass

    # Fallback: decode starting at each candidate opening brace
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError as err:
            # Skip the part the decoder already consumed before failing.
            resume = max(err.pos, start + 1)
        except RecursionError:
            resume = start + 1
        else:
            if isinstance(obj, dict):
                return obj
            resume = start + 1
        start = text.find("{", resume)

    return {}

def parse_listish(s: str):
    """
    Turn a cell holding a stringified Python list (e.g. "['A', 'B']") back
    into a list.

    Returns:
        - [] for non-string input and for blank cells;
        - the list itself when the text is a list literal ("[]" gives []);
        - [value] when the text is some other Python literal ("42" gives [42]);
        - [text], the stripped text as the only element, when the text is not
          a Python literal at all.
    """
    text = s.strip() if isinstance(s, str) else ""
    if text == "":
        return []

    try:
        parsed = ast.literal_eval(text)
    except Exception:
        # Not a literal: keep the raw text as a single entry
        return [text]

    return parsed if isinstance(parsed, list) else [parsed]

# Master mapping, seeded from the file on disk when one exists.
master_innov = load_master_innov()
# Lock intended to guard master_innov.
master_lock = threading.Lock()

# Outcome tallies for this task.
innov_counter = {
    "processed": 0,
    "skipped_existing": 0,
    "llm_error": 0,
    "parse_error": 0,
    "coverage_error": 0,
}
# Lock guarding updates to innov_counter.
counter_lock = threading.Lock()


def build_innovation_prompt(trial_payload: dict, investigational_products: list[str]) -> str:
    """
    Build prompt to classify each investigational product as
    Innovative / Generic / Biosimilar, with one-sentence explanation.

    Args:
        trial_payload: Trial fields to show the model; serialised with
            ``json.dumps`` (indent=2, non-ASCII kept) and embedded under
            "TRIAL PAYLOAD". The prompt text says this includes the drug-role
            breakdown columns, so the caller is expected to put them in.
        investigational_products: Names the model must classify; serialised
            the same way and embedded under "INVESTIGATIONAL PRODUCTS".

    Returns:
        The full prompt text with leading/trailing whitespace stripped.
    """
    payload_json = json.dumps(trial_payload, ensure_ascii=False, indent=2)
    drugs_json   = json.dumps(investigational_products, ensure_ascii=False, indent=2)

    # Doubled braces ({{ }}) in the example output are literal braces in the f-string.
    return f"""
You are a clinical trial design and drug development expert.

You are given:
1) Structured information about a clinical trial (title, objective, results, etc.).
2) A list of investigational products used in the trial.
3) Structured fields describing how drugs are classified in the study
   (investigational_products, active_comparators, placebos, standard_of_care, etc.).

Your task: For EACH investigational product, classify whether it is:
- "Innovative"
- "Generic"
- "Biosimilar"

and provide a one-sentence concise explanation for your classification.

DEFINITIONS / GUIDANCE
----------------------

Innovative:
- A novel or proprietary drug.
- New mechanism of action OR new biological entity OR clearly sponsor's lead product.
- Often associated with superiority or efficacy language:
  - "evaluate efficacy", "vs placebo", "improve outcomes", etc.
- Not a copy of an already-approved product.

Generic:
- A small-molecule copy of an already-approved branded drug.
- Same active ingredient, strength, dosage form, and route.
- Often associated with:
  - language like "generic", "copy", "equivalent",
  - OR clear indication that the product is a non-branded version.

Biosimilar:
- A biologic product that is highly similar to an already-approved reference biologic.
- Same target and mechanism as a branded biologic.
- Strong clues:
  - "equivalence", "non-inferiority", "no clinically meaningful differences",
  - direct comparison to a specific branded reference biologic with the SAME active ingredient.

Task 2 — Classify Innovation Status
-----------------------------------

Use clues within the text to determine whether each investigational drug is:
- "Innovative"
- "Generic"
- "Biosimilar"

Examples of helpful cues:
- Innovative:
  - Superiority/efficacy language ("versus placebo", "evaluate efficacy").
  - Novel or advanced mechanism, new target, or first-in-class description.
- Biosimilar:
  - Equivalence or non-inferiority language.
  - Direct comparison to a branded reference product with the same active ingredient.
- Generic:
  - Explicitly described as generic.
  - Non-biologic small-molecule copy of an existing branded product.

If the information is incomplete, choose the MOST LIKELY label based on the text and typical drug naming patterns.
You MUST still choose ONE of the three labels ("Innovative", "Generic", "Biosimilar") for each drug.
If you are uncertain, you may say so in the one-sentence explanation.

OUTPUT FORMAT (IMPORTANT)
-------------------------

Return ONLY a valid JSON object, with:
- KEYS   = exactly the investigational product names as provided in the list below
- VALUES = an object with exactly two fields:
    - "classification": one of "Innovative", "Generic", "Biosimilar"
    - "explanation": a single, concise sentence explaining your reasoning

You MUST provide a classification for EVERY investigational product name.

Example output:
{{
  "DrugA": {{
    "classification": "Innovative",
    "explanation": "DrugA is a novel monoclonal antibody targeting a new receptor and is the sponsor's lead product."
  }},
  "DrugB": {{
    "classification": "Biosimilar",
    "explanation": "DrugB is tested for non-inferiority compared to the branded biologic with the same target."
  }}
}}

TRIAL PAYLOAD (includes trial text and all drug-role breakdown columns):
{payload_json}

INVESTIGATIONAL PRODUCTS (you MUST classify EACH of these):
{drugs_json}
""".strip()


def process_innov_row(row: dict, idx: int, total: int, breakdown_cols: list[str]) -> None:
    """Classify one trial's investigational products with the LLM and cache the result.

    The function is written to be run from worker threads: shared counters are
    updated under ``counter_lock`` and the master mapping/file under ``master_lock``.

    Steps:
      1. Skip the row if ``trial_hash`` is blank, if it has no investigational
         products, or if ``INNOV_DIR/<trial_hash>.json`` already exists.
      2. Build the prompt payload from ``RELEVANT_COLS`` and ``breakdown_cols``.
      3. Query the LLM and parse its answer with ``extract_json_object``.
      4. Validate the answer: every product must be a key, every value must be a
         dict with ``classification`` and ``explanation``, and the classification
         must be one of "Innovative", "Generic", "Biosimilar" (case/whitespace
         differences are normalised to the canonical spelling).
      5. On success write the per-trial JSON, the log JSON and the master file.

    Any failure in steps 3-4 is counted in ``innov_counter`` and nothing is
    written, so the trial is retried on the next run.

    Args:
        row: One merged record (dict) holding ``trial_hash``,
            ``investigational_products_parsed`` and the payload columns.
        idx: 1-based position of this row, used only in log messages.
        total: Total number of rows, used only in log messages.
        breakdown_cols: Column names from the product-breakdown table to copy
            into the prompt payload.

    Returns:
        None. Results are communicated through files and shared counters.
    """
    trial_hash = str(row.get("trial_hash", "")).strip()
    if not trial_hash:
        print(f"⚠️ [{idx}/{total}] Missing trial_hash, skipping")
        return

    def _reject(counter_key: str, message: str) -> None:
        # Shared "warn + count" path for every failure mode below.
        print(f"⚠️ [{idx}/{total}] {message}")
        with counter_lock:
            innov_counter[counter_key] += 1

    investigational_products = row.get("investigational_products_parsed") or []
    investigational_products = [str(x).strip() for x in investigational_products if str(x).strip()]

    if not investigational_products:
        # Shouldn't happen due to filtering, but be safe
        return

    # An existing per-trial file acts as the cache marker.
    out_fp = INNOV_DIR / f"{trial_hash}.json"
    if out_fp.exists():
        with counter_lock:
            innov_counter["skipped_existing"] += 1
        return

    # Build payload: trial-level text fields first, then every breakdown column.
    trial_payload = {"trial_hash": trial_hash}
    for col in RELEVANT_COLS:
        trial_payload[col] = row.get(col, "")
    for col in breakdown_cols:
        trial_payload[col] = row.get(col, "")

    prompt = build_innovation_prompt(trial_payload, investigational_products)

    # Call LLM
    try:
        t0 = time.perf_counter()
        res = client.query(prompt=prompt, model=MODEL)
        elapsed = round(time.perf_counter() - t0, 2)

        text_response = (res.get("text_response") or "").strip()
        raw_response = res.get("raw_response")
        total_cost = float(res.get("cost") or 0.0)
    except Exception as e:
        _reject("llm_error", f"LLM error for trial_hash={trial_hash}: {e}")
        return

    # Parse JSON
    classifications = extract_json_object(text_response)
    if not isinstance(classifications, dict) or not classifications:
        _reject("parse_error", f"JSON parse error trial_hash={trial_hash}, raw={text_response!r}")
        return

    # Check coverage: every investigational product must be present as a key
    missing = [d for d in investigational_products if d not in classifications]
    if missing:
        # DO NOT save this trial so it can be re-run next time
        _reject(
            "coverage_error",
            f"Coverage error for trial_hash={trial_hash}: missing classifications for {missing}",
        )
        return

    # Validate each entry: shape, required fields, and an allowed label.
    canonical_labels = {label.lower(): label for label in ("Innovative", "Generic", "Biosimilar")}
    for d in investigational_products:
        meta = classifications.get(d, {})
        if not isinstance(meta, dict):
            _reject("parse_error", f"Invalid meta for {d} in trial_hash={trial_hash}")
            return
        if "classification" not in meta or "explanation" not in meta:
            _reject("parse_error", f"Missing fields for {d} in trial_hash={trial_hash}")
            return

        raw_label = meta["classification"]
        label = canonical_labels.get(raw_label.strip().lower()) if isinstance(raw_label, str) else None
        if label is None:
            # An off-spec label must not be cached, otherwise it would never be retried.
            _reject(
                "parse_error",
                f"Invalid classification {raw_label!r} for {d} in trial_hash={trial_hash}",
            )
            return
        # Store the canonical spelling so downstream consumers see exactly three values.
        meta["classification"] = label

    mapped = {
        "trial_hash": trial_hash,
        "investigational_products": investigational_products,
        "classifications": classifications,
        "source": "llm",
    }
    mapped_json = json.dumps(mapped, ensure_ascii=False, indent=2)

    # Save per-trial JSON
    out_fp.write_text(mapped_json, encoding="utf-8")

    # Log entry ("token" and "hash_id" both carry the trial hash)
    log_payload = {
        "token": trial_hash,
        "hash_id": trial_hash,
        "model": MODEL,
        "prompt": prompt,
        "structured_response": mapped_json,
        "raw_response": repr(raw_response),
        "total_cost": total_cost,
        "time_elapsed": elapsed,
    }
    (INNOV_LOG_DIR / f"{trial_hash}.json").write_text(
        json.dumps(log_payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Update master (the whole file is rewritten while holding the lock)
    with master_lock:
        master_innov[trial_hash] = mapped
        MASTER_INNOV_PATH.write_text(
            json.dumps(master_innov, ensure_ascii=False, indent=2),
            encoding="utf-8"
        )

    with counter_lock:
        innov_counter["processed"] += 1
        if innov_counter["processed"] % 50 == 0:
            print(f"Progress: processed {innov_counter['processed']} trials for innovation status...")


# -------------------------------------------------
# LOAD & MERGE DATA
# -------------------------------------------------
# Load breakdown (investigational products + all drug-role cols)
df_breakdown = pd.read_csv(PRODUCT_BREAKDOWN_CSV, dtype=str).fillna("")

# Reuse parse_listish from previous cell
df_breakdown["investigational_products_parsed"] = df_breakdown["investigational_products"].apply(parse_listish)
mask_has_inv = df_breakdown["investigational_products_parsed"].apply(lambda x: len(x) > 0)

# Restrict to rows with investigational products
df_breakdown_sub = df_breakdown.loc[mask_has_inv].copy()

# All columns from trial_product_breakdown.csv except trial_hash (which is already separate)
BREAKDOWN_COLS = [c for c in df_breakdown_sub.columns if c != "trial_hash"]

# Load raw trials (for RELEVANT_COLS)
df_trials = pd.read_csv(TRIALS_WITH_HASH_CSV, dtype=str).fillna("")

# Merge on trial_hash; keep all breakdown columns + investigational_products_parsed + RELEVANT_COLS
df_merged = df_breakdown_sub.merge(
    df_trials[["trial_hash"] + RELEVANT_COLS],
    on="trial_hash",
    how="left",
)

# A left merge re-introduces NaN for breakdown rows whose trial_hash has no match in
# df_trials, even though both inputs were fillna("")'d. Blank those cells again so
# the payload sent to the LLM contains empty strings rather than float NaN.
_merged_text_cols = [c for c in RELEVANT_COLS if c in df_merged.columns]
if _merged_text_cols:
    df_merged[_merged_text_cols] = df_merged[_merged_text_cols].fillna("")

innov_rows = df_merged.to_dict(orient="records")
total_innov = len(innov_rows)
print(f"Loaded {total_innov} trials with investigational products for innovation-status classification.")

# -------------------------------------------------
# RUN CONCURRENTLY
# -------------------------------------------------

# One task per trial; the future -> trial_hash map is only used to label worker errors.
with ThreadPoolExecutor(max_workers=MAX_WORKERS_INNOV) as ex:
    futures = {
        ex.submit(process_innov_row, row, idx, total_innov, BREAKDOWN_COLS): row.get("trial_hash")
        for idx, row in enumerate(innov_rows, start=1)
    }
    for fut in as_completed(futures):
        th = futures[fut]
        try:
            # Re-raise anything the worker did not handle itself so it gets reported.
            fut.result()
        except Exception as e:
            print(f"⚠️ Worker error (innovation) trial_hash={th}: {e}")

print(
    f"✅ Trial investigational-drug innovation classification complete. "
    f"processed={innov_counter['processed']}, "
    f"skipped={innov_counter['skipped_existing']}, "
    f"llm_error={innov_counter['llm_error']}, "
    f"parse_error={innov_counter['parse_error']}, "
    f"coverage_error={innov_counter['coverage_error']}"
)
print(f"Classifications directory: {INNOV_DIR}")
print(f"Log directory:             {INNOV_LOG_DIR}")
print(f"Master classifications:    {MASTER_INNOV_PATH}")

Loaded 179 trials with investigational products for innovation-status classification.
✅ Trial investigational-drug innovation classification complete. processed=0, skipped=179, llm_error=0, parse_error=0, coverage_error=0
Classifications directory: cache/trial_investigational_drugs_classifications
Log directory:             cache/trial_investigational_drugs_classifications_log
Master classifications:    cache/trial_investigational_drugs_classifications_master.json


In [8]:
import json
from pathlib import Path

LOG_DIR = Path("cache/trial_investigational_drugs_classifications_log")

# Collect the cost recorded in every per-trial log file. A file only counts
# once it has been both parsed and converted to a float successfully.
total_cost = 0.0
costs = []

for log_path in LOG_DIR.glob("*.json"):
    try:
        entry = json.loads(log_path.read_text(encoding="utf-8"))
        cost = float(entry.get("total_cost") or 0.0)
    except Exception as e:
        print(f"⚠️ Error reading {log_path.name}: {e}")
    else:
        total_cost += cost
        costs.append((log_path.name, cost))

num_entries = len(costs)

# Most expensive first (stable sort, so ties keep their glob order).
costs_sorted = list(costs)
costs_sorted.sort(key=lambda item: item[1], reverse=True)

print("========== LLM COST SUMMARY ==========")
print(f"Total LLM cost:             ${total_cost:,.4f}")
print(f"Number of logged trials:     {num_entries}")
if costs:
    print(f"Average cost per trial:      ${total_cost / num_entries:,.4f}")
print("")

print("Top 10 most expensive trials:")
for file_name, file_cost in costs_sorted[:10]:
    print(f"  {file_name}: ${file_cost:,.4f}")

print("========================================")

Total LLM cost:             $2.0853
Number of logged trials:     179
Average cost per trial:      $0.0116

Top 10 most expensive trials:
  tid_8b4d60a5fddc078962af34399d7e342c.json: $0.0277
  tid_9727cefa81bf0a9c341273bce42d3346.json: $0.0254
  tid_fc93655913a5e1b233a8077e9fc758c6.json: $0.0226
  tid_a1c1e47263bd9f338f83e52f97565ff1.json: $0.0222
  tid_b29013cdbc706b95776d47be1d6e98e6.json: $0.0219
  tid_94883aa2d583afced004e22a7991ef3e.json: $0.0208
  tid_9dc4ace9308864c9f8a619d6abe32011.json: $0.0207
  tid_99fac3ebe48aad5ebc1077142f61d5eb.json: $0.0205
  tid_4f644d4f81a34d114e8e22321d3af440.json: $0.0187
  tid_a50324f4d36f5cc93b795ec7f8b7005b.json: $0.0186


In [10]:
import json
import pandas as pd

OUT_CSV = BASE_DIR / "trial_investigational_drugs_classifications.csv"

# Fixed column order; also guarantees the columns exist when no cache file is usable.
_OUT_COLUMNS = [
    "trial_hash",
    "investigational_products",
    "investigational_products_classifications",
]


def _classification_for(classifications_map: dict, key) -> str:
    """Return the non-empty "classification" stored under ``key``, or "" if absent or malformed."""
    try:
        meta = classifications_map.get(key)
    except TypeError:
        # Unhashable key (e.g. a nested list) can never match a JSON object key.
        return ""
    if not isinstance(meta, dict):
        return ""
    return meta.get("classification") or ""


rows = []

# Flatten every cached per-trial JSON into one CSV row with two aligned lists.
for fp in INNOV_DIR.glob("*.json"):
    try:
        obj = json.loads(fp.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"⚠️ Error reading {fp.name}: {e}")
        continue

    trial_hash = obj.get("trial_hash") if isinstance(obj, dict) else None
    if not trial_hash:
        print(f"⚠️ Missing trial_hash in {fp.name}, skipping")
        continue

    inv_products_raw = obj.get("investigational_products") or []
    classifications_map = obj.get("classifications") or {}
    if not isinstance(classifications_map, dict):
        classifications_map = {}

    flat_products = []
    flat_classifications = []

    for drug_raw in inv_products_raw:
        # drug_raw might be "['inetetamab', 'toripalimab']" or just "SSGJ-707"
        if isinstance(drug_raw, str):
            parsed_names = parse_listish(drug_raw)  # from earlier cell (uses ast.literal_eval)
        else:
            parsed_names = [drug_raw]

        # Prefer the classification stored under the exact key that was sent to the model;
        # it then applies to every name parsed out of that key.
        raw_cls = _classification_for(classifications_map, drug_raw)
        if raw_cls:
            name_labels = [raw_cls] * len(parsed_names)
        else:
            # Otherwise look each parsed name up on its own, so two names with different
            # entries keep their own labels. A name without an entry inherits the first
            # label found among its siblings.
            own_labels = [_classification_for(classifications_map, name) for name in parsed_names]
            fallback = next((label for label in own_labels if label), "")
            name_labels = [label or fallback for label in own_labels]
            if not fallback:
                print(
                    f"⚠️ Missing classification for raw drug {drug_raw!r} in "
                    f"trial_hash={trial_hash}, file={fp.name}"
                )

        # One entry per parsed name; both lists grow in lockstep so they stay aligned.
        flat_products.extend(parsed_names)
        flat_classifications.extend(name_labels)

    rows.append(
        {
            "trial_hash": trial_hash,
            # store as JSON stringified flat lists
            "investigational_products": json.dumps(flat_products, ensure_ascii=False),
            "investigational_products_classifications": json.dumps(flat_classifications, ensure_ascii=False),
        }
    )

# Explicit columns keep sort_values from raising KeyError when `rows` is empty.
df_out = pd.DataFrame(rows, columns=_OUT_COLUMNS).sort_values("trial_hash")

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(OUT_CSV, index=False)

print(f"✅ Saved investigational drug classifications to {OUT_CSV}")
print(df_out.head().to_markdown(index=False))

✅ Saved investigational drug classifications to cache/trial_investigational_drugs_classifications.csv
| trial_hash                           | investigational_products                      | investigational_products_classifications   |
|:-------------------------------------|:----------------------------------------------|:-------------------------------------------|
| tid_0541995757b10e613a42173d6b8ddc09 | ["Cinacalcet hydrochloride"]                  | ["Generic"]                                |
| tid_0da20e863cfc5f3e369868462bff74e0 | ["NuPIAO"]                                    | ["Innovative"]                             |
| tid_0e8fa21079f928135dfc6164a15285f8 | ["SSS-17"]                                    | ["Innovative"]                             |
| tid_0f04ddb3d522d528d083d7d5c43d1e18 | ["Metformin hydrochloride sustained-release"] | ["Generic"]                                |
| tid_10562c0430b8b9bae93c94cadfb0a129 | ["RD-01"]                                     | ["Inn

#### Task 3