In [None]:
import os
from functools import partial
import pandas as pd
import numpy as np
import warnings
import contextlib
import io

from madrigal.utils import BASE_DIR
from madrigal.evaluate.predict import get_drugbank_scores_wrapper, get_twosides_scores_wrapper

In [None]:
drug_metadata = pd.read_pickle(os.path.join(BASE_DIR, 'processed_data/views_features_new/combined_metadata_ddi.pkl'))
drug_metadata['view_str'] = 1
print(drug_metadata.shape[0])

drugbank_ddi_classes = pd.read_pickle(
    BASE_DIR + "processed_data/polypharmacy_new/DrugBank/drugbank_ddi_directed_final_label_map.pkl"
)
drugbank_ddi_df = pd.read_csv(BASE_DIR + "processed_data/polypharmacy_new/DrugBank/drugbank_ddi_directed_final.tsv", index_col=0)

twosides_ddi_classes = pd.read_pickle(
    BASE_DIR + "processed_data/polypharmacy_new/TWOSIDES/twosides_ddi_directed_final_label_map.pkl"
)
twosides_ddi_df = pd.read_csv(BASE_DIR + "processed_data/polypharmacy_new/TWOSIDES/twosides_ddi_directed_final.tsv", index_col=0)

get_twosides_scores = partial(get_twosides_scores_wrapper, twosides_ddi_classes=twosides_ddi_classes, ckpt_list=['effortless-dust-7', 'fresh-flower-8', 'vague-violet-9'])
get_drugbank_scores = partial(get_drugbank_scores_wrapper, ckpt_list=['drawn-grass-4', 'misty-oath-5', 'whole-fog-7', 'snowy-serenity-8', 'revived-aardvark-8'])

11601
21842


# Comparing different combos in the same trial (for the same indication)

## Get the right trials

Get trials that:
- Is above Phase I
- Started after year 2000
- Has adverse events data
- Has at least two arms with exactly two different small molecule drugs that can be mapped to our DrugBank identifiers

In [None]:
all_dbids = set(drug_metadata["node_id"].dropna().values)
clinical_trial_combos = pd.read_csv(BASE_DIR + "raw_data/cdcdb/aact_combs__with_identifiers.csv", index_col=0)
conditions = pd.read_csv(BASE_DIR + "raw_data/cdcdb/conditions_df.csv")
design_groups = pd.read_csv(BASE_DIR + "raw_data/cdcdb/design_group_df.csv")
mesh_terms = pd.read_csv(BASE_DIR + "raw_data/cdcdb/mesh_terms_df.csv")
trials_metadata = pd.read_csv(BASE_DIR + "raw_data/cdcdb/trials_df.csv")

In [None]:
assert trials_metadata["nct_id"].nunique() == design_groups["nct_id"].nunique()
trials_metadata_filtered = trials_metadata.query("phase not in ['Phase 1', 'Early Phase 1'] and overall_status not in ['Recruiting', 'Not yet recruiting', 'Suspended', 'Withdrawn']").dropna(subset=["study_start_date"])
trials_metadata_filtered["study_start_year"] = trials_metadata_filtered["study_start_date"].apply(lambda x: x.split("-")[0]).astype(int)
assert (trials_metadata_filtered["study_start_year"] > 2024).sum() == 0
trials_metadata_filtered = trials_metadata_filtered[trials_metadata_filtered["study_start_year"].astype(int) >= 2000]

assert trials_metadata_filtered[(trials_metadata_filtered["number_of_arms"].isna()) & (trials_metadata_filtered["number_of_groups"].isna())].shape[0] == 0
assert trials_metadata_filtered[(~trials_metadata_filtered["number_of_arms"].isna()) & (~trials_metadata_filtered["number_of_groups"].isna())].shape[0] == 0

trials_metadata_filtered.shape

(11347, 11)

In [None]:
design_groups_filtered = design_groups.drop(columns=["pubchem_identifier"]).dropna(subset=["drugbank_identifier"])

design_groups_filtered["drugbank_identifier_in"] = design_groups_filtered["drugbank_identifier"].apply(
    lambda lst: 
    sorted([
        dbid for dbid in lst.strip("[]\"").split("\", \"") 
    ])
)

# Only keep those arms where:
# 1. there are two drugs
design_groups_filtered = design_groups_filtered[design_groups_filtered["drugbank_identifier_in"].apply(len) == 2]
# 2. both are DrugBank-identifiable and small molecules (i.e., within our database)
design_groups_filtered["drugbank_identifier_in"] = design_groups_filtered["drugbank_identifier_in"].apply(
    lambda lst: [dbid for dbid in lst if dbid.startswith("DB") and (len(dbid) == 7) and dbid in all_dbids]
)
design_groups_filtered = design_groups_filtered[design_groups_filtered["drugbank_identifier_in"].apply(len) == 2]

design_groups_filtered["drugbank_identifier_in"] = design_groups_filtered["drugbank_identifier_in"].apply(lambda lst: ";".join(lst))
design_groups_filtered = design_groups_filtered.drop_duplicates(subset=["nct_id", "design_group_id", "drugbank_identifier_in"])
print(design_groups_filtered.shape[0])

design_groups_filtered = design_groups_filtered.query("nct_id in @trials_metadata_filtered.nct_id.values")
design_groups_filtered

11575


Unnamed: 0,nct_id,design_group_id,group_type,title,interventions_names,selected_name,drugbank_identifier,drugbank_identifier_in
5,NCT00000451,91533932,Experimental,2,"[[""sertraline"", ""Zoloft""], [""naltrexone"", ""Rev...","[[""sertraline"", ""Zoloft""], [""naltrexone"", ""ReV...","[""DB01104"", ""DB00704""]",DB00704;DB01104
177,NCT00004235,91536615,Experimental,irinotecan + docetaxel,"[[""irinotecan hydrochloride""], [""docetaxel""]]","[[""irinotecan""], [""docetaxel""]]","[""DB00762"", ""DB01248""]",DB00762;DB01248
178,NCT00004259,91126393,Experimental,Pilot Arm #1: RT+TMZ+BCNU,"[[""TMZ 150mg/m2 six 6-week cycles""], [""BCNU 20...","[[""temozolomide""], [""carmustine""]]","[""DB00853"", ""DB00262""]",DB00262;DB00853
179,NCT00004259,91126394,Experimental,Pilot Arm #2: RT+TMZ+BCNU,"[[""BCNU 150mg/m2""], [""TMZ 150mg/m2 six 8-week ...","[[""carmustine""], [""temozolomide""]]","[""DB00262"", ""DB00853""]",DB00262;DB00853
180,NCT00004259,91126392,Active Comparator,RT + BCNU/CCNU,"[[""BCNU 80mg/m2""], [""CCNU""]]","[[""carmustine""], [""ccnu""]]","[""DB00262"", ""DB01206""]",DB00262;DB01206
...,...,...,...,...,...,...,...,...
31209,NCT06307288,91427316,Experimental,tranilast combined with minocycline treatment ...,"[[""tranilast""], [""minocycline""]]","[[""tranilast""], [""minocycline""]]","[""DB07615"", ""DB01017""]",DB01017;DB07615
31217,NCT06310642,91027353,Experimental,Prochlorperazine 10 mg,"[[""Diphenhydramine"", ""Benadryl""], [""Prochlorpe...","[[""diphenhydramine"", ""Benadryl""], [""prochlorpe...","[""DB01075"", ""DB00433""]",DB00433;DB01075
31222,NCT06315634,91532571,Experimental,Heavy Bupivacaine plus dexmedetomidine,"[[""Bupivacain"", ""Spinal bupivacaine""], [""Dexme...","[[""bupivacaine"", ""bupivacaine""], [""dexmedetomi...","[""DB00297"", ""DB00633""]",DB00297;DB00633
31223,NCT06315634,91532572,Experimental,Heavy Bupivacaine plus midazolam,"[[""Bupivacain"", ""Spinal bupivacaine""], [""Midaz...","[[""bupivacaine"", ""bupivacaine""], [""midazolam"",...","[""DB00297"", ""DB00683""]",DB00297;DB00683


In [18]:
design_groups_multi_combo_same_trial = design_groups_filtered.groupby("nct_id").agg(list)
design_groups_multi_combo_same_trial = design_groups_multi_combo_same_trial[design_groups_multi_combo_same_trial["drugbank_identifier_in"].apply(lambda lst: len(set([med for med in lst if len(med) == 15])) > 1)]
nct_ids_to_query = design_groups_multi_combo_same_trial.index.values
design_groups_multi_combo_same_trial

Unnamed: 0_level_0,design_group_id,group_type,title,interventions_names,selected_name,drugbank_identifier,drugbank_identifier_in
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCT00004259,"[91126393, 91126394, 91126392]","[Experimental, Experimental, Active Comparator]","[Pilot Arm #1: RT+TMZ+BCNU, Pilot Arm #2: RT+T...","[[[""TMZ 150mg/m2 six 6-week cycles""], [""BCNU 2...","[[[""temozolomide""], [""carmustine""]], [[""carmus...","[[""DB00853"", ""DB00262""], [""DB00262"", ""DB00853""...","[DB00262;DB00853, DB00262;DB00853, DB00262;DB0..."
NCT00033657,"[91521773, 91521772]","[Experimental, Experimental]",[Paclitaxel / Cisplatin / Radiation therapy (A...,"[[[""paclitaxel"", ""NSC 125973"", ""Taxol""], [""cis...","[[[""paclitaxel"", ""taxol""], [""cisplatin"", ""cisp...","[[""DB01229"", ""DB04620""], [""DB00762"", ""DB04620""]]","[DB01229;DB04620, DB00762;DB04620]"
NCT00041015,"[91656038, 91656039]","[Active Comparator, Experimental]","[oral topotecan plus cisplatin IV, Cisplatin I...","[[[""topotecan hydrochloride""], [""cisplatin""]],...","[[[""topotecan hydrochloride""], [""cisplatin""]],...","[[""DB01030"", ""DB00515""], [""DB00515"", ""DB00773""]]","[DB00515;DB01030, DB00515;DB00773]"
NCT00045162,"[91073867, 91073866]","[Active Comparator, Active Comparator]","[2, 1]","[[[""etoposide""], [""cisplatin""]], [[""irinotecan...","[[[""etoposide""], [""cisplatin""]], [[""irinotecan...","[[""DB00773"", ""DB00515""], [""DB00762"", ""DB00515""]]","[DB00515;DB00773, DB00515;DB00762]"
NCT00054327,"[91089487, 91089484]","[Experimental, Experimental]","[Regimen C, Regimen A]","[[[""busulfan"", ""Myleran""], [""cyclophosphamide""...","[[[""busulfan"", ""Myleran""], [""cyclophosphamide""...","[[""DB01008"", ""DB00531""], [""DB00531"", ""DB00987""]]","[DB00531;DB01008, DB00531;DB00987]"
...,...,...,...,...,...,...,...
NCT05901441,"[91491913, 91491914, 91491915, 91491912]","[Experimental, Experimental, Active Comparator...","[H2 group, H3 group, SF group, H1 group]","[[[""Hydromorphone""], [""Ropivacaine""]], [[""Hydr...","[[[""hydromorphone""], [""ropivacaine""]], [[""hydr...","[[""DB00327"", ""DB00296""], [""DB00327"", ""DB00296""...","[DB00296;DB00327, DB00296;DB00327, DB00296;DB0..."
NCT05994287,"[91603151, 91603150]","[Experimental, Experimental]","[Drug: ketorolac and celecoxib, Drug: aspirin ...","[[[""Ketorolac""], [""Celecoxib""]], [[""Ketorolac""...","[[[""ketorolac""], [""celecoxib""]], [[""ketorolac""...","[[""DB00465"", ""DB00482""], [""DB00465"", ""DB00945""]]","[DB00465;DB00482, DB00465;DB00945]"
NCT06090565,"[90936236, 90936237]","[Experimental, Active Comparator]","[Cefixime plus doxycycline, Ceftriaxone plus a...","[[[""Cefixime"", ""Cefixime (ATC code J01DD08)""],...","[[[""cefixime"", ""cefixime""], [""doxycycline""]], ...","[[""DB00671"", ""DB00254""], [""DB00207"", ""DB01212""]]","[DB00254;DB00671, DB00207;DB01212]"
NCT06181188,"[91670960, 91670961]","[Experimental, Active Comparator]","[Ketamine and Midazolam, Midazolam and Fentanyl]","[[[""Midazolam"", ""Versed""], [""Ketamine"", ""Ketal...","[[[""midazolam"", ""Versed""], [""ketamine"", ""Ketal...","[[""DB00683"", ""DB01221""], [""DB00683"", ""DB00813""]]","[DB00683;DB01221, DB00683;DB00813]"


In [None]:
import requests
from typing import List

def query_clinicaltrials_api(nct_ids: List[str]) -> pd.DataFrame:
    """
    Query ClinicalTrials.gov API v2 for specific NCT IDs and return data about having results
    
    Args:
        nct_ids: List of NCT IDs to query
        
    Returns:
        DataFrame with NCT IDs and whether they have results
    """
    # Base URL for ClinicalTrials.gov API v2
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    
    # Limit to 100 NCT IDs per request to avoid exceeding URL length limits
    results = []
    print(len(nct_ids))
    for i in range(0, len(nct_ids), 100):
        batch_nct_ids = nct_ids[i:i+100]
        
        # Build query parameters
        query_params = {
            "query.id": ",".join(batch_nct_ids),
            "fields": "NCTId,hasResults",
            "format": "json",
            "pageSize": 100,
        }
        
        try:
            # Make API request
            response = requests.get(base_url, params=query_params)
            response.raise_for_status()
            
            # Process response
            studies_data = response.json()
            
            # Extract relevant information
            for study in studies_data.get("studies", []):
                results.append({
                    "nct_id": study.get("protocolSection", {}).get("identificationModule", {}).get("nctId", ""),
                    "has_results": study.get("hasResults", False),
                    "results_first_submit_date": study.get("resultsFirstSubmitDate", None),
                })
                
        except requests.exceptions.RequestException as e:
            print(f"Error querying ClinicalTrials.gov API: {e}")
            continue
        
    # Convert results to DataFrame
    return pd.DataFrame(results)


In [None]:
nct_results_df = query_clinicaltrials_api(nct_ids_to_query)

print(f"Total trials queried: {len(nct_results_df)}")
print(f"Trials with results: {nct_results_df['has_results'].sum()}")

nct_results_df.head()

408
Total trials queried: 408
Trials with results: 171


Unnamed: 0,nct_id,has_results,results_first_submit_date
0,NCT00536601,True,
1,NCT00124566,False,
2,NCT00798460,False,
3,NCT01005680,True,
4,NCT00473694,True,


In [20]:
nct_ids_multi_combo_same_trial_has_results = nct_results_df.query("has_results")["nct_id"].values
design_groups_multi_combo_same_trial = design_groups_multi_combo_same_trial.query("nct_id in @nct_ids_multi_combo_same_trial_has_results")

np.save("./nct_ids_multi_combo_same_trial_has_results.npy", nct_ids_multi_combo_same_trial_has_results)
design_groups_multi_combo_same_trial.to_pickle("./design_groups_multi_combo_same_trial.pkl")

In [21]:
trials_metadata_filtered_multi_combo_same_trial = trials_metadata_filtered.query("nct_id in @nct_ids_multi_combo_same_trial_has_results")
trials_metadata_filtered_multi_combo_same_trial["num_patients_per_arm"] = trials_metadata_filtered_multi_combo_same_trial["enrollment"] / trials_metadata_filtered_multi_combo_same_trial["number_of_arms"]
clinical_trial_combos_multi_combo_same_trial = clinical_trial_combos.query("nct_id in @nct_ids_multi_combo_same_trial_has_results")
conditions_multi_combo_same_trial = conditions.query("nct_id in @nct_ids_multi_combo_same_trial_has_results").drop_duplicates(subset=["nct_id", "condition"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trials_metadata_filtered_multi_combo_same_trial["num_patients_per_arm"] = trials_metadata_filtered_multi_combo_same_trial["enrollment"] / trials_metadata_filtered_multi_combo_same_trial["number_of_arms"]


In [22]:
trials_metadata_filtered_multi_combo_same_trial.to_pickle("./trials_metadata_filtered_multi_combo_same_trial.pkl")
clinical_trial_combos_multi_combo_same_trial.to_pickle("./clinical_trial_combos_multi_combo_same_trial.pkl")
conditions_multi_combo_same_trial.to_pickle("./conditions_multi_combo_same_trial.pkl")

## Get AEs for these trials

In [None]:
nct_ids_multi_combo_same_trial_has_results = np.load("./nct_ids_multi_combo_same_trial_has_results.npy", allow_pickle=True)
design_groups_multi_combo_same_trial = pd.read_pickle("./design_groups_multi_combo_same_trial.pkl")
trials_metadata_filtered_multi_combo_same_trial = pd.read_pickle("./trials_metadata_filtered_multi_combo_same_trial.pkl")
conditions_multi_combo_same_trial = pd.read_pickle("./conditions_multi_combo_same_trial.pkl")

Has at least 20 participants per arm on average

In [None]:
PER_ARM_THRESHOLD = 20
trials_metadata_filtered_multi_combo_same_trial = trials_metadata_filtered_multi_combo_same_trial.dropna(subset=["number_of_arms"]).query("num_patients_per_arm >= @PER_ARM_THRESHOLD").sort_values("num_patients_per_arm", ascending=False)
trials_metadata_filtered_multi_combo_same_trial.head(10)

Unnamed: 0,nct_id,study_start_date,overall_status,phase,completion_date,enrollment,enrollment_type,number_of_arms,number_of_groups,why_stopped,study_start_year,num_patients_per_arm
433,NCT00069121,2003-04-18,Completed,Phase 3,2011-04-21,1886.0,Actual,2.0,,,2003,943.0
3421,NCT00856284,2009-03-31,Completed,Phase 3,2012-10-31,2639.0,Actual,3.0,,,2009,879.666667
8475,NCT02293395,2015-04-20,Completed,Phase 2,2016-10-14,3037.0,Actual,4.0,,,2015,759.25
4350,NCT01120691,2010-04-30,Completed,Phase 3,2012-07-31,2224.0,Actual,3.0,,,2010,741.333333
2677,NCT00660907,2008-03-31,Completed,Phase 3,2013-01-31,1217.0,Actual,2.0,,,2008,608.5
2824,NCT00701090,2008-05-31,Completed,Phase 3,2009-10-31,1035.0,Actual,2.0,,,2008,517.5
15610,NCT04640168,2020-12-02,Completed,Phase 3,2021-06-18,1010.0,Actual,2.0,,,2020,505.0
3819,NCT00968812,2009-09-30,Completed,Phase 3,2013-01-31,1452.0,Actual,3.0,,,2009,484.0
1325,NCT00343460,2006-06-30,Completed,Phase 3,2009-02-28,1428.0,Actual,3.0,,,2006,476.0
5773,NCT01568866,2012-06-20,Completed,Phase 3,2018-02-05,929.0,Actual,2.0,,,2012,464.5


Use [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) to pull CT AE data. Please clone https://github.com/mims-harvard/ToolUniverse before running the cell.

In [None]:
import sys
sys.path.append('/path/to/ToolUniverse/src')
from tooluniverse import ToolUniverse

tooluni = ToolUniverse()
tooluni.load_tools()
nct_ids = trials_metadata_filtered_multi_combo_same_trial["nct_id"].values.tolist()
nct_ids += [
    # "NCT04640168", 
    # "NCT01568866", 
    # "NCT00045162",  # the above three are in the list already
    "NCT00575588",
    "NCT03151811",
]  # NOTE: Supplement with missed but known good trials from OpenAI Deep Research

query = {
    "name": "extract_clinical_trial_adverse_events", 
    "arguments":{
        "nct_ids": nct_ids, 
        "adverse_event_type":"all"
    }
}
ae_results = tooluni.run(query)

In [9]:
for res in ae_results:
    if "serious_adverse_events" not in res:
        print(res["NCT ID"], "missing serious AEs")
    if "other_adverse_events" not in res:
        print(res["NCT ID"], "missing other AEs")
    if "serious_adverse_events" not in res or "other_adverse_events" not in res:
        continue
    if len(res["other_adverse_events"]) <= 2 and len(res["serious_adverse_events"]) <= 2:
        print(res["NCT ID"], "too few AEs")

NCT00926796 missing serious AEs
NCT02486627 missing other AEs
NCT04510194 missing serious AEs
NCT00758836 missing serious AEs
NCT00406393 missing other AEs
NCT02833948 too few AEs
NCT02847494 missing serious AEs
NCT02384070 missing serious AEs
NCT02384070 missing other AEs
NCT01709318 missing serious AEs
NCT04726969 missing serious AEs
NCT02494180 missing serious AEs
NCT02494180 missing other AEs
NCT03068897 missing serious AEs
NCT00322465 missing serious AEs
NCT00801138 missing serious AEs
NCT00801138 missing other AEs
NCT02486328 missing serious AEs
NCT02486328 missing other AEs
NCT01848899 missing serious AEs
NCT02260648 missing serious AEs
NCT01660191 missing serious AEs
NCT02597907 missing serious AEs
NCT02597907 missing other AEs
NCT02787057 missing serious AEs
NCT00803010 missing other AEs
NCT02653144 missing serious AEs
NCT02653144 missing other AEs
NCT02972502 missing serious AEs
NCT02972502 missing other AEs
NCT01822548 missing serious AEs
NCT02024724 missing serious AEs
NCT0

Manually examine all trials. Either exclude, find the correct arm pairs for comparison, or just leave it as it is (for those with two comparable arms)

In [10]:
from itertools import combinations

to_remove_nct_ids = [
    "NCT02014558",  # Cannot resolve dosages
    "NCT02039674",  # Primary comparator is `Pembrolizumab`
    "NCT01909804",  # No combo comparison can be made
    "NCT02049957",  # No combo comparison can be made for arms with sufficient sample size
    "NCT01415752",  # No combo comparison can be made
    "NCT02024607",  # No combo comparison can be made for arms with sufficient sample size
    "NCT00536601",  # Unrelated therapies being tested
    "NCT01709318",  # Cannot resolve dosages
    "NCT02142738",  # Primary comparator is `Pembrolizumab`
    "NCT04006288",  # No AEs
    "NCT01546038",  # No combo comparison can be made
    "NCT00555399",  # No combo comparison can be made for arms with sufficient sample size
    "NCT02293395",  # No combo comparison can be made
    "NCT00004259",  # No combo comparison can be made
    "NCT02272790",  # Cannot resolve dosages
    "NCT01188551",  # NO AEs
    "NCT01867710",  # Cannot resolve dosages
    "NCT00801138",  # No AEs
    "NCT03068897",  # No AEs
    "NCT01119066",  # Unrelated therapies being tested
    "NCT01351753",  # No AEs
    "NCT02430090",  # No AEs
    "NCT02616523",  # No AEs
    "NCT02525796",  # No AEs
    "NCT03303339",  # No combo comparison can be made for arms with sufficient sample size
    "NCT01114971",  # No combo comparison can be made
    "NCT03649711",  # No AEs
    "NCT02279719",  # No combo comparison can be made for arms with sufficient sample size
    "NCT02653144",  # No AEs
    "NCT02981342",  # No combo comparison can be made
    "NCT02677922",  # No combo comparison can be made
    "NCT01660191",  # No AEs
    "NCT01120691",  # No combo comparison can be made
    "NCT00542269",  # No AEs
    "NCT00718315",  # topical formulation
    "NCT00993655",  # No combo comparison can be made
    "NCT00343460",  # No combo comparison can be made
    "NCT00191646",  # No combo comparison can be made
    "NCT01308567",  # Cannot resolve dosages
    "NCT02085408",  # No combo comparison can be made 
    "NCT02486627", "NCT04510194", "NCT00406393", "NCT02384070", "NCT01953926", "NCT02494180", "NCT02725268", "NCT02486328", "NCT01848899",  # Not enough AEs
    "NCT00856284",  # Cannot resolve dosages
    "NCT02597907",  # No AEs
    "NCT02787057",  # No AEs
    "NCT00803010",  # No AEs
    "NCT02972502",  # No AEs
    "NCT01718353",  # No combo comparison can be made for arms with sufficient sample size
    "NCT02024724",  # No AEs
    "NCT02096354",  # No combo comparison can be made for arms with sufficient sample size
    "NCT01932762",  # No combo comparison can be made for arms with sufficient sample size
    "NCT00338962",  # No AEs
    "NCT00429026",  # No combo comparison can be made
    
]
# Manually check trials with >= 5 arms
limit_groups_nct_ids = {
    "NCT00112502": list(combinations(['Arm IV: TMZ + Isotretinoin', 'Arm II: TMZ + Thalidomide', 'Arm III: TMZ + Celecoxib'], 2)),
    "NCT01259297": [["Double Blind Period: Aliskiren + Hydrochlorothiazide (HCTZ)", "Double Blind Period: Aliskiren + Amlodipine"]],
    "NCT02599324": [["Cohort 1 (RCC) Phase 2: Ibrutinib 840 mg + Everolimus", "Cohort 2 (UC) Phase 2: Ibrutinib 840 mg + Paclitaxel"]],
    "NCT04510194": [['Treatment Arm - Metformin and Fluvoxamine Group', 'Treatment Arm - Metformin and Ivermectin Group']],
    "NCT01289119": [['Pioglitazone + Alogliptin Add-on Therapy', 'Metformin + Alogliptin Add-on Therapy']],
    "NCT01953926": [['Neratinib + Fulvestrant', 'Neratinib + Paclitaxel']],
    "NCT01215851": list(combinations(['TMC207 and PA-824', 'PA-824 and Pyrazinamide', 'TMC207 and Pyrazinamide'], 2)),
    "NCT04413617": [['PF-06650833 400mg MR + Tofacitinib 11mg MR', 'PF-06650833 400mg MR + PF-06651600 100mg']],
    "NCT01052272": [['Ramipril and Allopurinol', 'Candesartan cilexetil and Allopurinol']],
    "NCT00338962": [['Paroxetine and Naltrexone', 'Desipramine and Naltrexone']],
    "NCT01932762": [['GT2: Grazoprevir + RBV (Arm B1)', 'GT 4,5,6: Grazoprevir + Elbasvir (Arm B3)']],
    "NCT02725268": [['Paclitaxel 80 mg/m^2 + Sapanisertib 4 mg', 'Sapanisertib 4 mg + MLN1117 200 mg']],
    "NCT01106677": [['Canagliflozin 100 mg: Baseline to Week 52', 'Sitagliptin 100mg: Baseline to Week 52']],
    "NCT00322465": [['Azithromycin + Tinidazole', 'Doxycycline + Tinidazole']],
    "NCT01505114": list(combinations(['Arm 4', 'Arm 2', 'Arm 3'], 2)),
    "NCT00379821": [['Chloroquine Plus Artesunate', 'CQ Plus Azithromycin']],
    "NCT00758836": [['Telcagepant 280 mg +APAP 1000 mg', 'Telcagepant 280 mg +Ibuprofen 400 mg']],
    "NCT00356525": [['One Year or Greater: Pemetrexed + Carboplatin', 'One Year or Greater: Pemetrexed + Gemcitabine']],
    "NCT01659658": list(combinations(['Arm A: Ixazomib + Dexamethasone', 'Arm B: Dexamethasone + Cyclophosphamide', 'Arm B: Dexamethasone + Thalidomide', 'Arm B: Dexamethasone + Lenalidomide', 'Arm B: Dexamethasone + Melphalan'], 2)),
    "NCT02294396": list(combinations(['Mirabegron + Imidafenacin', 'Mirabegron + Solifenacin', 'Mirabegron + Tolterodine', 'Mirabegron + Propiverine'], 2)),
    "NCT00191854": list(combinations(["Gemicitabine + Paclitaxel", "Gemcitabine + Carboplatin", "Gemcitabine + Cisplatin"], 2)),
    "NCT01436643": list(combinations(['Fluoxetine and Fingolimod', 'Citalopram and Fingolimod', 'Venlafaxine and Fingolimod'], 2)),
    "NCT02448537": [["Stratum A", "Stratum B"]],
    "NCT05487196": [['Clonidine (Maternal)', 'Dexmedetomidine (Maternal)', 'Ropivacaine + Fentanyl (Maternal)']], 
    "NCT02260648": [['Evacetrapib', 'Ezetimibe']],
    "NCT02684058": [["LGG Cohort: Carboplatin and Vincristine (On-treatment)", "LGG Cohort: Dabrafenib and Trametinib (On-treatment)"]],
    "NCT00308750": [["Pemetrexed/Carboplatin", "Docetaxel/Carboplatin"]],
    "NCT04726969": [["Arm A: Moxidectin and Albendazole", "Arm C: Ivermectin and Albendazole"]],
    "NCT00541775": [["Sitagliptin", "Rosiglitazone"]],
    
    "NCT00236899": [
        ["Arm A: Docetaxel and Gemcitabine (Tri-weekly)", "Arm B: Paclitaxel and Gemcitabine (Tri-weekly)"],
        ["Arm C: Docetaxel and Gemcitabine (Weekly)", "Arm D: Paclitaxel and Gemcitabine (Weekly)"],
    ],
    "NCT00473694": [
        ["Rocuronium+Sugammadex", "Rocuronium+Neostigmine"],
        ["Vecuronium+Sugammadex", "Vecuronium+Neostigmine"],
        ["Rocuronium+Sugammadex", "Vecuronium+Sugammadex"],
        ["Rocuronium+Neostigmine", "Vecuronium+Neostigmine"],
    ],
    # "NCT00856284": [
    #     ["Metformin + Alogliptin 12.5 mg", "Metformin + Glipizide"],
    #     ["Metformin + Alogliptin 25 mg", "Metformin + Glipizide"],
    # ],
    "NCT03887130": [
        ["Vinorelbine-Capecitabine (Arm A)", "Gemcitabine-Paclitaxel (Arm B)"],
        ["Vinorelbine-Capecitabine (Arm A)", "Gemcitabine-Docetaxel (Arm C)"],
    ],
    "NCT00968812": [
        ["Canagliflozin 100 mg: Baseline to Week 104", "Glimepiride: Baseline to Week 104"]
    ],
    "NCT01522976": [["Arm 1: Azacitidine/Lenalidomide", "Arm 3: Azacitidine/Vorinostat"]],
    "NCT02831764": [["DTG + 3TC - Double-blind Phase + Open-label Phase", "DTG + TDF/FTC - Double-blind Phase + Open-label Phase"]],
    "NCT02831673": [["DTG + 3TC-Double-blind Phase + Open-label Phase", "DTG + TDF/FTC-Double-blind Phase + Open-label Phase"]],
    "NCT01013740": [['Randomized Phase Lapatinib 1250mg QD + Capecitabine 2000mg/m2', 'Randomized Phase Lapatinib 1250mg QD + Vinorelbine 20mg/m2']],
    "NCT00069121": [['5-FU/LV MAYO CLINIC', 'XELOX'], ['5-FU/LV ROSWELL PARK', 'XELOX']]
}

In [None]:
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from statsmodels.stats.multitest import multipletests

alpha = 0.05
per_arm_threshold = 20  # per arm should have at least this many patients at risk
num_affected_threshold = 3  # at least this many patients affected in at least one arm, for a valid outcome
detectable_threshold = 1  # percentage of patients affected in at least one arm should be at least this much %

significant_aes_dfs = {}
# For all trials
for sample in ae_results:
    nct_id = sample["NCT ID"]
    # remove based on manual checks (on trials with many groups)
    if nct_id in to_remove_nct_ids:
        continue
    
    groups = sample["groups"]
    if nct_id not in {"NCT00575588", "NCT03151811"}:
        title2dbid = pd.DataFrame(design_groups_multi_combo_same_trial.query("nct_id == @nct_id")[["title", "drugbank_identifier_in"]].iloc[0].values.tolist()).T.rename(columns={0: "title", 1: "dbids"})
    elif nct_id == "NCT00575588":
        title2dbid = pd.DataFrame({
            "title": ["Saxagliptin + Metformin", "Glipizide + Metformin"],
            "dbids": ["DB06335;DB00331", "DB01067;DB00331"]
        })
    elif nct_id == "NCT03151811":
        title2dbid = pd.DataFrame({
            "title": ["Arm A: Melflufen+Dexamethasone", "Arm B: Pomalidomide+Dexamethasone"],
            "dbids": ["DB16627;DB01234", "DB08910;DB01234"]
        })
        
    p_vals, comparisons = [], []

    # For all outcomes
    for serious_or_other, rec in zip(*[
        ["serious"] * len(sample.get("serious_adverse_events", [])) + ["other"] * len(sample.get("other_adverse_events", [])),
        sample.get("serious_adverse_events", []) + sample.get("other_adverse_events", []),
    ]):
        # skip low-incidence AEs (at least one arm should have enough patients affected)
        if not any(float(s["percentage"].strip("%")) >= detectable_threshold for s in rec["stats"] if "percentage" in s.keys()): continue
        if not any(int(s["numAffected"]) >= num_affected_threshold for s in rec["stats"] if "numAffected" in s.keys()): continue

        affected = np.array([int(s["numAffected"]) if "numAffected" in s.keys() else 0 for s in rec["stats"]])
        at_risk = np.array([int(s["numAtRisk"]) if "numAtRisk" in s.keys() else 0  for s in rec["stats"]])
        unaffected = at_risk - affected

        for i, j in combinations(range(len(rec["stats"])), 2):
            # decide whether to skip this comparison based on per-arm sample size thresholds
            esc = False
            for group in groups:
                if rec["stats"][i]["groupId"] == group["id"]:
                    group_title_i = group["title"]
                    if serious_or_other == "serious":
                        if "seriousNumAtRisk" in group and group["seriousNumAtRisk"] < per_arm_threshold:
                            esc = True
                    else:
                        if "otherNumAtRisk" in group and group["otherNumAtRisk"] < per_arm_threshold:
                            esc = True
                if rec["stats"][j]["groupId"] == group["id"]:
                    group_title_j = group["title"]
                    if serious_or_other == "serious":
                        if "seriousNumAtRisk" in group and group["seriousNumAtRisk"] < per_arm_threshold:
                            esc = True
                    else:
                        if "otherNumAtRisk" in group and group["otherNumAtRisk"] < per_arm_threshold:
                            esc = True
            if esc: continue
            
            # # decide whether to skip this comparison based on whether the combo is in design table
            # if not set([groups[i]["title"], groups[j]["title"]]).issubset(title2dbid["title"].values.tolist()):
            #     continue
            # NOTE: the above is commented because there are many trials whose titles are not consistent with the design table. instead, we manually check the titles
            if nct_id in limit_groups_nct_ids.keys():
                if not sum([((pair[0] == group_title_i) and (pair[1] == group_title_j)) or ((pair[1] == group_title_i) and (pair[0] == group_title_j)) for pair in limit_groups_nct_ids[nct_id]]) > 0:
                    continue
                elif sum([((pair[0] == group_title_i) and (pair[1] == group_title_j)) or ((pair[1] == group_title_i) and (pair[0] == group_title_j)) for pair in limit_groups_nct_ids[nct_id]]) > 1:
                    raise Exception
            
            table = np.array([[affected[i],   affected[j]],
                            [unaffected[i], unaffected[j]]])
            prior_odds, p = fisher_exact(table, alternative="two-sided")
            odds = odds_ratio(table).statistic
            p_vals.append(p)
            comparisons.append((rec["term"], 
                                serious_or_other,
                                group_title_i, 
                                group_title_j, 
                                groups[i].get("description", ""),
                                groups[j].get("description", ""),
                                affected[i],
                                affected[j],
                                at_risk[i],
                                at_risk[j],
                                odds))

    # FWER-control with Bonferroni (or switch to 'fdr_bh' for FDR)
    reject, adj_p, _, _ = multipletests(p_vals, alpha=alpha, method="bonferroni")

    significant_aes = {}
    for (term, serious_or_other, arm_i, arm_j, desc_i, desc_j, affected_i, affected_j, at_risk_i, at_risk_j, odds), p_raw, p_corr, keep in zip(comparisons, p_vals, adj_p, reject):
        if keep:
            significant_aes.setdefault((arm_i, arm_j), {})
            significant_aes[(arm_i, arm_j)].setdefault(term, []).append({
                # "pair"      : f"{arm_i} vs {arm_j}",
                "serious_or_other": serious_or_other,
                "arm_i"    : arm_i,
                "arm_j"    : arm_j,
                "description_i": desc_i,
                "description_j": desc_j,
                "affected_i": affected_i,
                "affected_j": affected_j,
                "at_risk_i" : at_risk_i,
                "at_risk_j" : at_risk_j,
                "odds_ratio": odds,
                "p_raw"     : p_raw,
                "p_corr"    : p_corr,
            })
                    
    significant_aes_dfs[nct_id] = {}
    if nct_id in design_groups_multi_combo_same_trial.index.values or (nct_id in {"NCT00575588", "NCT03151811"}):
        for pair, dct in significant_aes.items():
            df = pd.DataFrame([
                {"term": term, **entry}
                for term, lst in dct.items()
                for entry in lst
            ]).merge(
                title2dbid, left_on="arm_i", right_on="title", how="left"
            ).drop(columns=["title"]).rename(columns={"dbids":"arm_i_dbids"}).merge(
                title2dbid, left_on="arm_j", right_on="title", how="left"
            ).drop(columns=["title"]).rename(columns={"dbids":"arm_j_dbids"})
            significant_aes_dfs[nct_id][pair] = df
    else:
        print(nct_id, "not found in design_groups_multi_combo_same_trial")
        for pair, dct in significant_aes.items():
            df = pd.DataFrame([
                {"term": term, **entry}
                for term, lst in dct.items()
                for entry in lst
            ])
            significant_aes_dfs[nct_id][pair] = df

In [None]:
import pickle
pickle.dump({
    "ae_results": ae_results,
    "significant_aes_dfs": significant_aes_dfs,
    "trials_metadata_filtered_multi_combo_same_trial": trials_metadata_filtered_multi_combo_same_trial,
    "design_groups_multi_combo_same_trial": design_groups_multi_combo_same_trial,
    "conditions_multi_combo_same_trial": conditions_multi_combo_same_trial,
}, open("ctg_20.pkl", "wb"))

## Check those trial AEs

In [8]:
import pickle

temp = pickle.load(open("ctg_20.pkl", "rb"))
ae_results = temp["ae_results"]
significant_aes_dfs = temp["significant_aes_dfs"]
trials_metadata_filtered_multi_combo_same_trial = temp["trials_metadata_filtered_multi_combo_same_trial"]
design_groups_multi_combo_same_trial = temp["design_groups_multi_combo_same_trial"]
conditions_multi_combo_same_trial = temp["conditions_multi_combo_same_trial"]

In [9]:
significant_aes_dfs = {
    k: v
    for k, v in significant_aes_dfs.items()
    if len(v) > 0
}
print(len(significant_aes_dfs))
print(significant_aes_dfs.keys())

35
dict_keys(['NCT00069121', 'NCT00660907', 'NCT00701090', 'NCT00968812', 'NCT01568866', 'NCT00363415', 'NCT04163900', 'NCT00417079', 'NCT00045162', 'NCT01106677', 'NCT01808573', 'NCT00926796', 'NCT02049814', 'NCT02491983', 'NCT00191152', 'NCT00379821', 'NCT00520676', 'NCT02847494', 'NCT01522976', 'NCT01289119', 'NCT00308750', 'NCT00325234', 'NCT00171054', 'NCT03170882', 'NCT00236899', 'NCT01013740', 'NCT03887130', 'NCT02684058', 'NCT03382600', 'NCT02599324', 'NCT01527487', 'NCT01822548', 'NCT02463331', 'NCT00575588', 'NCT03151811'])


In [6]:
title2dbids = design_groups_multi_combo_same_trial[["title", "drugbank_identifier_in"]].explode(["title", "drugbank_identifier_in"]).rename(columns={"drugbank_identifier_in": "dbids"}).reset_index().drop_duplicates().set_index("nct_id").sort_index()
title2dbids.head(5)

Unnamed: 0_level_0,title,dbids
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00004259,Pilot Arm #1: RT+TMZ+BCNU,DB00262;DB00853
NCT00004259,Pilot Arm #2: RT+TMZ+BCNU,DB00262;DB00853
NCT00004259,RT + BCNU/CCNU,DB00262;DB01206
NCT00033657,Paclitaxel / Cisplatin / Radiation therapy (Ar...,DB01229;DB04620
NCT00033657,Cisplatin / Irinotecan / Radiation therapy (Ar...,DB00762;DB04620


Manually map titles in CDCDB to titles in CTG

In [7]:
arm2title = {
    "NCT01568866": {
        "Bortezomib": "Bortezomib plus Dexamethasone",
        "Carfilzomib": "Carfilzomib plus Dexamethasone",
    },
    "NCT00045162": {
        "Cisplatin + Etoposide": "2",
        "Cisplatin + Irinotecan": "1",
    },
    "NCT00069121": {
        "5-FU/LV MAYO CLINIC": '5-Fluorouracil/Leucovorin (5-FU/LV)',
        "XELOX": 'Capecitabine in Combination with Oxaliplatin (XELOX)',
        "5-FU/LV ROSWELL PARK": '5-Fluorouracil/Leucovorin (5-FU/LV)',
    },
    "NCT00660907": {
        'Dapagliflozin Plus Metformin': '1',
        'Glipizide Plus Metformin': '2',
    },
    "NCT00701090": {
        "Sitagliptin": "1",
        "Glimepiride": "2",
    },
    "NCT00968812": {
        "Canagliflozin 100 mg: Baseline to Week 104": "Canagliflozin 100 mg",
        "Glimepiride: Baseline to Week 104": "Glimepiride",
    },
    "NCT00363415": {
        "Pemetrexed + Carboplatin": 'A',
        "Etoposide + Carboplatin": 'B',
    },
    "NCT04163900": {
        "A - NUC-1031 and Cisplatin": "A - NUC-1031 and cisplatin",
        "B - Gemcitabine and Cisplatin": "B - gemcitabine and cisplatin",
    },
    "NCT01106677": {
        "Canagliflozin 100 mg: Baseline to Week 52": "Canagliflozin 100 mg",
        "Sitagliptin 100mg: Baseline to Week 52": "Sitagliptin 100 mg",
    },
    "NCT00191152": {
        "Gemcitabine Plus Docetaxel": "Gemcitabine + Docetaxel",
        "Docetaxel Plus Capecitabine": "Capecitabine + Docetaxel",
    },
    "NCT00520676": {
        "Pemetrexed Plus Carboplatin": "pemetrexed plus carboplatin",
        "Docetaxel Plus Carboplatin": "docetaxel plus carboplatin",
    },
    "NCT00236899":{
        "Arm A: Docetaxel and Gemcitabine (Tri-weekly)": "A: Docetaxel and Gemcitabine (Tri-weekly)",
        "Arm B: Paclitaxel and Gemcitabine (Tri-weekly)": "B: Paclitaxel and Gemcitabine (Tri-weekly)",
    },
    "NCT01013740": {
        'Randomized Phase Lapatinib 1250mg QD + Capecitabine 2000mg/m2': "Lapatinib + Capecitabine",
        "Randomized Phase Lapatinib 1250mg QD + Vinorelbine 20mg/m2": "Lapatinib + Vinorelbine",
    },
    "NCT02684058": {
        "LGG Cohort: Dabrafenib and Trametinib (On-treatment)": "LGG cohort: dabrafenib and trametinib",
        "LGG Cohort: Carboplatin and Vincristine (On-treatment)": "LGG cohort: carboplatin and vincristine",
    },
    "NCT01808573": {
        "Neratinib Plus Capecitabine": "neratinib plus capecitabine",
        "Lapatinib Plus Capecitabine": "lapatinib plus capecitabine",
    },
    "NCT00926796": {
        "Regimen A: Gentamicin Plus Azithromycin": "Regimen A: gentamicin plus azithromycin",
        "Regimen B: Gemifloxacin Plus Azithromycin": "Regimen B: gemifloxacin plus azithromycin",
    },
    "NCT00379821": {
        "Chloroquine Plus Artesunate": "CQ plus artesunate",
        "CQ Plus Azithromycin": "CQ plus azithromycin",
    },
    "NCT01522976": {
        "Arm 1: Azacitidine/Lenalidomide": "Arm I (azacitidine and lenalidomide)",
        "Arm 3: Azacitidine/Vorinostat": "Arm III (azacitidine and vorinostat)",
    },
    "NCT03887130": {
        "Vinorelbine-Capecitabine (Arm A)": "Vinorelbine-Capecitabine (arm A)",
        "Gemcitabine-Paclitaxel (Arm B)": "Gemcitabine-Paclitaxel (arm B)",
        "Gemcitabine-Docetaxel (Arm C)": "Gemcitabine-Docetaxel (arm C)",
    },
    "NCT02599324": {
        "Cohort 1 (RCC) Phase 2: Ibrutinib 840 mg + Everolimus": "Cohort 1: Renal Cell Carcinoma (RCC)",
        "Cohort 2 (UC) Phase 2: Ibrutinib 840 mg + Paclitaxel": "Cohort 2: Urothelial Carcinoma (UC)",
    },
    "NCT01822548": {
        "Vildagliptin & Metformin": "Vildagliptin & metformin",
        "Glibenclamide & Metformin": "Glibenclamide & metformin",
    },
    "NCT02463331": {
        "Chloroquine Plus Prednisone": "chloroquine plus prednisone",
        "Azathioprine Plus Prednisone": "azathioprine plus prednisone",
    },
}

## Get scores

In [8]:
import json
outcome_mapper = json.load(open("../outcome_mapper.json", "r"))
outcome_inds_mapper = {
    ae_dfci: {source: ([drugbank_ddi_classes.tolist().index(ae_db) for ae_db in aes_db] if source == "drugbank" else [twosides_ddi_classes.tolist().index(ae_db) for ae_db in aes_db]) for source, aes_db in dct.items()} 
    for ae_dfci, dct in outcome_mapper.items()
}

In [9]:
def post_process(temp):
    temp["i_or_j_or"] = temp["odds_ratio"].apply(lambda x: "i" if x > 1 else "j" if x < 1 else "both")
    temp["i_or_j_closest_twosides"] = temp[["closest_twosides_arm_i", "closest_twosides_arm_j"]].apply(
        lambda row: "i" if row["closest_twosides_arm_i"] > row["closest_twosides_arm_j"] else "j" if row["closest_twosides_arm_i"] < row["closest_twosides_arm_j"] else "both" if (row["closest_twosides_arm_i"] == row["closest_twosides_arm_j"]) and (row["closest_twosides_arm_i"] == row["closest_twosides_arm_i"]) else np.nan,
        axis=1
    )
    temp["i_or_j_closest_drugbank"] = temp[["closest_drugbank_raw_score_arm_i", "closest_drugbank_raw_score_arm_j"]].apply(
        lambda row: "i" if row["closest_drugbank_raw_score_arm_i"] > row["closest_drugbank_raw_score_arm_j"] else "j" if row["closest_drugbank_raw_score_arm_i"] < row["closest_drugbank_raw_score_arm_j"] else "both" if (row["closest_drugbank_raw_score_arm_i"] == row["closest_drugbank_raw_score_arm_j"]) and (row["closest_drugbank_raw_score_arm_i"] == row["closest_drugbank_raw_score_arm_i"]) else np.nan,
        axis=1
    )

In [10]:
import re
term_map = {
    # spelling / simple duplicates
    r"\banaemia\b"                       : "anemia",
    r"\bhypoglycaemia\b"                 : "hypoglycemia",
    r"\bdiarrhea_without_colostomy\b"    : "diarrhoea",

    # neutrophil lineage – comment out any line to keep it separate
    r"\bneutropenia/granulocytopenia\b"  : "neutropenia",
    r"\bfebrile_neutropenia\b"           : "neutropenia",
    r"\binfection_with_3-4_neutropenia\b": "neutropenia",
    r"\bleukopenia\b"                    : "neutropenia",

    # platelet lineage
    r"\bplatelet_count_decreased\b"      : "thrombocytopenia",   # keep raw lab?
    r"\bplatelet_count\b"                : "thrombocytopenia", # toggle if you want to merge

    # peripheral sensory neuropathy
    r"\b(polyneuropathy|peripheral_sensory_neuropathy|"
    r"paraesthesia|dysaesthesia|hypoaesthesia|"
    r"dysaesthesia_pharynx|neuralgia)\b" : "neuropathy_peripheral",

    # injection-site → infusion reaction
    r"\binjection_site_(pain|reaction)\b": "infusion_related_reaction",

    # rash family
    r"\brash_maculo-papular\b"           : "rash",
    r"\brash/desquamation\b"             : "rash",

    # fatigue cluster
    r"\bfatigue/malaise/lethargy\b"      : "fatigue",
    
    # hyperbilirubinemia
    r"\bblood_bilirubin_increased\b"     : "hyperbilirubinemia",
}
regex = re.compile("|".join(term_map.keys()))
count_dct = {}
trial_id_dct = {}

for nct_id in significant_aes_dfs.keys():
    for i in range(len(list(significant_aes_dfs[nct_id].keys()))):
        test_df = significant_aes_dfs[nct_id][list(significant_aes_dfs[nct_id].keys())[i]].copy()
        test_df["term"] = (
            test_df["term"]
                .str.lower()
                .str.replace(" ", "_")
                .replace(term_map, regex=True)   # <-- pandas handles the looping
        )
        for t in test_df["term"].unique():
            if t not in count_dct:
                count_dct[t] = 0
            count_dct[t] += 1
            trial_id_dct.setdefault(t, set()).add(nct_id)

In [13]:
selected_outcomes = [ae for ae, count in count_dct.items() if count >= 5]

all_sig_recs = []
for nct_id in significant_aes_dfs.keys():
    print(nct_id)
    print(list(significant_aes_dfs[nct_id].keys()))
    for i in range(len(list(significant_aes_dfs[nct_id].keys()))):
        test_df = significant_aes_dfs[nct_id][list(significant_aes_dfs[nct_id].keys())[i]].copy()
        test_df["term_old"] = test_df["term"].copy()
        test_df["term"] = test_df["term"].str.lower().str.replace(" ", "_").str.replace("anaemia", "anemia").str.replace("diarrhea_without_colostomy", "diarrhoea").str.replace("neutropenia/granulocytopenia", "neutropenia").str.replace("hypoglycaemia", "hypoglycemia").str.replace("platelet_count", "platelet_count_decreased").replace("peripheral_sensory_neuropathy", "neuropathy_peripheral")
        if (len(set(selected_outcomes) & set(test_df["term"].values)) == 0):
            continue
        if (test_df["arm_i_dbids"].isna().sum() > 0) and (test_df["arm_j_dbids"].isna().sum() > 0):
            test_df["arm_i_dbids"] = test_df["arm_i"].apply(lambda title: title2dbids.loc[nct_id].set_index("title").to_dict()["dbids"][arm2title[nct_id][title]])
            test_df["arm_j_dbids"] = test_df["arm_j"].apply(lambda title: title2dbids.loc[nct_id].set_index("title").to_dict()["dbids"][arm2title[nct_id][title]])
        test_df["arm_i_names"] = test_df["arm_i_dbids"].apply(lambda dbids: tuple([drug_metadata.loc[drug_metadata["node_id"] == dbid, "node_name"].values[0] for dbid in dbids.split(";")]))
        test_df["arm_j_names"] = test_df["arm_j_dbids"].apply(lambda dbids: tuple([drug_metadata.loc[drug_metadata["node_id"] == dbid, "node_name"].values[0] for dbid in dbids.split(";")]))
        test_df = test_df.drop(columns=["description_i", "description_j"]).query("term in @selected_outcomes")
        
        drug_names = np.unique(sum(test_df[["arm_i_names", "arm_j_names"]].iloc[0].values.flatten(), start=tuple()))
        drug_inds = [drug_metadata[drug_metadata["node_name"] == drug_name].index.values[0] for drug_name in drug_names]

        arm_i_drug_1_ind_ind = drug_names.tolist().index(test_df["arm_i_names"].iloc[0][0])
        arm_i_drug_2_ind_ind = drug_names.tolist().index(test_df["arm_i_names"].iloc[0][1])
        arm_j_drug_1_ind_ind = drug_names.tolist().index(test_df["arm_j_names"].iloc[0][0])
        arm_j_drug_2_ind_ind = drug_names.tolist().index(test_df["arm_j_names"].iloc[0][1])

        aes = test_df["term"].tolist()

        # Suppress both prints and warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with contextlib.redirect_stdout(io.StringIO()):
                ctg_test_twosides_scores, _ = get_twosides_scores(
                    outcome_twosides_inds=sum([[o for o in outcome_inds_mapper[ae]["twosides"] if o not in {twosides_ddi_classes.tolist().index("Adverse event"), twosides_ddi_classes.tolist().index("Adverse drug reaction")}] for ae in aes if ae in outcome_inds_mapper], start=[]), 
                    drug_inds=drug_inds, 
                    drug_group_str="ctg_test",
                )
                
        ctg_test_drugbank_scores = get_drugbank_scores(
            outcome_drugbank_inds=sum([[o for o in outcome_inds_mapper[ae]["drugbank"] if o not in {drugbank_ddi_classes.tolist().index("adverse effects, increase"), drugbank_ddi_classes.tolist().index("absorption, increase | serum level, increase | adverse effects, increase")}] for ae in aes if ae in outcome_inds_mapper], start=[]), 
            drug_inds=drug_inds, 
        )

        ctg_test_twosides_scores_df = pd.DataFrame(ctg_test_twosides_scores[700][:, [arm_i_drug_1_ind_ind, arm_j_drug_1_ind_ind], [arm_i_drug_2_ind_ind, arm_j_drug_2_ind_ind]], index=sum([[o for o in outcome_mapper[ae]["twosides"] if o not in {"Adverse event", "Adverse drug reaction"}] for ae in aes if ae in outcome_inds_mapper], start=[]), columns=[tuple(np.array(drug_names)[[arm_i_drug_1_ind_ind, arm_i_drug_2_ind_ind]]), tuple(np.array(drug_names)[[arm_j_drug_1_ind_ind, arm_j_drug_2_ind_ind]])]).drop_duplicates()
        ctg_test_drugbank_scores_df = pd.DataFrame(ctg_test_drugbank_scores[:, [arm_i_drug_1_ind_ind, arm_j_drug_1_ind_ind], [arm_i_drug_2_ind_ind, arm_j_drug_2_ind_ind]], index=sum([[o for o in outcome_mapper[ae]["drugbank"] if o not in {"adverse effects, increase", "absorption, increase | serum level, increase | adverse effects, increase"}] for ae in aes if ae in outcome_inds_mapper], start=[]), columns=[tuple(np.array(drug_names)[[arm_i_drug_1_ind_ind, arm_i_drug_2_ind_ind]]), tuple(np.array(drug_names)[[arm_j_drug_1_ind_ind, arm_j_drug_2_ind_ind]])]).drop_duplicates()

        temp = test_df.drop(columns=["p_raw", "p_corr", "arm_i_dbids", "arm_j_dbids"]).query("term in @outcome_inds_mapper.keys()")
        temp["closest_twosides_name"] = temp["term"].apply(lambda x: outcome_mapper[x]["twosides"][0] if len(outcome_mapper[x]["twosides"]) > 0 else np.nan)
        temp["closest_twosides_arm_i"] = temp[["closest_twosides_name", "arm_i_names"]].apply(
            lambda row:
            ctg_test_twosides_scores_df.at[row["closest_twosides_name"], row["arm_i_names"]]
            if row["closest_twosides_name"] == row["closest_twosides_name"] else np.nan,
            axis=1
        )
        temp["closest_twosides_arm_j"] = temp[["closest_twosides_name", "arm_j_names"]].apply(
            lambda row:
            ctg_test_twosides_scores_df.at[row["closest_twosides_name"], row["arm_j_names"]]
            if row["closest_twosides_name"] == row["closest_twosides_name"] else np.nan,
            axis=1
        )
        temp["closest_drugbank_name"] = temp["term"].apply(lambda x: outcome_mapper[x]["drugbank"][0] if len(outcome_mapper[x]["drugbank"]) > 0 else np.nan)
        temp["closest_drugbank_raw_score_arm_i"] = temp[["closest_drugbank_name", "arm_i_names"]].apply(
            lambda row:
            ctg_test_drugbank_scores_df.at[row["closest_drugbank_name"], row["arm_i_names"]]
            if row["closest_drugbank_name"] == row["closest_drugbank_name"] else np.nan,
            axis=1
        )
        temp["closest_drugbank_raw_score_arm_j"] = temp[["closest_drugbank_name", "arm_j_names"]].apply(
            lambda row:
            ctg_test_drugbank_scores_df.at[row["closest_drugbank_name"], row["arm_j_names"]]
            if row["closest_drugbank_name"] == row["closest_drugbank_name"] else np.nan,
            axis=1
        )
        
        post_process(temp)
        temp["nct_id"] = nct_id
        
        all_sig_recs.append(temp)
    

NCT00069121
[('5-FU/LV MAYO CLINIC', 'XELOX'), ('5-FU/LV ROSWELL PARK', 'XELOX')]
NCT00660907
[('Dapagliflozin Plus Metformin', 'Glipizide Plus Metformin')]
NCT00701090
[('Sitagliptin', 'Glimepiride')]
NCT00968812
[('Canagliflozin 100 mg: Baseline to Week 104', 'Glimepiride: Baseline to Week 104')]
NCT01568866
[('Bortezomib', 'Carfilzomib')]
NCT00363415
[('Pemetrexed + Carboplatin', 'Etoposide + Carboplatin')]
NCT04163900
[('A - NUC-1031 and Cisplatin', 'B - Gemcitabine and Cisplatin')]
NCT00417079
[('Mitoxantrone + Prednisone', 'Cabazitaxel + Prednisone')]
NCT00045162
[('Cisplatin + Irinotecan', 'Cisplatin + Etoposide')]
NCT01106677
[('Canagliflozin 100 mg: Baseline to Week 52', 'Sitagliptin 100mg: Baseline to Week 52')]
NCT01808573
[('Neratinib Plus Capecitabine', 'Lapatinib Plus Capecitabine')]
NCT00926796
[('Regimen A: Gentamicin Plus Azithromycin', 'Regimen B: Gemifloxacin Plus Azithromycin')]
NCT02049814
[('Metformin + Voglibose 0.2 mg', 'Metformin + Acarbose 50 mg')]
NCT02491983

In [None]:
all_sig_recs = pd.concat(all_sig_recs, axis=0)
all_sig_recs.to_pickle("all_sig_recs_20.pkl")

## Examine predictions vs CT data

In [3]:
all_sig_recs = pd.read_pickle("all_sig_recs_20.pkl")
all_sig_recs.shape[0], all_sig_recs["nct_id"].nunique()

(42, 21)

In [16]:
all_sig_recs.reset_index(drop=True)

Unnamed: 0,term,serious_or_other,arm_i,arm_j,affected_i,affected_j,at_risk_i,at_risk_j,odds_ratio,term_old,...,closest_twosides_name,closest_twosides_arm_i,closest_twosides_arm_j,closest_drugbank_name,closest_drugbank_raw_score_arm_i,closest_drugbank_raw_score_arm_j,i_or_j_or,i_or_j_closest_twosides,i_or_j_closest_drugbank,nct_id
0,neuropathy_peripheral,other,5-FU/LV MAYO CLINIC,XELOX,8,279,657,938,0.029156,NEUROPATHY PERIPHERAL,...,Neuropathy peripheral,0.582891,0.265658,,,,j,i,,NCT00069121
1,neuropathy_peripheral,other,5-FU/LV MAYO CLINIC,XELOX,4,152,657,938,0.031713,PERIPHERAL SENSORY NEUROPATHY,...,Neuropathy peripheral,0.582891,0.265658,,,,j,i,,NCT00069121
2,alopecia,other,5-FU/LV MAYO CLINIC,XELOX,159,40,657,938,7.158781,ALOPECIA,...,Alopecia,0.351339,0.314284,,,,i,i,,NCT00069121
3,thrombocytopenia,other,5-FU/LV MAYO CLINIC,XELOX,2,167,657,938,0.014115,THROMBOCYTOPENIA,...,Thrombocytopenia,0.877121,0.736355,"thrombocytopenia, increase",0.001079799,0.01814768,j,i,j,NCT00069121
4,neuropathy_peripheral,other,5-FU/LV ROSWELL PARK,XELOX,12,279,269,938,0.110423,NEUROPATHY PERIPHERAL,...,Neuropathy peripheral,0.582891,0.265658,,,,j,i,,NCT00069121
5,neuropathy_peripheral,other,5-FU/LV ROSWELL PARK,XELOX,11,152,269,938,0.220665,PERIPHERAL SENSORY NEUROPATHY,...,Neuropathy peripheral,0.582891,0.265658,,,,j,i,,NCT00069121
6,neutropenia,other,5-FU/LV ROSWELL PARK,XELOX,36,260,269,938,0.403169,NEUTROPENIA,...,Neutropenia,0.906724,0.748297,"neutropenia, increase",0.003514237,0.4780329,j,i,j,NCT00069121
7,thrombocytopenia,other,5-FU/LV ROSWELL PARK,XELOX,4,167,269,938,0.069768,THROMBOCYTOPENIA,...,Thrombocytopenia,0.877121,0.736355,"thrombocytopenia, increase",0.001079799,0.01814768,j,i,j,NCT00069121
8,hypoglycemia,other,Dapagliflozin Plus Metformin,Glipizide Plus Metformin,14,162,406,408,0.05441,Hypoglycemia,...,Hypoglycaemia,0.05041,0.828699,"hypoglycemia, increase",0.9498105,0.9447691,j,j,i,NCT00660907
9,hypoglycemia,other,Sitagliptin,Glimepiride,36,114,519,518,0.264464,Hypoglycaemia,...,Hypoglycaemia,0.737701,0.790389,"hypoglycemia, increase",0.9127291,0.9628906,j,j,j,NCT00701090


Remove duplicated rows (same entity different names, serious/other)

In [17]:
all_sig_recs_filtered = all_sig_recs.reset_index(drop=True).drop(index=[1, 4, 5, 7, 13, 19])  # NOTE: because they are repeated (serious & other, different names but same entities, etc.)
all_sig_recs_filtered = all_sig_recs_filtered[all_sig_recs_filtered["term"].isin(all_sig_recs_filtered["term"].value_counts()[all_sig_recs_filtered["term"].value_counts() >= 5].index.values)]

Calculate compound score: Compare using DrugBank model scores --> if difference > 0.1 then report; else compare TWOSIDES model scores

In [18]:
all_sig_recs_filtered["i_or_j_closest_drugbank_then_closest_twosides"] = all_sig_recs_filtered[["closest_drugbank_raw_score_arm_i", "closest_drugbank_raw_score_arm_j", "closest_twosides_arm_i", "closest_twosides_arm_j"]].apply(
    lambda row: 
        "i" if 
        ((row["closest_drugbank_raw_score_arm_i"] == row["closest_drugbank_raw_score_arm_i"]) and (row["closest_drugbank_raw_score_arm_i"] >= row["closest_drugbank_raw_score_arm_j"]+0.1)) or 
        ((row["closest_drugbank_raw_score_arm_i"] == row["closest_drugbank_raw_score_arm_i"]) and (row["closest_drugbank_raw_score_arm_i"] > row["closest_drugbank_raw_score_arm_j"]-0.1) and (row["closest_twosides_arm_i"] > row["closest_twosides_arm_j"])) or
        ((row["closest_drugbank_raw_score_arm_i"] != row["closest_drugbank_raw_score_arm_i"]) and (row["closest_twosides_arm_i"] > row["closest_twosides_arm_j"]))
        else "j" if 
        ((row["closest_drugbank_raw_score_arm_i"] == row["closest_drugbank_raw_score_arm_i"]) and (row["closest_drugbank_raw_score_arm_i"]+0.1 <= row["closest_drugbank_raw_score_arm_j"])) or 
        ((row["closest_drugbank_raw_score_arm_i"] == row["closest_drugbank_raw_score_arm_i"]) and (row["closest_drugbank_raw_score_arm_i"]-0.1 < row["closest_drugbank_raw_score_arm_j"]) and (row["closest_twosides_arm_i"] < row["closest_twosides_arm_j"])) or
        ((row["closest_drugbank_raw_score_arm_i"] != row["closest_drugbank_raw_score_arm_i"]) and (row["closest_twosides_arm_i"] < row["closest_twosides_arm_j"]))
        else np.nan, 
    axis=1
)

In [19]:
ae = "neutropenia"
all_sig_recs_filtered.query("term == @ae")[["term", "term_old", "nct_id", "arm_i", "arm_j", "odds_ratio", "i_or_j_or", "i_or_j_closest_drugbank_then_closest_twosides"]].round(2)

Unnamed: 0,term,term_old,nct_id,arm_i,arm_j,odds_ratio,i_or_j_or,i_or_j_closest_drugbank_then_closest_twosides
6,neutropenia,NEUTROPENIA,NCT00069121,5-FU/LV ROSWELL PARK,XELOX,0.4,j,j
14,neutropenia,Neutropenia,NCT00363415,Pemetrexed + Carboplatin,Etoposide + Carboplatin,0.18,j,j
17,neutropenia,Neutropenia,NCT04163900,A - NUC-1031 and Cisplatin,B - Gemcitabine and Cisplatin,0.55,j,j
20,neutropenia,Neutropenia/granulocytopenia,NCT00045162,Cisplatin + Irinotecan,Cisplatin + Etoposide,0.34,j,i
25,neutropenia,Neutropenia,NCT00191152,Gemcitabine Plus Docetaxel,Docetaxel Plus Capecitabine,9.14,i,i
27,neutropenia,Neutropenia,NCT00520676,Pemetrexed Plus Carboplatin,Docetaxel Plus Carboplatin,0.26,j,j
29,neutropenia,Neutropenia,NCT03170882,Pomalidomide 4 mg + Dexamethasone 40 mg,Ixazomib 4 mg + Dexamethasone 20 mg,50.91,i,i
30,neutropenia,Neutropenia,NCT00236899,Arm A: Docetaxel and Gemcitabine (Tri-weekly),Arm B: Paclitaxel and Gemcitabine (Tri-weekly),4.82,i,j
31,neutropenia,Neutropenia,NCT01013740,Randomized Phase Lapatinib 1250mg QD + Capecit...,Randomized Phase Lapatinib 1250mg QD + Vinorel...,0.11,j,j


In [21]:
ae = "anemia"
all_sig_recs_filtered.query("term == @ae")[["term", "term_old", "nct_id", "arm_i", "arm_j", "odds_ratio", "i_or_j_or", "i_or_j_closest_drugbank_then_closest_twosides"]].round(2)

Unnamed: 0,term,term_old,nct_id,arm_i,arm_j,odds_ratio,i_or_j_or,i_or_j_closest_drugbank_then_closest_twosides
15,anemia,ANAEMIA,NCT01568866,Bortezomib,Carfilzomib,0.54,j,i
20,anemia,Anaemia,NCT04163900,A - NUC-1031 and Cisplatin,B - Gemcitabine and Cisplatin,0.37,j,j
35,anemia,Anaemia,NCT00191152,Gemcitabine Plus Docetaxel,Docetaxel Plus Capecitabine,2.57,i,i
49,anemia,Anaemia,NCT02684058,LGG Cohort: Dabrafenib and Trametinib (On-trea...,LGG Cohort: Carboplatin and Vincristine (On-tr...,0.16,j,j
55,anemia,Anaemia,NCT03151811,Arm A: Melflufen+Dexamethasone,Arm B: Pomalidomide+Dexamethasone,3.19,i,i


In [22]:
ae = "alopecia"
all_sig_recs_filtered.query("term == @ae")[["term", "term_old", "nct_id", "arm_i", "arm_j", "odds_ratio", "i_or_j_or", "i_or_j_closest_drugbank_then_closest_twosides"]].round(2)

Unnamed: 0,term,term_old,nct_id,arm_i,arm_j,odds_ratio,i_or_j_or,i_or_j_closest_drugbank_then_closest_twosides
4,alopecia,ALOPECIA,NCT00069121,5-FU/LV MAYO CLINIC,XELOX,7.16,i,i
19,alopecia,Alopecia,NCT00363415,Pemetrexed + Carboplatin,Etoposide + Carboplatin,0.12,j,j
32,alopecia,Alopecia,NCT00045162,Cisplatin + Irinotecan,Cisplatin + Etoposide,0.42,j,j
40,alopecia,Alopecia,NCT00520676,Pemetrexed Plus Carboplatin,Docetaxel Plus Carboplatin,0.14,j,j
45,alopecia,Alopecia,NCT03887130,Vinorelbine-Capecitabine (Arm A),Gemcitabine-Paclitaxel (Arm B),0.11,j,j
47,alopecia,Alopecia,NCT03887130,Vinorelbine-Capecitabine (Arm A),Gemcitabine-Docetaxel (Arm C),0.08,j,j
50,alopecia,Alopecia,NCT02684058,LGG Cohort: Dabrafenib and Trametinib (On-trea...,LGG Cohort: Carboplatin and Vincristine (On-tr...,0.08,j,j
52,alopecia,ALOPECIA,NCT02599324,Cohort 1 (RCC) Phase 2: Ibrutinib 840 mg + Eve...,Cohort 2 (UC) Phase 2: Ibrutinib 840 mg + Pacl...,0.0,j,i


In [23]:
ae = "hypoglycemia"
all_sig_recs_filtered.query("term == @ae")[["term", "term_old", "nct_id", "arm_i", "arm_j", "odds_ratio", "i_or_j_or", "i_or_j_closest_drugbank_then_closest_twosides"]].round(2)

Unnamed: 0,term,term_old,nct_id,arm_i,arm_j,odds_ratio,i_or_j_or,i_or_j_closest_drugbank_then_closest_twosides
12,hypoglycemia,Hypoglycemia,NCT00660907,Dapagliflozin Plus Metformin,Glipizide Plus Metformin,0.05,j,j
13,hypoglycemia,Hypoglycaemia,NCT00701090,Sitagliptin,Glimepiride,0.26,j,j
14,hypoglycemia,Hypoglycaemia,NCT00968812,Canagliflozin 100 mg: Baseline to Week 104,Glimepiride: Baseline to Week 104,0.17,j,j
31,hypoglycemia,Hypoglycemia,NCT00045162,Cisplatin + Irinotecan,Cisplatin + Etoposide,0.0,j,j
53,hypoglycemia,hypoglycemia,NCT01822548,Vildagliptin & Metformin,Glibenclamide & Metformin,0.0,j,j
54,hypoglycemia,Hypoglycaemia,NCT00575588,Saxagliptin + Metformin,Glipizide + Metformin,0.06,j,j
