## Code prepration

### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json
import os
import sys
from tqdm import tqdm, tqdm_notebook
# import sibling package
sys.path.insert(0, os.path.abspath('..'))
from drug_identfiers_resolver.Identifiers_converter import Identifiers_converter
%matplotlib inline
tqdm.pandas(tqdm_notebook)

  from pandas import Panel


### Define functions for the notebook

In [2]:
def get_drug_identifier_single(id_converter, interventions, cache):
    main_name = interventions[0]
    identifier = cache.get(main_name, -1)
    if identifier != -1:
        return identifier
    identifier = id_converter._retrieve_from_drugbank(main_name)
    cache[main_name] = identifier
    if identifier is not None:
        return identifier
    else:
        other_names = interventions[1]
        if other_names is None:
            # Can't find identifier for all the drugs in this drugs comb
            return None
        found = False
        for name in other_names:
            identifier = cache.get(name, -1)
            if identifier != -1:
                return identifier
                break
            identifier = id_converter._retrieve_from_drugbank(name)
            cache[name] = identifier
            if identifier is not None:
                return identifier
        return None

def get_drug_identifier(id_converter, interventions, cache):
    result = []
    for int_with_other_names in interventions:
        main_name = int_with_other_names[0]
        identifier = cache.get(main_name, None)
        if identifier is not None:
            result.append(identifier)
            continue
        identifier = id_converter._retrieve_from_drugbank(main_name)
        if identifier is not None:
            cache[main_name] = identifier
            result.append(identifier)
        else:
            other_names = int_with_other_names[1]
            if other_names is None:
                # Can't find identifier for all the drugs in this drugs comb
                return None
            found = False
            for name in other_names:
                identifier = cache.get(name, None)
                if identifier is not None:
                    result.append(identifier)
                    break
                identifier = id_converter._retrieve_from_drugbank(name)
                if identifier is not None:
                    cache[name] = identifier
                    result.append(identifier)
                    found = True
                    break
            if not found:
                return None
    return result      
    
    
def regex_or_x(x, regexes):
    for regex in regexes:
        res = regex.findall(x)
        if res != []:
            x = res[0]
    return x

def regex_on_interventions_with_other_names(arr, regs):
    original_name = arr[0]
    others = arr[1]
    return [regex_or_x(original_name, regs),
             [regex_or_x(other_name, regs) for other_name in others]]

def clean_drug_names(df, original_col, dest_col):
    result_df = df.copy()
    comparator_regex = re.compile('Comparator: (.*)')
    remove_mg_kg = re.compile('^(.*?)(?:(?:\/\d)|(?: \d)|(?:,(?:.*)\d)).*?(?:mg|kg|μg)(?:.*?)$')
    regs = [comparator_regex, remove_mg_kg]
    result_df[dest_col] = result_df[original_col].apply(lambda x: regex_on_interventions_with_other_names(x, regs))
    return result_df
    
years_regex = re.compile('\d{4}')
    
id_converter = Identifiers_converter() 

# Unaggregated version
Here we have an entry per intervention

## Reading and cleaning

In [3]:
unag_df = pd.read_csv('../../pickles/data/drugCombs/interventions_not_aggregated.csv')
unag_df['phase'] = unag_df['phase'].fillna("Doesn't Exist")
unag_df['overall_status'] = unag_df['overall_status'].fillna("Doesn't Exist")
unag_df['study_started_year'] = unag_df['study_start_date'].astype(str).apply(lambda x: None if years_regex.match(x) is None else years_regex.match(x)[0])
unag_df['interventions_with_other_names'] = unag_df['interventions_with_other_names'].apply(lambda x: json.loads(x))
unag_df['is_fda_regulated_drug'] = unag_df['is_fda_regulated_drug'].fillna("unknown")
unag_df.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,nct_id,study_start_date,phase,overall_status,last_known_status,is_fda_regulated_drug,design_group_id,interventions_id,group_type,title,intervention_name,interventions_with_other_names,intervention_description,mesh_terms,downcase_mesh_terms,condition_names,condition_downcase_names,study_started_year
0,NCT00000134,1992-12-31,Phase 3,Completed,,unknown,5002853,4895425,Active Comparator,combination therapy,Ganciclovir,"[Ganciclovir, [cytovene]]",intravenous ganciclovir induction at 5 mg/kg t...,"[""HIV Infections"", ""Acquired Immunodeficiency ...","[""hiv infections"", ""acquired immunodeficiency ...","[""HIV Infections"", ""Acquired Immunodeficiency ...","[""hiv infections"", ""acquired immunodeficiency ...",1992
1,NCT00000134,1992-12-31,Phase 3,Completed,,unknown,5002853,4895426,Active Comparator,combination therapy,Foscarnet,"[Foscarnet, [foscavir]]",intravenous foscarnet induction at 90 mg/kg tw...,"[""HIV Infections"", ""Acquired Immunodeficiency ...","[""hiv infections"", ""acquired immunodeficiency ...","[""HIV Infections"", ""Acquired Immunodeficiency ...","[""hiv infections"", ""acquired immunodeficiency ...",1992
2,NCT00000136,1990-03-31,Phase 3,Completed,,unknown,5002848,4895421,Experimental,Ganciclovir,Foscarnet,"[Foscarnet, [Foscavir]]","60 mg/kg every 8 hours, 90 mg/kg/day","[""Cytomegalovirus Retinitis"", ""Retinitis""]","[""cytomegalovirus retinitis"", ""retinitis""]","[""HIV Infections"", ""Cytomegalovirus Retinitis""]","[""hiv infections"", ""cytomegalovirus retinitis""]",1990
3,NCT00000136,1990-03-31,Phase 3,Completed,,unknown,5002847,4895422,Experimental,Foscarnet,Ganciclovir,"[Ganciclovir, [Vitraset]]","5 mg/kg every 12 hours, 5 mg/kg every 24 hours","[""Cytomegalovirus Retinitis"", ""Retinitis""]","[""cytomegalovirus retinitis"", ""retinitis""]","[""HIV Infections"", ""Cytomegalovirus Retinitis""]","[""hiv infections"", ""cytomegalovirus retinitis""]",1990
4,NCT00000136,1990-03-31,Phase 3,Completed,,unknown,5002848,4895422,Experimental,Ganciclovir,Ganciclovir,"[Ganciclovir, [Vitraset]]","5 mg/kg every 12 hours, 5 mg/kg every 24 hours","[""Cytomegalovirus Retinitis"", ""Retinitis""]","[""cytomegalovirus retinitis"", ""retinitis""]","[""HIV Infections"", ""Cytomegalovirus Retinitis""]","[""hiv infections"", ""cytomegalovirus retinitis""]",1990


### Fix names

In [4]:
unag_df = clean_drug_names(unag_df, 'interventions_with_other_names', 'cleaned_intervention_names')
unag_df['cleaned'] = unag_df['interventions_with_other_names'] != unag_df['cleaned_intervention_names']
print(f"Cleaned total of: {unag_df['cleaned'].sum()} rows")

Cleaned total of: 6630 rows


#### Calculate how many drugs we have per arm (combination size)

In [5]:
dgi_to_amount = pd.DataFrame(unag_df.groupby("design_group_id")['cleaned'].count()).reset_index()
dgi_to_amount.columns = ['design_group_id', "num_of_drugs"]
dgi_to_amount

Unnamed: 0,design_group_id,num_of_drugs
0,4415125,2
1,4415126,2
2,4415127,2
3,4415200,3
4,4415201,2
...,...,...
49650,5357809,2
49651,5357927,3
49652,5357937,3
49653,5357938,3


## Add drugbank ids

### Load cache of drugids for name

In [6]:
cache_exists = os.path.exists('cache.json')
drugbank_cache = {}
if cache_exists:
    with open('cache.json', 'r') as cache_file:
        drugbank_cache = json.load(cache_file)
    print(f"cache size: {len(drugbank_cache)}")


cache size: 31239


### Fetch drugbank ids from remote server

In [7]:
unag_df['drugbank_ids'] = unag_df['cleaned_intervention_names']\
        .progress_map(lambda x: get_drug_identifier_single(id_converter, x, drugbank_cache))

100%|██████████| 133552/133552 [00:00<00:00, 449928.74it/s]


### Save the updated cache

In [8]:
with open('cache.json', 'w') as cache_file:
    json.dump(drugbank_cache, cache_file)

### Save results to csv

In [9]:
unag_df.to_csv("drug_combs_with_phase_and_cond_and_other_names_with_dbid.csv")
entries_with_dbid = unag_df[unag_df['drugbank_ids'].isnull() == False]
print(f"{len(unag_df[unag_df['drugbank_ids'].isnull() == False])/len(unag_df)} got db_id")

0.6872978315562478 got db_id


In [None]:
combs = pd.DataFrame(entries_with_dbid.groupby(['nct_id', 'study_start_date', 'phase', 'overall_status', 
                                                'design_group_id', 'is_fda_regulated_drug',
       'group_type', 'title', 'mesh_terms', 'downcase_mesh_terms', 'condition_names',
       'condition_downcase_names', 'study_started_year'])['drugbank_ids'].apply(list))
combs = combs.reset_index()
combs['num_of_drugs_with_dbid'] = combs['drugbank_ids'].apply(lambda x: len(x))
combs = combs[combs['num_of_drugs_with_dbid'] > 1]
combs = combs.merge(dgi_to_amount, on="design_group_id")
combs = combs[combs['num_of_drugs'] == combs['num_of_drugs_with_dbid']]
print(f"Got total of:{len(combs)} combs")

## What kind of combinations do we have?

In [None]:
combs

In [None]:
by_phase_df = pd.DataFrame(combs['phase'].value_counts()).reset_index()
by_phase_df.columns = ["Phase", "Count"]
plt.figure(figsize=(14, 6))
plt.title("Drug Combinations Arms By Phase")
plt.xlabel("Phase")
plt.ylabel("Amount of arm group with multiple drugs")
plt.bar(by_phase_df['Phase'], by_phase_df['Count'])

by_overall_status_df = pd.DataFrame(combs['overall_status'].value_counts()).reset_index()
by_overall_status_df.columns = ["overall_status", "Count"]
plt.figure(figsize=(8, 8))
plt.title("Drug Combinations Arms By overall_status")
plt.xlabel("Phase")
plt.ylabel("Amount of arm group with multiple drugs")
plt.barh(by_overall_status_df['overall_status'], by_overall_status_df['Count'])
sns.barplot(by_overall_status_df['Count'], by_overall_status_df['overall_status'])

by_study_started_year = pd.DataFrame(combs['study_started_year'].dropna().astype(int).value_counts()).reset_index()
by_study_started_year = by_study_started_year.sort_values(by="study_started_year", ascending=True)
by_study_started_year.columns = ["study_started_year", "Count"]
plt.figure(figsize=(8, 8))
plt.title("Drug Combinations Arms By study_started_year")
plt.xlabel("study_started_year")
plt.ylabel("Amount of arm group with multiple drugs")
by_study_started_year = by_study_started_year[by_study_started_year['study_started_year'] >= 1987]
# sns.barplot(by_study_started_year['study_started_year'], by_study_started_year['Count'])
plt.bar(by_study_started_year['study_started_year'], by_study_started_year['Count'])

In [None]:
neoplasams_combs = combs[combs['downcase_mesh_terms'].str.contains('neoplasms')]
neoplasams_combs = neoplasams_combs[neoplasams_combs['study_started_year'].astype('int') <= 2014]
neoplasams_combs = neoplasams_combs[neoplasams_combs['overall_status'] == 'Completed']

In [None]:
two_combs_df = neoplasams_combs[neoplasams_combs['num_of_drugs_with_dbid'] == 2]
two_combs_df['first_drug'] = neoplasams_combs['drugbank_ids'].apply(lambda x: x[0])
two_combs_df['second_drug'] = neoplasams_combs['drugbank_ids'].apply(lambda x: x[1])
two_combs_df