In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

### Process data for clinical trial drugs

In [9]:
# Load the four clinical-trial ("CT") annotation tables, one CSV per
# relationship type. Paths are relative to the notebook's working directory.
df_drug_moa_ct = pd.read_csv(filepath_or_buffer='Drug-MOA-CT.csv')
df_drug_target_ct = pd.read_csv(filepath_or_buffer='Drug-Target-CT.csv')
df_drug_indication_ct = pd.read_csv(filepath_or_buffer='Drug-Indication-CT.csv')
df_drug_pathway_ct = pd.read_csv(filepath_or_buffer='Drug-Pathway-CT.csv')

In [4]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Expand list-valued column(s) of ``df`` into one row per list element.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_cols : str or list-like of str
        Name(s) of the column(s) whose cells hold list-like values. A single
        name is wrapped into a one-element list. When several columns are
        given, the per-row lengths are taken from the first one, so the other
        columns must hold lists of the same lengths.
    fill_value : scalar, default ''
        Replacement for missing values in the result. It is applied with
        ``fillna`` on the whole frame, and only when at least one row has an
        empty list.
    preserve_index : bool, default False
        If True, every output row keeps the index label of the row it came
        from (labels repeat). Otherwise the result gets a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        The non-list columns (in the order produced by ``Index.difference``)
        followed by the list columns, one row per list element. Rows whose
        list is empty are kept as a single row filled with ``fill_value``.
    """
    # Accept a bare column name as well as a list-like of names.
    if (lst_cols is not None
            and len(lst_cols) > 0
            and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]

    # Every column that is not being exploded is simply repeated.
    idx_cols = df.columns.difference(lst_cols)

    # Number of elements in each row's list (taken from the first list column).
    lens = df[lst_cols[0]].str.len()

    # Repeat each original index label once per list element.
    idx = np.repeat(df.index.values, lens)

    repeated = {col: np.repeat(df[col].values, lens) for col in idx_cols}
    # Only non-empty lists are concatenated; empty ones are added back below.
    flattened = {col: np.concatenate(df.loc[lens > 0, col].values)
                 for col in lst_cols}
    res = pd.DataFrame(repeated, index=idx).assign(**flattened)

    if (lens == 0).any():
        # DataFrame.append was removed in pandas 2.0; pd.concat performs the
        # same row-wise stacking on every pandas version.
        res = pd.concat([res, df.loc[lens == 0, idx_cols]],
                        sort=False).fillna(fill_value)

    # A stable sort keeps the elements of one source row in list order even
    # though they all share the same index label.
    res = res.sort_index(kind='mergesort')

    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

In [10]:
# Clean the clinical-trial drug -> MOA table and expand it to one row per
# (drug, MOA) pair.
# .copy() makes the cleaned frame independent of the raw one, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df_drug_moa_ct = df_drug_moa_ct.dropna().copy()
cols = df_drug_moa_ct.columns
# Lower-case the first two columns so the same name in different case is
# treated as one entity.
df_drug_moa_ct[cols[0]] = df_drug_moa_ct[cols[0]].str.lower()
df_drug_moa_ct[cols[1]] = df_drug_moa_ct[cols[1]].str.lower()
# The MOA cell holds a comma-separated list; split it and emit one row per item.
df_drug_moa_mod = explode(
    df_drug_moa_ct.assign(MOA=df_drug_moa_ct.MOA.str.split(',')),
    'MOA',
)
# "a, b" splits into ['a', ' b']; strip the padding so ' b' and 'b' do not
# become two different categories later on.
df_drug_moa_mod['MOA'] = df_drug_moa_mod['MOA'].str.strip()

In [11]:
# Clean the clinical-trial drug -> indication table and expand it to one row
# per (drug, indication) pair.
df_drug_indication_ct = df_drug_indication_ct.dropna()
# NOTE(review): pandas' default CSV parsing already turns the literal "NA" into
# NaN, so after dropna() this filter is probably a no-op — confirm.
# .copy() detaches the filtered rows from the source frame so the column
# assignments below do not raise SettingWithCopyWarning.
df_drug_indication_mod = df_drug_indication_ct[
    df_drug_indication_ct.Indication != 'NA'
].copy()
cols = df_drug_indication_mod.columns
# Lower-case the first two columns with the vectorised string accessor.
df_drug_indication_mod[cols[0]] = df_drug_indication_mod[cols[0]].str.lower()
df_drug_indication_mod[cols[1]] = df_drug_indication_mod[cols[1]].str.lower()
# The Indication cell holds a comma-separated list; one output row per item.
df_drug_indication_mod = explode(
    df_drug_indication_mod.assign(
        Indication=df_drug_indication_mod.Indication.str.split(',')),
    'Indication',
)
# Remove the whitespace left around items by splitting on a bare comma.
df_drug_indication_mod['Indication'] = df_drug_indication_mod['Indication'].str.strip()

In [27]:
# Clean the clinical-trial drug -> pathway table and expand it to one row per
# (drug, pathway) pair.
df_drug_pathway_ct = df_drug_pathway_ct.dropna()
# Drop the placeholder values 'NA' and 'No' in a single mask built from the
# frame being filtered (the second filter used to take its mask from a
# different, larger frame). .copy() detaches the result so the assignments
# below do not raise SettingWithCopyWarning.
df_drug_pathway_mod = df_drug_pathway_ct[
    ~df_drug_pathway_ct.Pathway.isin(['NA', 'No'])
].copy()
cols = df_drug_pathway_mod.columns
# Lower-case the first two columns with the vectorised string accessor.
df_drug_pathway_mod[cols[0]] = df_drug_pathway_mod[cols[0]].str.lower()
df_drug_pathway_mod[cols[1]] = df_drug_pathway_mod[cols[1]].str.lower()
# The Pathway cell holds a comma-separated list; one output row per item.
df_drug_pathway_mod = explode(
    df_drug_pathway_mod.assign(
        Pathway=df_drug_pathway_mod.Pathway.str.split(',')),
    'Pathway',
)
# Remove the whitespace left around items by splitting on a bare comma.
df_drug_pathway_mod['Pathway'] = df_drug_pathway_mod['Pathway'].str.strip()

In [18]:
# Clean the clinical-trial drug -> target table and expand it to one row per
# (drug, target) pair.
df_drug_target_ct = df_drug_target_ct.dropna()
# .copy() detaches the filtered rows from the source frame so the column
# assignment below does not raise SettingWithCopyWarning.
df_drug_target_mod = df_drug_target_ct[df_drug_target_ct.Target != 'NA'].copy()
cols = df_drug_target_mod.columns
# Only the first column is lower-cased; the second keeps its original case.
# NOTE(review): presumably deliberate (e.g. case-sensitive target symbols) —
# confirm, since the other three tables lower-case both columns.
df_drug_target_mod[cols[0]] = df_drug_target_mod[cols[0]].str.lower()
# The Target cell holds a comma-separated list; one output row per item.
df_drug_target_mod = explode(
    df_drug_target_mod.assign(Target=df_drug_target_mod.Target.str.split(',')),
    'Target',
)
# Remove the whitespace left around items by splitting on a bare comma.
df_drug_target_mod['Target'] = df_drug_target_mod['Target'].str.strip()

### Combine with drug downselection data

In [14]:
# Load the drug down-selection ("DS") tables, one CSV per relationship type.
# Paths are relative to the notebook's working directory.
df_drug_moa_ds = pd.read_csv(filepath_or_buffer='drug-MOA.csv')
df_drug_target_ds = pd.read_csv(filepath_or_buffer='drug-target.csv')
df_drug_indication_ds = pd.read_csv(filepath_or_buffer='drug-indication.csv')
df_drug_pathway_ds = pd.read_csv(filepath_or_buffer='drug-pathway.csv')

In [28]:
# Stack each down-selection table on top of the matching cleaned
# clinical-trial table.
# ignore_index=True gives the combined frames a fresh 0..n-1 index; without it
# both inputs contribute their own labels and the result has duplicated row
# labels.
# NOTE(review): this assumes both inputs of each pair use the same column
# names in the same order, and that drug names in the down-selection files
# use the same case as the lower-cased clinical-trial names — verify.
df_drug_moa = pd.concat([df_drug_moa_ds, df_drug_moa_mod], ignore_index=True)
df_drug_target = pd.concat([df_drug_target_ds, df_drug_target_mod], ignore_index=True)
df_drug_indication = pd.concat([df_drug_indication_ds, df_drug_indication_mod], ignore_index=True)
df_drug_pathway = pd.concat([df_drug_pathway_ds, df_drug_pathway_mod], ignore_index=True)

### Convert to 0-1 relationship matrices 

In [20]:
# Drug x MOA indicator matrix: rows are values of the first column, columns
# are values of the second.
cols = df_drug_moa.columns
# pd.crosstab counts occurrences, so a (drug, MOA) pair listed more than once
# would yield 2, 3, ...; clip to 1 to get a 0/1 relationship matrix.
df_moa = pd.crosstab(df_drug_moa[cols[0]], df_drug_moa[cols[1]]).clip(upper=1)

In [22]:
# Drug x target indicator matrix: rows are values of the first column, columns
# are values of the second.
cols = df_drug_target.columns
# pd.crosstab counts occurrences, so a (drug, target) pair listed more than
# once would yield 2, 3, ...; clip to 1 to get a 0/1 relationship matrix.
df_target = pd.crosstab(df_drug_target[cols[0]], df_drug_target[cols[1]]).clip(upper=1)

In [23]:
# Drug x indication indicator matrix: rows are values of the first column,
# columns are values of the second.
cols = df_drug_indication.columns
# pd.crosstab counts occurrences, so a (drug, indication) pair listed more
# than once would yield 2, 3, ...; clip to 1 to get a 0/1 relationship matrix.
df_indication = pd.crosstab(df_drug_indication[cols[0]], df_drug_indication[cols[1]]).clip(upper=1)

In [29]:
# Drug x pathway indicator matrix: rows are values of the first column,
# columns are values of the second.
cols = df_drug_pathway.columns
# pd.crosstab counts occurrences, so a (drug, pathway) pair listed more than
# once would yield 2, 3, ...; clip to 1 to get a 0/1 relationship matrix.
df_pathway = pd.crosstab(df_drug_pathway[cols[0]], df_drug_pathway[cols[1]]).clip(upper=1)

### Save relationship matrices to CSV files

In [31]:
# Write each relationship matrix to its own CSV in the working directory.
# The index is written too (to_csv default), so the row labels are kept.
df_moa.to_csv(path_or_buf='df_moa_relationship.csv')
df_target.to_csv(path_or_buf='df_target_relationship.csv')
df_indication.to_csv(path_or_buf='df_indication_relationship.csv')
df_pathway.to_csv(path_or_buf='df_pathway_relationship.csv')

### Preprocess numerical features

In [2]:
# Load the per-drug feature table from the working directory.
df_drug_features = pd.read_csv(filepath_or_buffer='all_drug_data.csv')

In [3]:
# Replace every missing value in the feature table with 0, modifying the
# frame in place.
df_drug_features.fillna(value=0, inplace=True)

In [5]:
# Map the categorical labels to string-encoded scores, one replacement at a
# time and in the same order as before. The replacements are the strings
# '0', '1' and '0.5', not numbers.
df_drug_features_mod = (
    df_drug_features
    .replace({'Inactive': '0'})
    .replace({'Active': '1'})
    .replace({'Cytotoxic': '1'})
    .replace({'Inconclusive': '0.5'})
)

In [6]:
# Write the processed feature table to the working directory.
# NOTE(review): the index is written as an extra leading column (to_csv
# default); confirm downstream readers expect it before adding index=False.
df_drug_features_mod.to_csv(path_or_buf='all_drug_data_processed.csv')