# Merging Compound files distributed by CLUE

We consolidate the drug and sample resource information into a single file for easier downstream processing.

The data were originally retrieved from https://clue.io/repurposing.
See [`clue/README.md`](clue/README.md) for more details.

In [1]:
import os
import numpy as np
import pandas as pd

## Load Data

In [2]:
# Release date stamp and directory of the CLUE repurposing files read below
date = "20180907"
data_dir = "clue"

In [3]:
# Path of the drug annotation table for the configured release date
drug_file = os.path.join(
    data_dir,
    "repurposing_drugs_{}.txt".format(date),
)

# Tab-separated, ISO-8859-1 encoded; the first 9 lines of the file are skipped
# (presumably a non-tabular preamble -- confirm against clue/README.md)
drug_df = pd.read_csv(
    drug_file,
    sep='\t',
    skiprows=9,
    encoding="ISO-8859-1",
)

print(drug_df.shape)
drug_df.head(2)

(6125, 6)


Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
1,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,


In [4]:
# Path of the sample table for the configured release date
sample_file = os.path.join(
    data_dir,
    "repurposing_samples_{}.txt".format(date),
)

# Tab-separated, ISO-8859-1 encoded; the first 9 lines of the file are skipped
# (presumably a non-tabular preamble -- confirm against clue/README.md)
sample_df = pd.read_csv(
    sample_file,
    sep='\t',
    skiprows=9,
    encoding="ISO-8859-1",
)

print(sample_df.shape)
sample_df.head(2)

(10147, 12)


Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
0,BRD-K89787693-001-01-1,"[sar9,met(o2)11]-substance-p",0,,Tocris,1178,2-({5-amino-2-[({1-[6-amino-2-({[1-(2-amino-5-...,1392.73,CC(C)C[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)NC...,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0,
1,BRD-K88956297-003-01-9,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",0,94.41,BidePharm,BD51690,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",215.106,Cl\C=C/C[N+]12CN3CN(CN(C3)C1)C2,LDLCEGCJYSDJLX-UPHRSURJSA-N,5846454.0,


## Checking for `pert_iname` Discrepancies

In [5]:
# Every name in the drug table must also occur in the sample table
assert not set(drug_df.pert_iname.values).difference(set(sample_df.pert_iname))

# Names that occur in the sample table only (displayed as the cell output)
set(sample_df.pert_iname.values) - set(drug_df.pert_iname)

{'YM-298198-desmethyl', 'golgicide-A'}

Two perturbation names (`pert_iname`) appear in the sample data but not in the drug data.
Each one is reconciled below.

### `YM-298198-desmethyl` 

In [6]:
# Show the sample rows whose name contains "298198"
sample_df[sample_df["pert_iname"].str.contains("298198")]

Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
10019,BRD-K59650319-003-03-3,YM-298198,0,61.48,Tocris,2448,YM 298198 hydrochloride,342.151,CN(C1CCCCC1)C(=O)c1sc2nc3ccc(N)cc3n2c1C,KCBXOMYXOBVLED-UHFFFAOYSA-N,9819432.0,
10020,BRD-K48059230-003-02-9,YM-298198-desmethyl,0,68.02,Tocris,2447,Desmethyl-YM 298198,328.136,Cc1c(sc2nc3ccc(N)cc3n12)C(=O)NC1CCCCC1,VWOMTTIMBHBPBI-UHFFFAOYSA-N,11716890.0,


In [7]:
# Build the missing `YM-298198-desmethyl` drug annotation by copying the
# `YM-298198` row and giving it the derivative's name.
# The parent compound is matched by its exact name: a substring search for
# "298198" would also match the derivative row once it has been appended to
# `drug_df`, so re-running this cell would no longer yield a single row.
ym_drug = (
    drug_df.loc[drug_df.pert_iname == "YM-298198", :]
    .reset_index(drop=True)
    .assign(pert_iname="YM-298198-desmethyl")
)

# Exactly one parent row is expected; anything else means the source data changed
assert ym_drug.shape[0] == 1, "expected exactly one YM-298198 row in drug_df"
ym_drug

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,YM-298198-desmethyl,Preclinical,glutamate receptor antagonist,GRM1,,


In [8]:
# solve the YM-298198 problem
# Append the derived annotation to the drug table. Rows that already carry the
# same name are dropped first, so re-running this cell cannot add a second copy
# (duplicate drug names would multiply rows in the later merge on `pert_iname`).
drug_df = pd.concat(
    [drug_df.loc[~drug_df.pert_iname.isin(ym_drug.pert_iname), :], ym_drug],
    axis=0,
).reset_index(drop=True)

#### `YM-298198-desmethyl` is absent in the drug data

[`YM-298198-desmethyl`](https://www.tocris.com/products/desmethyl-ym-298198_2447) is a derivative of [`YM-298198`](https://www.tocris.com/products/ym-298198-hydrochloride_2448), and therefore has a different structure. However, their MOA and target are the same.

### `golgicide-A`

In [9]:
# Show the sample rows whose name contains "golgicide" (case-sensitive match)
sample_df[sample_df["pert_iname"].str.contains("golgicide")]

Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
4255,BRD-A57886255-001-02-9,golgicide-A,0,97.26,Selleck,S7266,Golgicide A,284.113,Fc1cc(F)c2NC([C@@H]3CC=C[C@@H]3c2c1)c1cccnc1,NJZHEQOUHLZCOX-FTLRAWMYSA-N,25113626.0,BRD-A53244165-001-01-6
4256,BRD-A57886255-001-01-1,golgicide-a,0,96.27,Tocris,3584,Golgicide A,284.113,Fc1cc(F)c2NC([C@@H]3CC=C[C@@H]3c2c1)c1cccnc1,NJZHEQOUHLZCOX-FTLRAWMYSA-N,25113626.0,


In [10]:
# Show the drug rows whose name contains "golgicide" (case-sensitive match)
drug_df[drug_df["pert_iname"].str.contains("golgicide")]

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
2383,golgicide-a,Preclinical,ARF inhibitor,GBF1,,


#### `golgicide` differs only by capitalization and is equivalent

In [11]:
# Solve the golgicide problem
# Rename only the exact mis-capitalized entry so it matches the drug table.
# A substring match on "golgicide" would also overwrite the name of any other
# compound that happens to contain that text.
sample_df.loc[sample_df.pert_iname == "golgicide-A", "pert_iname"] = "golgicide-a"

In [12]:
# Now, assert that there are no differences
# The comparison is symmetric: a name present in only one of the two tables,
# in either direction, fails the check and is listed in the error message.
name_mismatch = set(sample_df.pert_iname).symmetric_difference(set(drug_df.pert_iname))
assert len(name_mismatch) == 0, (
    "pert_iname values present in only one table: {}".format(sorted(name_mismatch))
)

## Merge the Samples and Drugs data

In [13]:
# Attach the drug annotation to every sample row via the shared `pert_iname`.
# `validate="one_to_many"` makes pandas raise if a drug name occurs more than
# once in `drug_df`, which would otherwise silently duplicate sample rows.
combined_df = (
    drug_df.merge(
        sample_df,
        on="pert_iname",
        how="inner",
        validate="one_to_many",
    )
    .reset_index(drop=True)
)

# An inner merge drops samples without a matching drug name; make that loud
assert combined_df.shape[0] == sample_df.shape[0], (
    "merge changed the number of sample rows: {} -> {}".format(
        sample_df.shape[0], combined_df.shape[0]
    )
)

# Move broad_id to first column
col_order = ["broad_id"] + [col for col in combined_df.columns if col != "broad_id"]
combined_df = combined_df.loc[:, col_order]

# Output to file
# `output_file` holds the base name (no extension); it is reused further down
# to name the index column of the split tables
output_file = "repurposing_info"
combined_df.to_csv("{}.tsv".format(output_file), sep='\t', index=False)

print(combined_df.shape)
combined_df.head()

(10147, 17)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
0,BRD-K89787693-001-01-1,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,,0,,Tocris,1178,2-({5-amino-2-[({1-[6-amino-2-({[1-(2-amino-5-...,1392.73,CC(C)C[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)NC...,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0,
1,BRD-K70329400-001-02-5,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,,0,97.36,Tocris,4431,A 1070722,362.099,COc1ccc2c(NC(=O)Nc3cccc(n3)C(F)(F)F)ccnc2c1,VQPBIJGXSXEOCU-UHFFFAOYSA-N,49830684.0,
2,BRD-K59060513-001-02-5,A-1120,Preclinical,retinoid receptor ligand,RBP4,,,0,83.72,Tocris,3793,A 1120,392.135,OC(=O)c1ccccc1NC(=O)N1CCC(CC1)c1ccccc1C(F)(F)F,MEAQCLPMSVEOQF-UHFFFAOYSA-N,25138295.0,
3,BRD-K38019854-323-01-4,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,,0,98.69,MedChemEx,HY-15568A,A-317491 (sodium salt hydrate),565.174,OC(=O)c1cc(C(O)=O)c(cc1C(O)=O)C(=O)N(Cc1cccc(O...,VQGBOYBIENNKMI-LJAQVGFWSA-N,9829395.0,
4,BRD-K38019854-001-01-6,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,,0,98.06,MedChemEx,HY-15568,A-317491,565.174,OC(=O)c1cc(C(O)=O)c(cc1C(O)=O)C(=O)N(Cc1cccc(O...,VQGBOYBIENNKMI-LJAQVGFWSA-N,9829395.0,


## Create a "Long" version where we split MOA and Target delimiters

Certain compounds have multiple MOA classes and targets that are delimited by pipes (`|`).
Each MOA class and target can be considered to have equal support (see https://github.com/broadinstitute/lincs-cell-painting/issues/5).

Split the combined data on both MOA and target along each pipe and elongate the table.
This is done to reduce the computational burden of multiple downstream analyses performing the same splits.

In [14]:
# Missing values cannot be split on "|", so MOA and target get a placeholder
# string here; it is turned back into NaN after the long table is built
for na_col in ["moa", "target"]:
    combined_df[na_col] = combined_df[na_col].fillna("replace_with_na")

In [15]:
# Column name under which the split tables keep the row position of
# `combined_df`. Relies on `output_file` still holding the base name that was
# set when the combined table was written.
split_col_index = output_file + "_index"

In [16]:
# One row per individual MOA: split each pipe-delimited MOA string, spread the
# pieces over columns (shorter lists are padded with missing values), then
# stack the columns back into rows keyed by the original row index.
# `stack` only drops that padding in its legacy implementation; the explicit
# `dropna` keeps the result identical under the newer implementation
# (`future_stack`), which preserves missing values. It is a no-op otherwise.
moa_split_df = (
    pd.DataFrame(combined_df.moa.str.split("|").tolist(), index=combined_df.index)
    .stack()
    .dropna()
    .reset_index()
)
# Columns: original row index, position within the split list, single MOA
moa_split_df.columns = [split_col_index, "_", "moa_unique"]

print(moa_split_df.shape)
moa_split_df.head()

(11253, 3)


Unnamed: 0,repurposing_info_index,_,moa_unique
0,0,0,tachykinin antagonist
1,1,0,glycogen synthase kinase inhibitor
2,2,0,retinoid receptor ligand
3,3,0,purinergic receptor antagonist
4,4,0,purinergic receptor antagonist


In [17]:
# One row per individual target: split each pipe-delimited target string, spread
# the pieces over columns (shorter lists are padded with missing values), then
# stack the columns back into rows keyed by the original row index.
# `stack` only drops that padding in its legacy implementation; the explicit
# `dropna` keeps the result identical under the newer implementation
# (`future_stack`), which preserves missing values. It is a no-op otherwise.
target_split_df = (
    pd.DataFrame(combined_df.target.str.split("|").tolist(), index=combined_df.index)
    .stack()
    .dropna()
    .reset_index()
)

# Columns: original row index, position within the split list, single target
target_split_df.columns = [split_col_index, "_", "target_unique"]

print(target_split_df.shape)
target_split_df.head()

(27778, 3)


Unnamed: 0,repurposing_info_index,_,target_unique
0,0,0,TACR1
1,1,0,GSK3A
2,1,1,GSK3B
3,2,0,RBP4
4,3,0,P2RX3


In [18]:
# Attach the split MOA and target values back onto the combined table.
# Both split tables are keyed by the row position of `combined_df`, so merging
# one after the other yields one row per (MOA, target) combination for every
# original row.
long_combined_df = (
    combined_df
    .merge(
        moa_split_df.loc[:, [split_col_index, "moa_unique"]],
        left_index=True,
        right_on=split_col_index,
        how="left"
    )
    .merge(
        target_split_df.loc[:, [split_col_index, "target_unique"]],
        on=split_col_index,
        how="left"
    )
    .reset_index(drop=True)
)

# Put back missing values
for na_col in ["moa", "moa_unique", "target", "target_unique"]:
    long_combined_df.loc[long_combined_df[na_col] == "replace_with_na", na_col] = np.nan

# Output to file
# A dedicated variable is used for the long table's path: overwriting
# `output_file` here would change the index column name derived from it if the
# earlier cell that builds `split_col_index` were run again.
long_output_file = "repurposing_info_long.tsv"
long_combined_df.to_csv(long_output_file, sep='\t', index=False)

print(long_combined_df.shape)
long_combined_df.head()

(34328, 20)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id,repurposing_info_index,moa_unique,target_unique
0,BRD-K89787693-001-01-1,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,,0,,Tocris,1178,2-({5-amino-2-[({1-[6-amino-2-({[1-(2-amino-5-...,1392.73,CC(C)C[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)NC...,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0,,0,tachykinin antagonist,TACR1
1,BRD-K70329400-001-02-5,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,,0,97.36,Tocris,4431,A 1070722,362.099,COc1ccc2c(NC(=O)Nc3cccc(n3)C(F)(F)F)ccnc2c1,VQPBIJGXSXEOCU-UHFFFAOYSA-N,49830684.0,,1,glycogen synthase kinase inhibitor,GSK3A
2,BRD-K70329400-001-02-5,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,,0,97.36,Tocris,4431,A 1070722,362.099,COc1ccc2c(NC(=O)Nc3cccc(n3)C(F)(F)F)ccnc2c1,VQPBIJGXSXEOCU-UHFFFAOYSA-N,49830684.0,,1,glycogen synthase kinase inhibitor,GSK3B
3,BRD-K59060513-001-02-5,A-1120,Preclinical,retinoid receptor ligand,RBP4,,,0,83.72,Tocris,3793,A 1120,392.135,OC(=O)c1ccccc1NC(=O)N1CCC(CC1)c1ccccc1C(F)(F)F,MEAQCLPMSVEOQF-UHFFFAOYSA-N,25138295.0,,2,retinoid receptor ligand,RBP4
4,BRD-K38019854-323-01-4,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,,0,98.69,MedChemEx,HY-15568A,A-317491 (sodium salt hydrate),565.174,OC(=O)c1cc(C(O)=O)c(cc1C(O)=O)C(=O)N(Cc1cccc(O...,VQGBOYBIENNKMI-LJAQVGFWSA-N,9829395.0,,3,purinergic receptor antagonist,P2RX3
