In [1]:
import pandas as pd
import numpy as np
import rdkit.Chem.inchi as inchi



The samples and drugs files from each version of the Repurposing Hub are read.

In [2]:
# Reader options shared by every file: tab-delimited, text after '!' is treated
# as a comment, and the files are decoded as ISO-8859-1.
_read_opts = {'delimiter': '\t', 'comment': '!', 'encoding': 'iso-8859-1'}

# Columns kept from the drugs files and from the samples files
# (the 2017 samples file is read without 'deprecated_broad_id').
_drug_cols = ['pert_iname', 'moa', 'target']
_sample_cols = ['broad_id', 'pert_iname', 'InChIKey', 'deprecated_broad_id']

samples_2017 = pd.read_csv('clue/repurposing_samples_20170327.txt',
                           usecols=['broad_id', 'pert_iname', 'InChIKey'], **_read_opts)
drugs_2017 = pd.read_csv('clue/repurposing_drugs_20170327.txt', usecols=_drug_cols, **_read_opts)

samples_2018a = pd.read_csv('clue/repurposing_samples_20180516.txt', usecols=_sample_cols, **_read_opts)
drugs_2018a = pd.read_csv('clue/repurposing_drugs_20180516.txt', usecols=_drug_cols, **_read_opts)

samples_2018b = pd.read_csv('clue/repurposing_samples_20180907.txt', usecols=_sample_cols, **_read_opts)
drugs_2018b = pd.read_csv('clue/repurposing_drugs_20180907.txt', usecols=_drug_cols, **_read_opts)

samples_2020 = pd.read_csv('clue/repurposing_samples_20200324.txt', usecols=_sample_cols, **_read_opts)
drugs_2020 = pd.read_csv('clue/repurposing_drugs_20200324.txt', usecols=_drug_cols, **_read_opts)

In [3]:
# The 2017 samples file is read without a deprecated_broad_id column; add an
# all-NaN one so it has the same columns as the later versions.
samples_2017 = samples_2017.assign(deprecated_broad_id=np.nan)

In [4]:
# Maps the samples to their moa and target annotations

def target_annotation(samples, drugs):
    """Attach the columns of ``drugs`` to the matching rows of ``samples``.

    The two frames are left-joined on ``pert_iname``: every row of ``samples``
    is kept, and rows without a match in ``drugs`` get NaN in the added columns.

    Returns the joined DataFrame.
    """
    annotated = pd.merge(samples, drugs, how='left', on='pert_iname')
    return annotated

# Replace InChI with InChIKey

def inchi_to_inchikey(df):
    """Return a copy of ``df`` whose ``InChIKey`` column holds only InChIKeys.

    Rows with a missing ``InChIKey`` are dropped and the index is renumbered
    from 0. Any remaining value that starts with ``'InChI'`` is passed through
    ``inchi.InchiToInchiKey``; all other values are kept unchanged.

    The input frame is not modified; the cleaned frame is returned.
    """
    # Build a fresh frame instead of using inplace=True so the caller's
    # DataFrame is left untouched.
    cleaned = df.dropna(subset=['InChIKey']).reset_index(drop=True)
    cleaned['InChIKey'] = cleaned['InChIKey'].apply(
        lambda value: inchi.InchiToInchiKey(value) if value.startswith('InChI') else value
    )
    return cleaned

# The first 13 characters of the Broad ID and the first 14 characters of InChIKey are extracted
# Year names are appended to column names

def id_cleanup(df, year):
    """Shorten the identifiers, drop duplicate rows and tag columns with ``year``.

    Parameters
    ----------
    df : DataFrame with ``broad_id``, ``pert_iname``, ``InChIKey`` and
        ``deprecated_broad_id`` columns (``moa`` / ``target`` are renamed too
        when present).
    year : str appended to the column names, e.g. ``"2017"``.

    Returns
    -------
    A new DataFrame in which ``broad_id`` is cut to its first 13 characters,
    ``InChIKey`` is cut to its first 14 characters and renamed ``InChIKey14``,
    duplicate rows are removed, and the remaining columns carry a
    ``_<year>`` suffix. The input frame is not modified.
    """
    # .assign builds a new frame, so the caller's columns are not overwritten.
    # str() is applied before slicing, so a missing value becomes the text 'nan'.
    shortened = df.assign(
        broad_id=df['broad_id'].apply(lambda value: str(value)[:13]),
        InChIKey=df['InChIKey'].apply(lambda value: str(value)[:14]),
    )
    deduplicated = shortened.drop_duplicates(
        ['InChIKey', 'pert_iname', 'broad_id', 'deprecated_broad_id']
    ).reset_index(drop=True)
    return deduplicated.rename(columns={'pert_iname': 'pert_iname_' + year,
                                        'broad_id': 'broad_id_' + year,
                                        'deprecated_broad_id': 'deprecated_broad_id_' + year,
                                        'InChIKey': 'InChIKey14',
                                        'moa': 'moa_' + year,
                                        'target': 'target_' + year})

# Grouping samples using InChIKey14 while all other fields are pipe delimited

def group_by_InChIKey14(df, year):
    """Collapse ``df`` to one row per ``InChIKey14``.

    Missing values are first replaced with empty strings. Within each group,
    the ``broad_id_<year>`` and ``pert_iname_<year>`` values are reduced to
    their unique values joined with ``'|'``; the ``deprecated_broad_id_<year>``,
    ``moa_<year>`` and ``target_<year>`` values are combined with
    ``merge_target``.

    Returns the grouped DataFrame with ``InChIKey14`` as a regular column.
    """
    def join_unique(values):
        return '|'.join(np.unique(values))

    def join_unique_tokens(values):
        return merge_target(list(values))

    aggregations = {
        'broad_id_' + year: join_unique,
        'deprecated_broad_id_' + year: join_unique_tokens,
        'pert_iname_' + year: join_unique,
        'moa_' + year: join_unique_tokens,
        'target_' + year: join_unique_tokens,
    }
    filled = df.fillna('')
    grouped = filled.groupby('InChIKey14').agg(aggregations)
    return grouped.reset_index()

# This function deduplicates target annotations and the final list is pipe delimited

def merge_target(target):
    """Merge pipe-delimited strings into one sorted, de-duplicated string.

    ``target`` is a list of strings, each holding zero or more ``'|'``-separated
    entries. All entries are pooled, empty entries are discarded, and the
    unique entries (as ordered by ``np.unique``) are joined with ``'|'``.
    Returns an empty string when no entries remain.
    """
    pieces = '|'.join(target).split('|')
    non_empty = [piece for piece in pieces if piece != '']
    return '|'.join(np.unique(non_empty))

In [5]:
# Drop rows without an InChIKey and convert InChI strings to InChIKeys, one version at a time.
samples_2017, samples_2018a, samples_2018b, samples_2020 = map(
    inchi_to_inchikey,
    (samples_2017, samples_2018a, samples_2018b, samples_2020),
)

In [6]:
# Pair each samples frame with the drugs frame of the same version and join them.
samples_2017, samples_2018a, samples_2018b, samples_2020 = [
    target_annotation(sample_frame, drug_frame)
    for sample_frame, drug_frame in (
        (samples_2017, drugs_2017),
        (samples_2018a, drugs_2018a),
        (samples_2018b, drugs_2018b),
        (samples_2020, drugs_2020),
    )
]

samples_2017.head()

Unnamed: 0,broad_id,pert_iname,InChIKey,deprecated_broad_id,moa,target
0,BRD-A37752546-001-01-9,(1E)-1-(2-hydroxy-5-methylphenyl)-1-dodecanone...,NFONIVRMILHYLH-UHFFFAOYSA-N,,,
1,BRD-K89787693-001-01-1,"[sar9,met(o2)11]-substance-p",OUPXSLGGCPUZJJ-SARDKLJWSA-N,,,
2,BRD-K88956297-003-01-9,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",LDLCEGCJYSDJLX-UPHRSURJSA-N,,,
3,BRD-A86415025-050-01-0,"1-(1,2-Diphenylethyl)piperidine-(+/-)",JQWJJJYHVHNXJH-UHFFFAOYSA-N,,,
4,BRD-A95802703-001-01-0,1-(2-chloro-5-methylphenoxy)-3-(isopropylamino...,NJEIOWSBPCZKTL-UHFFFAOYSA-N,,,


In [7]:
# Shorten the identifiers and suffix the column names with the version label.
samples_2017, samples_2018a, samples_2018b, samples_2020 = [
    id_cleanup(frame, label)
    for frame, label in zip(
        (samples_2017, samples_2018a, samples_2018b, samples_2020),
        ("2017", "2018a", "2018b", "2020"),
    )
]

samples_2017.head()

Unnamed: 0,broad_id_2017,pert_iname_2017,InChIKey14,deprecated_broad_id_2017,moa_2017,target_2017
0,BRD-A37752546,(1E)-1-(2-hydroxy-5-methylphenyl)-1-dodecanone...,NFONIVRMILHYLH,,,
1,BRD-K89787693,"[sar9,met(o2)11]-substance-p",OUPXSLGGCPUZJJ,,,
2,BRD-K88956297,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",LDLCEGCJYSDJLX,,,
3,BRD-A86415025,"1-(1,2-Diphenylethyl)piperidine-(+/-)",JQWJJJYHVHNXJH,,,
4,BRD-A95802703,1-(2-chloro-5-methylphenoxy)-3-(isopropylamino...,NJEIOWSBPCZKTL,,,


In [8]:
# Collapse every version to one row per InChIKey14.
samples_2017 = group_by_InChIKey14(df=samples_2017, year="2017")
samples_2018a = group_by_InChIKey14(df=samples_2018a, year="2018a")
samples_2018b = group_by_InChIKey14(df=samples_2018b, year="2018b")
samples_2020 = group_by_InChIKey14(df=samples_2020, year="2020")

samples_2017.head()

Unnamed: 0,InChIKey14,broad_id_2017,deprecated_broad_id_2017,pert_iname_2017,moa_2017,target_2017
0,AAALVYBICLMAMA,BRD-K13087974,,CGP-52411,EGFR inhibitor,EGFR
1,AAAQFGUYHFJNHI,BRD-K08109215,,I-BET-762,bromodomain inhibitor,BRD2|BRD3|BRD4
2,AADCDMQTJNYOSS,BRD-K50417881,,eticlopride,dopamine receptor antagonist,DRD2|DRD3|DRD4
3,AAFJXZWCNVJTMK,BRD-K68502831,,dianhydrogalactitol,DNA alkylating agent,
4,AAGFPTSOPGCENQ,BRD-K10999968|BRD-K66845263,,sophocarpine,,


The four dataframes are merged on InChIKey14

In [9]:
# Outer-join the four versions on InChIKey14 so a compound present in any version
# gets a row, then turn the empty strings back into NaN.
merged_df = (
    samples_2017
    .merge(samples_2018a, on='InChIKey14', how='outer')
    .merge(samples_2018b, on='InChIKey14', how='outer')
    .merge(samples_2020, on='InChIKey14', how='outer')
    .replace(to_replace='', value=np.nan)
)

merged_df.head()

Unnamed: 0,InChIKey14,broad_id_2017,deprecated_broad_id_2017,pert_iname_2017,moa_2017,target_2017,broad_id_2018a,deprecated_broad_id_2018a,pert_iname_2018a,moa_2018a,...,broad_id_2018b,deprecated_broad_id_2018b,pert_iname_2018b,moa_2018b,target_2018b,broad_id_2020,deprecated_broad_id_2020,pert_iname_2020,moa_2020,target_2020
0,AAALVYBICLMAMA,BRD-K13087974,,CGP-52411,EGFR inhibitor,EGFR,BRD-K13087974,,CGP-52411,EGFR inhibitor,...,BRD-K13087974,,CGP-52411,EGFR inhibitor,EGFR,BRD-K13087974,,CGP-52411,EGFR inhibitor,EGFR
1,AAAQFGUYHFJNHI,BRD-K08109215,,I-BET-762,bromodomain inhibitor,BRD2|BRD3|BRD4,BRD-K08109215,,I-BET-762,bromodomain inhibitor,...,BRD-K08109215,,I-BET-762,bromodomain inhibitor,BRD2|BRD3|BRD4,BRD-K08109215,,I-BET-762,bromodomain inhibitor,BRD2|BRD3|BRD4
2,AADCDMQTJNYOSS,BRD-K50417881,,eticlopride,dopamine receptor antagonist,DRD2|DRD3|DRD4,BRD-K50417881,,eticlopride,dopamine receptor antagonist,...,BRD-K50417881,,eticlopride,dopamine receptor antagonist,DRD2|DRD3|DRD4,BRD-K50417881,,eticlopride,dopamine receptor antagonist,DRD2|DRD3|DRD4
3,AAFJXZWCNVJTMK,BRD-K68502831,,dianhydrogalactitol,DNA alkylating agent,,BRD-K68502831,,dianhydrogalactitol,DNA alkylating agent,...,BRD-K68502831,,dianhydrogalactitol,DNA alkylating agent,,BRD-K68502831,,dianhydrogalactitol,DNA alkylating agent,
4,AAGFPTSOPGCENQ,BRD-K10999968|BRD-K66845263,,sophocarpine,,,BRD-K10999968|BRD-K66845263,,sophocarpine,,...,BRD-K66845263,BRD-K10999968-001-01-4,sophocarpine,,,BRD-K66845263,BRD-K10999968-001-01-4,sophocarpine,,


In [10]:
# Write the merged table to disk without the row index.
merged_df.to_csv(path_or_buf='clue/broad_id_map.csv', index=False)

In [11]:
# Report how many rows the merged table has.
print('Total number of rows %d' % merged_df.shape[0])

Total number of rows 6959


The following compounds in the 2020 version are not formatted correctly:
their InChIKey field contains an InChI string instead of an InChIKey.

In [12]:
# List the rows whose key still starts with 'InChI' instead of being an InChIKey.
is_raw_inchi = merged_df['InChIKey14'].str.startswith('InChI')
unconverted = merged_df.loc[is_raw_inchi, ['InChIKey14', 'pert_iname_2020']]
print(unconverted.to_markdown())





| InChIKey14   | pert_iname_2020   |
||
