# Create Map between Old and New IDs

Many Broad IDs and perturbation IDs have deprecated (legacy) counterparts.
I will use this notebook to generate a map between current and deprecated/legacy identifiers.

In [1]:
import os
import pandas as pd

In [2]:
# Read the repurposing annotations, then keep only the rows that carry a
# deprecated Broad ID (rows where `deprecated_broad_id` is missing are removed)
repurposing_df = pd.read_csv("repurposing_info.tsv", sep="\t")
has_deprecated_id = repurposing_df["deprecated_broad_id"].notna()

# Renumber the surviving rows from 0 so the index is contiguous again
annotation_df = repurposing_df.loc[has_deprecated_id].reset_index(drop=True)

print(annotation_df.shape)
annotation_df.head(3)

(764, 17)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
0,BRD-K76894955-001-02-1,A-804598,Preclinical,purinergic receptor antagonist,P2RX7,,,0,98.54,Tocris,4473,A 804598,315.148,C[C@H](\N=C(\NC#N)Nc1cccc2ncccc12)c1ccccc1,PQYCRDPLPKGSME-AWEZNQCLSA-N,53325874.0,BRD-K94313941-001-01-9
1,BRD-K17443395-065-03-0,abacavir,Launched,nucleoside reverse transcriptase inhibitor,,infectious disease,human immunodeficiency virus (HIV-1),0,95.95,Tocris,4148,Abacavir hemisulfate,286.154,Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1 ...,MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0,BRD-M55852627-065-01-7
2,BRD-K17443395-065-02-2,abacavir,Launched,nucleoside reverse transcriptase inhibitor,,infectious disease,human immunodeficiency virus (HIV-1),0,98.81,MicroSource,1502410,ABACAVIR SULFATE,286.154,Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1 ...,MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0,BRD-A95032015-065-01-2


In [3]:
# Split deprecated broad IDs
#
# A single `deprecated_broad_id` entry may list several legacy IDs separated
# by "|". Each legacy ID gets its own row, paired with the current `broad_id`.
#
# `Series.explode` is used rather than building a padded wide frame and calling
# `DataFrame.stack`: the result then does not depend on `stack` implicitly
# dropping the `None` padding (the reworked `stack` implementation keeps NA
# values that are present in its input), nor on the auto-generated `level_1`
# column name.

# Number of leading characters that form the perturbation ID,
# e.g. "BRD-K76894955" out of "BRD-K76894955-001-02-1"
PERT_ID_LENGTH = 13

col_order = ["broad_id", "pert_id", "deprecated_broad_id", "deprecated_pert_id"]

deprecated_id_df = (
    annotation_df
    .set_index("broad_id")["deprecated_broad_id"]
    .str.split("|")
    # One row per legacy ID; the current broad_id is repeated via the index
    .explode()
    # Yields exactly two columns: "broad_id" and "deprecated_broad_id"
    .reset_index()
    .assign(
        pert_id=lambda df: df["broad_id"].str.slice(0, PERT_ID_LENGTH),
        deprecated_pert_id=lambda df: (
            df["deprecated_broad_id"].str.slice(0, PERT_ID_LENGTH)
        ),
    )
    # Identical (current, deprecated) pairs only need to appear once in the map
    .drop_duplicates()
    .loc[:, col_order]
)

print(deprecated_id_df.shape)
deprecated_id_df.head()

(809, 4)


Unnamed: 0,broad_id,pert_id,deprecated_broad_id,deprecated_pert_id
0,BRD-K76894955-001-02-1,BRD-K76894955,BRD-K94313941-001-01-9,BRD-K94313941
1,BRD-K17443395-065-03-0,BRD-K17443395,BRD-M55852627-065-01-7,BRD-M55852627
2,BRD-K17443395-065-02-2,BRD-K17443395,BRD-A95032015-065-01-2,BRD-A95032015
3,BRD-K17443395-065-01-4,BRD-K17443395,BRD-M42309903-065-03-5,BRD-M42309903
4,BRD-K50071428-001-03-3,BRD-K50071428,BRD-K00111504-001-01-9,BRD-K00111504


In [4]:
# Write the old-to-new ID map to a tab-separated file in the working
# directory; the DataFrame index is not written
output_file = "repurposing_old_to_new_broad_ids_map.tsv"
deprecated_id_df.to_csv(path_or_buf=output_file, sep="\t", index=False)