Yitan, I was trying to reproduce your drug response dataset (Drug_Pair_Response.txt), but for some reason I end up with 8013 samples, while your dataset contains 6962 (i.e., I have 1051 more samples). I confirmed that we're both using the same source dataset (ncipdm_drug_response) as the starting point.
Here is just one example of a Sample that you don't have but I do:
Sample:  114551~080-T~M667M226C29
Drug1:   NSC.777586
Drug2:   NSC.756642

This sample is available in ncipdm_drug_response and also has image_id and gene expression in cref_rna.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys

from pathlib import Path
import pandas as pd
import numpy as np

# Record the notebook's working directory and add `../src` (relative to it)
# to the module search path.
fdir = Path.cwd()
print(fdir)
src_dir = fdir / '../src'
sys.path.append(str(src_dir))

/vol/ml/apartin/projects/pdx-histo/nbs


In [2]:
# repo_datadir = Path("/vol/ml/apartin/projects/pdx-histo/data/studies")
# rr = pd.read_csv(repo_datadir/"pdm/ncipdm_drug_response", sep="\t")

# Source drug response data from Maulik (the file Yitan was using)
datadir = Path("/vol/ml/apartin/projects/pdx-histo/data/PDX_Transfer_Learning_Classification/Processed_Data/Data_For_MultiModal_Learning")
rr = pd.read_csv(datadir/"ncipdm_drug_response", sep="\t")

print(rr.shape)
rr = rr.drop_duplicates().reset_index(drop=True)

# Remove PDX samples that were generated from cryo-preserved samples
# https://pdmr.cancer.gov/database/default.htm
# NOTE(review): this is a plain substring test on the whole Sample id --
# confirm that "RG" cannot occur in the ids of samples that should be kept.
keep_mask = rr["Sample"].map(lambda sample_id: "RG" not in sample_id)
rr = rr[keep_mask].reset_index(drop=True)


def _strip_prefix(value):
    """Return the part of `value` that follows the first 'NCIPDM.' marker."""
    return value.split('NCIPDM.')[1]


# Strip the 'NCIPDM.' prefix from the model and sample ids, drop the Source column
rr = rr.rename(columns={"Model": "model"})
for id_col in ("model", "Sample"):
    rr[id_col] = rr[id_col].map(_strip_prefix)
rr = rr.drop(columns="Source")

# Everything as strings, except the integer response label.
# NOTE(review): astype(str) presumably turns a missing Drug2 into a literal
# string (e.g. 'nan') rather than keeping it as NaN -- verify before using isna().
rr = rr.astype(str).astype({"Response": np.int64})

print(rr.shape)
rr.head(2)

(5379, 6)
(5282, 5)


Unnamed: 0,model,Sample,Drug1,Drug2,Response
0,114434~197-R,114434~197-R~A35YC3,NSC.125973,,0
1,114434~197-R,114434~197-R~A35YC3,NSC.19893,,0


In [3]:
# Where is this example row coming from?
is_example = (
    (rr["Sample"] == "114551~080-T~M667M226C29")
    & (rr["Drug1"] == "NSC.777586")
    & (rr["Drug2"] == "NSC.756642")
)
rr[is_example]

Unnamed: 0,model,Sample,Drug1,Drug2,Response
33,114551~080-T,114551~080-T~M667M226C29,NSC.777586,NSC.756642,0


In [4]:
# Copy Drug1 to Drug2 in case of single drug treatments.
# A Drug2 entry is kept only when it is a string starting with "NSC."
# (a drug pair); otherwise Drug1 is copied into Drug2.
drug2 = [
    second if isinstance(second, str) and second.startswith("NSC.") else first
    for first, second in zip(rr["Drug1"], rr["Drug2"])
]

rr["Drug2"] = drug2
display(rr.head(2))
print(rr.shape)

Unnamed: 0,model,Sample,Drug1,Drug2,Response
0,114434~197-R,114434~197-R~A35YC3,NSC.125973,NSC.125973,0
1,114434~197-R,114434~197-R~A35YC3,NSC.19893,NSC.19893,0


(5282, 5)


In [5]:
# Create drug treatment string ids: the two drug ids, sorted and joined with "_",
# so both orderings of a pair map to the same id
rr["trt"] = [
    "_".join(sorted(drug_pair))
    for drug_pair in zip(rr["Drug1"], rr["Drug2"])
]
display(rr.head(2))
print(rr.shape)

Unnamed: 0,model,Sample,Drug1,Drug2,Response,trt
0,114434~197-R,114434~197-R~A35YC3,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973
1,114434~197-R,114434~197-R~A35YC3,NSC.19893,NSC.19893,0,NSC.19893_NSC.19893


(5282, 6)


In [6]:
# Create treatment groups (specimen-treatment pairs):
# one row per unique (model, trt) with a running integer id in `grp`
gg = (
    rr[["model", "trt"]]
    .drop_duplicates(subset=["model", "trt"])
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "grp"})
    [["model", "trt", "grp"]]
)
print(gg.shape)
print(gg["grp"].unique()[:20])
display(gg.head(2))

(1048, 3)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


Unnamed: 0,model,trt,grp
0,114434~197-R,NSC.125973_NSC.125973,0
1,114434~197-R,NSC.19893_NSC.19893,1


In [7]:
# Assign treatment groups into master df (adds the `grp` column to every row).
# NOTE: this cell overwrites `rr`; re-running it on the same kernel would merge
# a frame that already contains `grp`.
rr = pd.merge(rr, gg, how="inner", on=["model", "trt"])
print(rr.shape)
display(rr.head(15))

(5282, 7)


Unnamed: 0,model,Sample,Drug1,Drug2,Response,trt,grp
0,114434~197-R,114434~197-R~A35YC3,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0
1,114434~197-R,114434~197-R~A36YC9,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0
2,114434~197-R,114434~197-R~A38WG0JH1,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0
3,114434~197-R,114434~197-R~A38WG3J91,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0
4,114434~197-R,114434~197-R~A38WG5,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0
5,114434~197-R,114434~197-R~A35YC3,NSC.19893,NSC.19893,0,NSC.19893_NSC.19893,1
6,114434~197-R,114434~197-R~A36YC9,NSC.19893,NSC.19893,0,NSC.19893_NSC.19893,1
7,114434~197-R,114434~197-R~A38WG0JH1,NSC.19893,NSC.19893,0,NSC.19893_NSC.19893,1
8,114434~197-R,114434~197-R~A38WG3J91,NSC.19893,NSC.19893,0,NSC.19893_NSC.19893,1
9,114434~197-R,114434~197-R~A38WG5,NSC.19893,NSC.19893,0,NSC.19893_NSC.19893,1


In [8]:
# Augment drug-pair treatments
rr = rr.reset_index(drop=True)  # fresh 0..n-1 index, so labels equal positions
rr["aug"] = False

# Find ids of drug-pair treatments (rows where the two drugs differ)
is_pair = rr["Drug1"] != rr["Drug2"]
aug_ids = rr.index[is_pair].tolist()

# Mirrored copies: swap the Drug1/Drug2 column labels and flag the rows as augmented
df_aug = rr.loc[aug_ids].rename(columns={"Drug1": "Drug2", "Drug2": "Drug1"})
df_aug["aug"] = True
print(df_aug.shape)
display(df_aug.head(2))

(2709, 8)


Unnamed: 0,model,Sample,Drug2,Drug1,Response,trt,grp,aug
20,114551~080-T,114551~080-T~M667M226C29,NSC.125973,NSC.747599,0,NSC.125973_NSC.747599,4,True
21,114551~080-T,114551~080-T~M667M227C30,NSC.125973,NSC.747599,0,NSC.125973_NSC.747599,4,True


In [9]:
# Create and save the final drug response dataset:
# original rows plus the mirrored drug-pair rows, ordered by group, aug flag and sample
rsp_drug_pair = (
    pd.concat([rr, df_aug])
    .sort_values(by=["grp", "aug", "Sample"])
    .reset_index(drop=True)
)
print(rsp_drug_pair.shape)
print(rsp_drug_pair["Response"].value_counts())
rsp_drug_pair.to_csv("rsp_drug_pair.csv", index=False)

(7991, 8)
0    7543
1     448
Name: Response, dtype: int64


## Check that Yitan and I have the same dataframes

In [11]:
# Response (Yitan's Drug_Pair_Response.txt), with the 'NCIPDM.' prefix removed from Sample
datadir = Path("/vol/ml/apartin/projects/pdx-histo/data/PDX_Transfer_Learning_Classification/Processed_Data/Data_For_MultiModal_Learning")
rsp = pd.read_csv(datadir/"Drug_Pair_Response.txt", sep="\t").assign(
    Sample=lambda frame: frame['Sample'].map(lambda sample_id: sample_id.split('NCIPDM.')[1])
)
print(rsp.shape)

(6962, 6)


In [12]:
# First two rows of the reference response file
rsp.head(2)

Unnamed: 0,Sample,Image_ID,Drug1,Drug2,Response,Group
0,114434~197-R~A35YC3,27166,NSC.125973,NSC.125973,0,0
1,114434~197-R~A36YC9,25127,NSC.125973,NSC.125973,0,0


In [13]:
# First two rows of the dataset built in this notebook
rsp_drug_pair.head(2)

Unnamed: 0,model,Sample,Drug1,Drug2,Response,trt,grp,aug
0,114434~197-R,114434~197-R~A35YC3,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0,False
1,114434~197-R,114434~197-R~A36YC9,NSC.125973,NSC.125973,0,NSC.125973_NSC.125973,0,False


In [14]:
# -------------------
# Explore: outer-merge both frames and use the `_merge` indicator to see
# which frame each row comes from
# https://kanoki.org/2019/07/04/pandas-difference-between-two-dataframes/
# -------------------
mrg_cols = ["Sample", "Drug1", "Drug2", "Response"]

mrg_outer = pd.merge(rsp_drug_pair, rsp, how='outer', on=mrg_cols, indicator=True)
print('Outer merge', mrg_outer.shape)
print(mrg_outer['_merge'].value_counts())

# Rows present only in the right frame (`rsp`)
only_right = mrg_outer['_merge'] == 'right_only'
miss_r = mrg_outer[only_right].sort_values(mrg_cols, ascending=True)
print('Missing right items', miss_r.shape)

# Rows present only in the left frame (`rsp_drug_pair`)
only_left = mrg_outer['_merge'] == 'left_only'
miss_l = mrg_outer[only_left].sort_values(mrg_cols, ascending=True)
print('Missing left items', miss_l.shape)

Outer merge (7991, 11)
both          6962
left_only     1029
right_only       0
Name: _merge, dtype: int64
Missing right items (0, 11)
Missing left items (1029, 11)


In [15]:
# Rows of my dataset that are missing from Yitan's dataset, and their response balance
display(miss_l.head(4))
miss_l["Response"].value_counts()

Unnamed: 0,model,Sample,Drug1,Drug2,Response,trt,grp,aug,Image_ID,Group,_merge
125,114551~080-T,114551~080-T~M667M226C29,NSC.756642,NSC.777586,0,NSC.756642_NSC.777586,17,True,,,left_only
135,114551~080-T,114551~080-T~M667M226C29,NSC.764276,NSC.777586,0,NSC.764276_NSC.777586,18,True,,,left_only
120,114551~080-T,114551~080-T~M667M226C29,NSC.777586,NSC.756642,0,NSC.756642_NSC.777586,17,False,,,left_only
130,114551~080-T,114551~080-T~M667M226C29,NSC.777586,NSC.764276,0,NSC.764276_NSC.777586,18,False,,,left_only


0    912
1    117
Name: Response, dtype: int64

In [16]:
# Why is Yitan's df missing this sample?
in_rsp = (
    (rsp["Sample"] == "114551~080-T~M667M226C29")
    & (rsp["Drug1"] == "NSC.777586")
    & (rsp["Drug2"] == "NSC.756642")
)
rsp[in_rsp]

Unnamed: 0,Sample,Image_ID,Drug1,Drug2,Response,Group


In [17]:
# The same example row looked up in the dataset built in this notebook
in_rsp_drug_pair = (
    (rsp_drug_pair["Sample"] == "114551~080-T~M667M226C29")
    & (rsp_drug_pair["Drug1"] == "NSC.777586")
    & (rsp_drug_pair["Drug2"] == "NSC.756642")
)
rsp_drug_pair[in_rsp_drug_pair]

Unnamed: 0,model,Sample,Drug1,Drug2,Response,trt,grp,aug
120,114551~080-T,114551~080-T~M667M226C29,NSC.777586,NSC.756642,0,NSC.756642_NSC.777586,17,False


In [19]:
# Re-read the raw source file (no filtering) and look up the same example row
datadir = Path("/vol/ml/apartin/projects/pdx-histo/data/PDX_Transfer_Learning_Classification/Processed_Data/Data_For_MultiModal_Learning")
rsp_org = pd.read_csv(datadir/"ncipdm_drug_response", sep="\t").assign(
    Sample=lambda frame: frame['Sample'].map(lambda sample_id: sample_id.split('NCIPDM.')[1])
)
print(rsp_org.shape)
in_rsp_org = (
    (rsp_org["Sample"] == "114551~080-T~M667M226C29")
    & (rsp_org["Drug1"] == "NSC.777586")
    & (rsp_org["Drug2"] == "NSC.756642")
)
rsp_org[in_rsp_org]

(5379, 6)


Unnamed: 0,Source,Model,Sample,Drug1,Drug2,Response
35,NCIPDM,NCIPDM.114551~080-T,114551~080-T~M667M226C29,NSC.777586,NSC.756642,0


In [21]:
# Load cref_rna.csv and check that the example sample is present in it
datadir = Path("/vol/ml/apartin/projects/pdx-histo/data/PDX_Transfer_Learning_Classification/Processed_Data/Data_For_MultiModal_Learning")
cref_rna = pd.read_csv(datadir/"cref_rna.csv")
in_cref_rna = cref_rna["Sample"] == "114551~080-T~M667M226C29"
cref_rna[in_cref_rna]

Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,Sample,ge_AARS,ge_ABCB6,ge_ABCC5,ge_ABCF1,...,ge_ZMIZ1,ge_ZMYM2,ge_ZNF131,ge_ZNF274,ge_ZNF318,ge_ZNF395,ge_ZNF451,ge_ZNF586,ge_ZNF589,ge_ZW10
8,114551~080-T,114551,080-T,M667M226C29,16142,114551~080-T~M667M226C29,11.62224,8.220932,7.306632,9.989162,...,9.644486,7.913247,6.899494,7.100257,7.336669,9.721398,6.235037,7.321681,7.391286,8.801201


In [22]:
# Number of distinct models in each frame
print(rsp_drug_pair["model"].nunique())
print(cref_rna["model"].nunique())

97
99


In [23]:
# Compare the model ids present in the response data with those in cref_rna.
# Both sets are built from the `model` column, so despite their names they
# hold model ids, not sample ids.
rsp_samples = set(rsp_drug_pair["model"].values)
rna_samples = set(cref_rna["model"].values)

# Size of the overlap. This was previously a bare expression in the middle of
# the cell, so its value was computed but never shown.
print(len(rsp_samples.intersection(rna_samples)))

# Models only in the response data, then models only in cref_rna
print(rsp_samples.difference(rna_samples))
print(rna_samples.difference(rsp_samples))

set()
{'237351~077-R', '146476~266-R'}


In [24]:
# Example sample row in cref_rna
cref_rna.loc[cref_rna["Sample"] == "114551~080-T~M667M226C29"]

Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,Sample,ge_AARS,ge_ABCB6,ge_ABCC5,ge_ABCF1,...,ge_ZMIZ1,ge_ZMYM2,ge_ZNF131,ge_ZNF274,ge_ZNF318,ge_ZNF395,ge_ZNF451,ge_ZNF586,ge_ZNF589,ge_ZW10
8,114551~080-T,114551,080-T,M667M226C29,16142,114551~080-T~M667M226C29,11.62224,8.220932,7.306632,9.989162,...,9.644486,7.913247,6.899494,7.100257,7.336669,9.721398,6.235037,7.321681,7.391286,8.801201


In [None]:
# RNA
# Reads the RNA data file from `datadir` (not defined in this cell; it must
# already exist from an earlier cell). The commented-out line is an alternative input file.
ge = pd.read_csv(datadir/"Data_Before_Normalization.txt", sep="\t")
# ge = pd.read_csv(datadir/"Data_After_Normalization.txt", sep="\t")

# Transpose, then promote the first transposed row to column labels.
# NOTE(review): presumably the file is genes x samples with the gene names in
# its first column -- confirm against the file.
ge = ge.T
ge.columns = ge.iloc[0, :]
# Move the old index into a regular column called "Sample"
ge = ge.reset_index().rename(columns={"index": "Sample"})
# Drop the first row, which was just used as the header
ge = ge[1:]
ge = ge.sort_values("Sample").reset_index(drop=True)

print(ge.shape)
display(ge[:2])

In [44]:
# Drug descriptors (relies on `datadir` from an earlier cell)
dd_path = datadir/"Standardized_Filtered_Drug_Descriptor.txt"
dd = pd.read_csv(dd_path, sep="\t")

print(dd.shape)
display(dd.head(2))

(51, 1994)


Unnamed: 0,Drug,MW,AMW,Sp,Si,Mp,Mi,GD,nAT,nSK,...,F10[Cl-B],Uc,Ui,Hy,TPSA(NO),TPSA(Tot),SAacc,SAdon,Vx,VvdwMG
0,NSC.119875,-1.016132,6.391122,-1.784666,-1.817551,5.860684,2.185096,5.3587,-1.825539,-2.167443,...,-0.140028,-4.216219,-4.607371,2.960688,-1.428681,-1.430001,-2.164965,-1.338042,-1.69087,-1.690864
1,NSC.123127,0.656604,-0.146676,0.414969,0.431454,-0.453737,-0.496263,-0.536775,0.442488,0.621948,...,-0.140028,0.080286,0.088246,2.27798,2.41799,2.180521,2.632592,3.292942,0.486755,0.48676


In [47]:
# RNA-seq data (relies on `datadir` from an earlier cell)
rna_path = datadir/"Standardized_Normalized_RNA-seq_Data_lincs1000.txt"
rna = pd.read_csv(rna_path, sep="\t")
# rna = pd.read_csv(datadir/"Standardized_Unnormalized_RNA-seq_Data_lincs1000.txt", sep="\t")

print(rna.shape)
display(rna.head(2))

(494, 943)


Unnamed: 0,samples,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,NCIPDM.114434~197-R~A38WG0JH1,0.725539,-0.049166,0.527175,-0.295612,0.939099,1.991644,2.026727,-0.332576,1.528807,...,0.064793,1.133724,1.038655,0.824658,-0.23852,-0.146249,1.004284,1.31866,-0.067105,0.089109
1,NCIPDM.114434~197-R~A38WG3J91,1.85469,0.079387,0.671868,0.543544,1.349651,1.592091,1.856682,1.161365,1.164606,...,1.355155,1.446011,0.592178,0.72953,0.772658,1.919372,0.556639,1.402248,1.270158,0.218368
