# Preparation of ICB data

**Authorship:**
Adam Klie, *03/29/2022*
***
**Description:**
Notebook for prepping ICB data for MultiomicDatasetICB class and for model training

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Fix copy number imputation to something more realistic, currently imputed to mean of training set (which means model inputs are 0)</li></b>
    <b><li>Sanity check individuals in each file</li></b>
    </ul>
</div>

In [17]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

import os
import sys
sys.path.append("../drpredict")

# Autoreload extension: only issue %load_ext when the extension is not already
# listed among this kernel's loaded extensions, so the cell can be re-run safely.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2 of the autoreload magic (see IPython docs: reload all modules before
# executing code), so edits to modules found via the sys.path entry above are picked up.
%autoreload 2

In [18]:
# Input locations for the whole notebook.
# ICB_DATA: directory holding the ICB tables to clean. The default is the original
# absolute path; set the ICB_DATA environment variable to point at another copy
# of the data without editing the notebook.
# TRAINING_DATA: file-name prefix of the training tables actually used for the models.
ICB_DATA = os.environ.get(
    "ICB_DATA",
    "/cellar/users/aklie/projects/hackathons/data/multiomic_drug_response",
)  # ICB data to clean
TRAINING_DATA = "../data/training_top2000"  # Actual training data used for models

## Expression
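ICB expression is restricted to the training gene set. Missing values are imputed with the per-gene mean across ICB samples; genes with no ICB values at all are set to 0.<br>
ICB values are then z-scored using the training-set mean and standard deviation.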

In [19]:
# Training expression matrix: tab-separated, first column used as the row index.
# The column index is kept separately so the ICB table can be aligned to it.
training_expression = pd.read_csv(
    "{}_raw_expression.tsv".format(TRAINING_DATA),
    sep="\t",
    index_col=0,
)
training_exp_genes = training_expression.columns
training_expression.head()

Unnamed: 0,A1BG,A2M,A4GALT,ABCA3,ABCB1,ABCC2,ABCC3,ABCG1,ABI3BP,ABLIM1,...,ZNF185,ZNF528,ZNF532,ZNF595,ZNF608,ZNF677,ZNF703,ZNF711,ZNF880,ZSCAN18
22RV1_PROSTATE,1.722466,2.0,0.238787,5.285772,0.422233,3.300124,0.356144,0.594549,0.226509,5.321207,...,2.077243,3.168321,2.134221,7.393176,3.107688,3.920293,0.659925,2.615887,3.786596,1.678072
2313287_STOMACH,0.189034,0.014355,1.541019,2.07382,0.201634,5.954196,7.334318,6.214319,0.15056,7.955069,...,4.152183,0.042644,3.521051,0.659925,0.084064,0.070389,3.505891,2.100978,0.475085,0.163499
253JBV_URINARY_TRACT,0.505891,0.238787,4.261531,5.167117,1.475085,2.301588,7.062748,0.641546,0.275007,4.963474,...,1.941106,0.176323,4.275752,1.62293,4.980939,0.0,1.516015,0.028569,0.594549,3.436961
253J_URINARY_TRACT,1.028569,0.042644,3.529821,4.82985,4.374344,2.57289,6.660353,0.992768,0.970854,4.808385,...,1.922198,0.176323,3.838952,1.459432,5.418527,0.014355,1.584963,0.042644,1.327687,4.031219
42MGBA_CENTRAL_NERVOUS_SYSTEM,5.651339,0.163499,1.0,0.163499,0.014355,1.195348,2.778209,0.389567,3.569248,1.042644,...,0.941106,4.346957,5.139142,3.0268,4.604071,3.553361,4.032101,2.807355,2.521051,4.844988


In [20]:
# ICB expression matrix: tab-separated, first column used as the row index.
# The row labels (individual IDs) are kept for selecting ICB rows later.
icb_expression = pd.read_csv(
    os.path.join(ICB_DATA, "icb_expression.tsv"),
    sep="\t",
    index_col=0,
)
icb_exp_individuals = icb_expression.index

In [21]:
# Align ICB expression to the training gene set and impute gaps.
# Stacking the two tables and selecting the training columns creates NaN columns
# for genes the ICB table lacks; .loc then keeps only the ICB individuals.
# Imputation is two-step: per-gene mean over the ICB rows first, then 0 for
# anything still missing (a gene with no ICB values has a NaN mean, so it ends up 0).
icb_expression_harmonized = (
    pd.concat([training_expression, icb_expression])[training_exp_genes]
    .loc[icb_exp_individuals]
    .pipe(lambda frame: frame.fillna(frame.mean(axis=0)))
    .fillna(0)
)
icb_expression_harmonized.head()

Unnamed: 0_level_0,A1BG,A2M,A4GALT,ABCA3,ABCB1,ABCC2,ABCC3,ABCG1,ABI3BP,ABLIM1,...,ZNF185,ZNF528,ZNF532,ZNF595,ZNF608,ZNF677,ZNF703,ZNF711,ZNF880,ZSCAN18
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR2682026,0.117333,8.599261,0.634999,2.774474,3.339205,6.621821,1.236723,2.93499,1.334097,3.430294,...,0.0,3.010184,5.799048,0.0,2.666754,0.040744,3.592009,2.629178,3.173845,0.300848
SRR2773005,0.440759,10.684422,2.344377,5.131704,1.939467,1.823216,1.610593,2.207234,3.659484,3.961681,...,0.0,2.542694,5.922764,0.0,6.468791,0.052652,4.372008,0.453212,3.884101,0.408211
SRR2698019,0.0,10.636949,5.239199,4.337606,4.093312,4.902448,3.332994,4.65433,8.326794,6.145893,...,0.0,3.238479,5.205104,0.0,3.909655,0.725125,4.038517,3.674133,3.68577,1.358698
SRR2729983,0.09208,10.785505,1.821852,3.16233,0.603492,6.41728,0.43483,3.766221,0.960958,4.515215,...,0.0,1.929571,1.498074,0.0,4.611191,0.169636,5.92549,0.073215,3.80179,0.170525
SRR2677014,0.184841,10.029525,2.78988,1.819502,3.378697,4.087398,3.446921,3.679218,4.366018,2.146933,...,0.0,2.581993,4.766028,0.0,4.967053,0.102913,4.820432,0.182646,1.667253,0.619849


In [22]:
# Z-score ICB expression with statistics learned from the training matrix only:
# the scaler is fitted on training_expression and merely applied to the ICB data.
scaler = StandardScaler().fit(training_expression)
icb_exp_norm = pd.DataFrame(
    scaler.transform(icb_expression_harmonized),
    columns=icb_expression_harmonized.columns,
    index=icb_expression_harmonized.index,
)
icb_exp_norm.head()

Unnamed: 0_level_0,A1BG,A2M,A4GALT,ABCA3,ABCB1,ABCC2,ABCC3,ABCG1,ABI3BP,ABLIM1,...,ZNF185,ZNF528,ZNF532,ZNF595,ZNF608,ZNF677,ZNF703,ZNF711,ZNF880,ZSCAN18
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR2682026,-1.366795,4.791229,-0.943734,0.213995,1.339079,2.519332,-0.801684,0.830008,0.100582,-0.322142,...,-1.524291,0.222034,1.290267,-1.461485,0.052736,-0.699347,0.590218,0.526855,0.466565,-1.044058
SRR2773005,-1.208972,6.066286,0.073933,1.556436,0.494973,0.03014,-0.656356,0.382017,1.496349,-0.020044,...,-1.524291,-0.034164,1.36188,-1.461485,2.174671,-0.693152,1.040542,-0.798626,0.84463,-0.996603
SRR2698019,-1.42405,6.037257,1.797345,1.104197,1.79384,1.627437,0.013165,1.888397,4.297809,1.2217,...,-1.524291,0.347146,0.946464,-1.461485,0.746405,-0.343304,0.848005,1.163385,0.739059,-0.576479
SRR2729983,-1.379118,6.128098,-0.237149,0.434879,-0.310681,2.413229,-1.113391,1.341696,-0.123388,0.294645,...,-1.524291,-0.370174,-1.199345,-1.461485,1.137936,-0.632292,1.937428,-1.0301,0.800817,-1.101663
SRR2677014,-1.333853,5.665823,0.33916,-0.329861,1.362894,1.204645,0.05745,1.288139,1.920432,-1.051744,...,-1.524291,-0.012627,0.692305,-1.461485,1.336544,-0.667004,1.299435,-0.96344,-0.335385,-0.903057


## Copy number
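ICB copy number is restricted to the training gene set. Missing values are imputed with the per-gene mean across ICB samples; genes with no ICB values at all are set to the training-set mean.<br>
ICB values are then z-scored using the training-set mean and standard deviation, so genes filled with the training mean become 0 after scaling (see the TODO at the top).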

In [23]:
# Training copy-number matrix: tab-separated, first column used as the row index.
# The column index is kept separately so the ICB table can be aligned to it.
training_cn = pd.read_csv(
    "{}_raw_cn.tsv".format(TRAINING_DATA),
    sep="\t",
    index_col=0,
)
training_cn_genes = training_cn.columns
training_cn.head()

Unnamed: 0,AAMDC,AC000061.1,AC000065.1,AC000065.2,AC000111.1,AC000111.2,AC002066.1,AC002465.1,AC002542.1,AC002542.2,...,ZMPSTE24,ZMYND8,ZNF217,ZNF536,ZNF572,ZNF652,ZNF684,ZNF705G,ZNF713,ZPBP2
22RV1_PROSTATE,0.970197,1.442398,1.442398,1.442398,1.442398,1.442398,1.442398,1.442398,1.442398,1.442398,...,0.977848,0.966654,0.966654,0.987958,0.97972,0.967526,0.977848,1.414128,1.441106,0.967526
2313287_STOMACH,0.935425,0.999227,0.999227,0.999227,0.999227,0.999227,0.999227,0.999227,0.999227,0.999227,...,0.991146,1.720671,1.720671,0.98789,2.094306,0.982077,0.991146,0.863959,0.999227,0.982077
253JBV_URINARY_TRACT,0.811934,1.488565,1.488565,1.488565,1.488565,1.488565,1.488565,1.488565,1.488565,1.488565,...,0.808111,1.175619,1.175619,0.827625,1.14555,1.169014,0.808111,0.80133,1.488565,1.169014
253J_URINARY_TRACT,0.757237,1.472458,1.472458,1.472458,1.472458,1.472458,1.472458,1.472458,1.472458,1.472458,...,0.756745,1.1221,1.1221,1.067416,0.780298,1.104115,0.756745,0.766166,1.472458,1.104115
42MGBA_CENTRAL_NERVOUS_SYSTEM,1.165997,0.78204,1.245202,1.245202,0.78204,0.78204,0.78204,0.78204,0.78204,0.78204,...,1.492869,1.88146,1.88146,1.011981,1.190397,0.790102,1.492869,0.777713,1.316835,0.790102


In [25]:
# ICB copy-number matrix: comma-separated, first column used as the row index.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
icb_cn = pd.read_csv(
    os.path.join(ICB_DATA, "copy_number/icb_cn.csv"),
    low_memory=False,
    index_col=0,
)
icb_cn_individuals = icb_cn.index
icb_cn.head()

Unnamed: 0_level_0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
nml_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR10801649,0.247222,0.247222,-0.089791,0.18829,0.18829,0.18829,-0.173439,0.145021,0.476957,0.077735,...,0.154816,-0.089791,0.462458,0.462458,0.476957,-0.173439,-0.173439,0.946532,0.170225,-0.023534
SRR10801752,-0.197081,-0.197081,0.035895,0.144636,0.144636,0.144636,-0.103844,0.238842,0.154975,-0.070448,...,0.111233,0.035895,-0.083203,-0.083203,0.154975,-0.103844,-0.103844,0.428383,-0.096447,-0.01958
SRR10801682,0.18578,0.18578,-0.052984,-0.059256,-0.059256,-0.059256,-0.029052,0.001797,0.082506,0.171813,...,0.10772,-0.052984,0.384545,0.384545,0.082506,-0.029052,-0.029052,0.357444,0.060536,-0.029052
SRR10801753,-0.414825,-0.414825,-0.416273,-0.17348,-0.17348,-0.17348,-0.331607,0.060704,-0.460055,0.21107,...,0.290158,-0.416273,-0.09187,-0.09187,-0.426433,-0.531948,-0.531948,0.226498,0.287984,-0.396614
SRR10801689,-0.399262,-0.399262,-0.923063,0.372925,0.372925,0.372925,0.046569,-0.017537,0.064268,-0.280514,...,0.071813,-0.923063,0.380547,0.380547,0.074581,0.022152,0.022152,0.850055,-0.360521,0.002118


In [26]:
# Align ICB copy number to the training gene set and impute gaps.
# Stacking the two tables and selecting the training columns creates NaN columns
# for genes the ICB table lacks; .loc keeps only the ICB individuals with a non-null ID.
# Imputation is two-step: per-gene mean over the ICB rows first, then the
# training-set mean for genes that have no ICB values at all.
icb_cn_harmonized = (
    pd.concat([training_cn, icb_cn])[training_cn_genes]
    .loc[icb_cn_individuals.dropna()]
    .pipe(lambda frame: frame.fillna(frame.mean(axis=0)))
    .fillna(training_cn.mean(axis=0))
)
icb_cn_harmonized.head()

Unnamed: 0,AAMDC,AC000061.1,AC000065.1,AC000065.2,AC000111.1,AC000111.2,AC002066.1,AC002465.1,AC002542.1,AC002542.2,...,ZMPSTE24,ZMYND8,ZNF217,ZNF536,ZNF572,ZNF652,ZNF684,ZNF705G,ZNF713,ZPBP2
SRR10801649,-0.484696,1.203279,1.29688,1.29688,1.202913,1.19879,1.193875,1.210098,1.210098,1.209967,...,-0.173439,0.178986,0.178986,0.103192,-0.053036,0.167814,-0.173439,1.036177,0.889796,0.167814
SRR10801752,-0.097879,1.203279,1.29688,1.29688,1.202913,1.19879,1.193875,1.210098,1.210098,1.209967,...,-0.103844,-0.095784,-0.095784,-0.197081,0.330348,-0.066504,-0.103844,1.036177,-0.174738,-0.117237
SRR10801682,-0.000268,1.203279,1.29688,1.29688,1.202913,1.19879,1.193875,1.210098,1.210098,1.209967,...,-0.029052,-0.017924,-0.017924,0.009913,0.153341,0.07029,-0.029052,1.036177,0.00647,0.07029
SRR10801753,-0.488745,1.203279,1.29688,1.29688,1.202913,1.19879,1.193875,1.210098,1.210098,1.209967,...,-0.377594,0.300638,0.300638,-0.57539,0.109315,-0.277574,-0.377594,1.036177,0.956234,0.389573
SRR10801689,0.04229,1.203279,1.29688,1.29688,1.202913,1.19879,1.193875,1.210098,1.210098,1.209967,...,0.022152,-0.017534,-0.017534,-0.513936,0.828157,-0.30946,0.022152,1.036177,-0.007326,-0.30946


In [27]:
# Z-score ICB copy number with statistics learned from the training matrix only.
# Columns that were filled with the training mean come out as ~0 after scaling.
# NOTE(review): in the displayed heads, ICB values sit around 0 while training
# values sit around 1 — confirm both tables are on the same scale before trusting
# these z-scores.
scaler = StandardScaler().fit(training_cn)
icb_cn_norm = pd.DataFrame(
    scaler.transform(icb_cn_harmonized),
    columns=icb_cn_harmonized.columns,
    index=icb_cn_harmonized.index,
)
icb_cn_norm.head()

Unnamed: 0,AAMDC,AC000061.1,AC000065.1,AC000065.2,AC000111.1,AC000111.2,AC002066.1,AC002465.1,AC002542.1,AC002542.2,...,ZMPSTE24,ZMYND8,ZNF217,ZNF536,ZNF572,ZNF652,ZNF684,ZNF705G,ZNF713,ZPBP2
SRR10801649,-2.21011,5.299579e-16,4.101689e-16,4.101689e-16,2.649899e-16,2.657088e-16,2.912571e-16,2.533636e-16,2.533636e-16,0.0,...,-0.924486,-1.450066,-1.29115,-1.552966,-0.806767,-1.630764,-1.033122,-5.768377e-16,-0.283136,-0.474741
SRR10801752,-1.677456,5.299579e-16,4.101689e-16,4.101689e-16,2.649899e-16,2.657088e-16,2.912571e-16,2.533636e-16,2.533636e-16,0.0,...,-0.875894,-1.812401,-1.598599,-2.013531,-0.58796,-2.014053,-0.977815,-5.768377e-16,-0.917854,-0.589782
SRR10801682,-1.543044,5.299579e-16,4.101689e-16,4.101689e-16,2.649899e-16,2.657088e-16,2.912571e-16,2.533636e-16,2.533636e-16,0.0,...,-0.823674,-1.709728,-1.511479,-1.696039,-0.688983,-1.790289,-0.918379,-5.768377e-16,-0.809811,-0.5141
SRR10801753,-2.215686,5.299579e-16,4.101689e-16,4.101689e-16,2.649899e-16,2.657088e-16,2.912571e-16,2.533636e-16,2.533636e-16,0.0,...,-1.067028,-1.289646,-1.15503,-2.593789,-0.71411,-2.359312,-1.195361,-5.768377e-16,-0.243523,-0.385244
SRR10801689,-1.484439,5.299579e-16,4.101689e-16,4.101689e-16,2.649899e-16,2.657088e-16,2.912571e-16,2.533636e-16,2.533636e-16,0.0,...,-0.787923,-1.709214,-1.511043,-2.499529,-0.303847,-2.41147,-0.877688,-5.768377e-16,-0.818036,-0.66736


## Mutation
Missing values imputed to 0

In [28]:
# Training mutation matrix: tab-separated, first column used as the row index.
# The column index is kept separately so the ICB table can be aligned to it.
training_mutation = pd.read_csv(
    "{}_mutations.tsv".format(TRAINING_DATA),
    sep="\t",
    index_col=0,
)
training_mutation_genes = training_mutation.columns
training_mutation.head()

Unnamed: 0,A2M,A2ML1,AATK,ABCA1,ABCA12,ABCA13,ABCA2,ABCA3,ABCA4,ABCA5,...,ZNF99,ZNFX1,ZP4,ZRANB3,ZSCAN10,ZSCAN18,ZSCAN20,ZSWIM6,ZSWIM8,ZZEF1
22RV1_PROSTATE,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2313287_STOMACH,1,1,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,1,0,0,0
253JBV_URINARY_TRACT,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
253J_URINARY_TRACT,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
42MGBA_CENTRAL_NERVOUS_SYSTEM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [29]:
# Load ICB mutations, align them to the training gene set, and impute NaNs with 0
# (i.e. assume no mutation where the ICB table has no entry).
icb_mutation = pd.read_csv(
    os.path.join(ICB_DATA, "mutation/icb_mutations.csv"),
    index_col=0,
)
icb_mutation_individuals = icb_mutation.index
# Stack with the training table, keep the training columns, then keep only the
# ICB individuals that have a non-null ID.
icb_mutation_harmonized = (
    pd.concat([training_mutation, icb_mutation])[training_mutation_genes]
    .loc[icb_mutation_individuals.dropna()]
    .fillna(0)
)
icb_mutation_harmonized.head()

Unnamed: 0_level_0,A2M,A2ML1,AATK,ABCA1,ABCA12,ABCA13,ABCA2,ABCA3,ABCA4,ABCA5,...,ZNF99,ZNFX1,ZP4,ZRANB3,ZSCAN10,ZSCAN18,ZSCAN20,ZSWIM6,ZSWIM8,ZZEF1
nml_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR2648152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR5134809,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
SRR5134756,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
SRR2648106,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR2660259,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


## Metadata

In [31]:
# Load metadata (tab-separated, first column used as the row index)
icb_metadata = pd.read_csv(os.path.join(ICB_DATA, "icb_metadata.tsv"), sep="\t", index_col=0)

# Individuals present in all three processed modalities (copy number, expression, mutation).
# The IDs are sorted because iterating a set of strings has no stable order between
# kernel sessions; sorting makes the row order of every exported file reproducible.
# The copy-number and mutation frames were selected with null IDs dropped, so the
# intersection contains no NaN label that could break the sort.
intersect_patients = sorted(
    set(icb_cn_norm.index)
    & set(icb_exp_norm.index)
    & set(icb_mutation_harmonized.index)
)
icb_metadata.columns

Index(['study', 'patient id', 'Age', 'Gender', 'Response', 'OS', 'OS.time',
       'PFS', 'PFS.time', 'Biopsy Time', 'tumor RNA id', 'BCR_Shannon',
       'TCR_Shannon', 'BCR_Richness', 'TCR_Richness', 'BCR_Evenness',
       'TCR_Evenness', 'tumor WXS id', 'cancer', 'partial.coding',
       'partial.sd.coding', 'complete.coding', 'partial.sd.surv.coding',
       'nonresponder.sd.coding', 'response_crist_sd', 'response_crist_sd_surv',
       'response_crist_partial', 'response_crist_complete', 'study_cancer'],
      dtype='object')

In [None]:
# Labels file: metadata rows for the shared individuals plus an "auc" column.
# The loader requires an AUC column, so a dummy value of 0 is written
# (replace it with real values if you have them).
icb_final = icb_metadata.loc[intersect_patients].assign(auc=0)
icb_final.to_csv(
    "../data/icb_top2000_labels.tsv",
    index=True,
    sep="\t",
)

In [168]:
# Expression: export the z-scored rows for the shared individuals as-is
# (no de-duplication step is applied here).
icb_exp_final = icb_exp_norm.loc[intersect_patients]
icb_exp_final.to_csv(
    "../data/icb_top2000_expression.tsv",
    index=True,
    sep="\t",
)

In [256]:
# Mutation has some duplicate individuals for some reason.
# The rows must come from the harmonized *mutation* matrix, not the copy-number
# frame, so that the mutations file actually contains the 0/1 mutation calls.
icb_mutation_final = icb_mutation_harmonized.loc[intersect_patients]
# Keep only the first row for each repeated individual ID
icb_mutation_final = icb_mutation_final[~icb_mutation_final.index.duplicated()]
icb_mutation_final.to_csv("../data/icb_top2000_mutations.tsv", sep="\t", index=True)

In [169]:
# Copy number: take the z-scored rows for the shared individuals, then keep only
# the first row of each repeated individual ID before exporting.
icb_cn_final = (
    icb_cn_norm
    .loc[intersect_patients]
    .loc[lambda frame: ~frame.index.duplicated()]
)
icb_cn_final.to_csv(
    "../data/icb_top2000_cn.tsv",
    index=True,
    sep="\t",
)

# ICB Drug encoding

In [208]:
# Test for nivo fingerprint: read the header-less file and take its first row
# as a NumPy array.
val = (
    pd.read_csv(
        "/cellar/users/aklie/projects/hackathons/multiomic_drug_response/icb_files/nivo2fingerprint.txt",
        header=None,
    )
    .iloc[0]
    .values
)