# EDA and post-processing

In [8]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
assert sys.version_info >= (3, 5)

from pathlib import Path
from pprint import pprint
import glob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import (confusion_matrix, balanced_accuracy_score, f1_score, matthews_corrcoef,
                             roc_auc_score, recall_score, precision_score, precision_recall_curve)

import tensorflow as tf
from tensorflow import keras
# Require TensorFlow 2.x or newer. The major version is compared as an integer
# because comparing version strings is lexicographic ("10.0" >= "2.0" is False).
assert int(tf.__version__.split(".")[0]) >= 2

# Seed the NumPy and TensorFlow global RNGs with the same value
seed = 42
for _seed_fn in (np.random.seed, tf.random.set_seed):
    _seed_fn(seed)

# Append the parent of the working directory to sys.path so that the
# `src` package imported right below can be resolved.
fdir = Path.cwd()
print(fdir)
sys.path.append(str(fdir.joinpath("..")))
import src
# from config import cfg
from src.config import cfg
from src.ml.scale import get_scaler
from src.utils.utils import Params, dump_dict, read_lines, cast_list, Timer
from src.datasets.tidy import split_data_and_extract_fea, extract_fea

from src.tf_utils import (calc_records_in_tfr_folder, calc_examples_in_tfrecord, get_tfr_files,
                          _float_feature, _bytes_feature, _int64_feature)
from src.sf_utils import read_annotations, green, parse_tfrec_fn_rsp, create_tf_data
from src.tfrecords import FEA_SPEC_RNA, FEA_SPEC_RSP_DRUG_PAIR

# print_fn: output function used by the reporting statements of this notebook.
# split_on: metadata column name passed to the split helper and used to compare
#           group membership across the train/val/test splits.
print_fn, split_on = print, "Group"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/vol/ml/apartin/projects/pdx-histo/nbs


In [9]:
# Project and dataset identifiers used to build the paths below
prjname, dataname = "bin_rsp_drug_pairs_all_samples", "tidy_drug_pairs_all_samples"
prjdir = cfg.MAIN_PRJDIR / prjname

# Load the annotations table of the processed dataset.
# NOTE(review): the recorded output of this cell ends with a warning fragment; if it
# is a pandas DtypeWarning (mixed-type columns), consider passing explicit dtypes
# to read_csv — TODO confirm.
annotations_file = cfg.DATA_PROCESSED_DIR / dataname / cfg.SF_ANNOTATIONS_FILENAME
data = pd.read_csv(annotations_file)
print(data.shape)  # (rows, columns)

(6962, 4950)


  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# One scaler per feature set; start from None so the names always exist
ge_scaler = dd1_scaler = dd2_scaler = None

# Column names of each feature set, selected by their name prefix
ge_cols, dd1_cols, dd2_cols = (
    [col for col in data.columns if col.startswith(prefix)]
    for prefix in ("ge_", "dd1_", "dd2_")
)

# Build the scalers from the corresponding columns.
# NOTE(review): get_scaler receives the full table here, i.e. before the
# train/val/test split below; if it fits on its input, test rows influence
# the scaling — TODO confirm against src.ml.scale.
ge_scaler = get_scaler(data[ge_cols])
dd1_scaler = get_scaler(data[dd1_cols])
dd2_scaler = get_scaler(data[dd2_cols])

In [11]:
# Folder holding the pre-computed, drug-specific data partitions, narrowed
# down to a single drug-pair sub-folder.
splitdir = (
    cfg.DATADIR
    / "PDX_Transfer_Learning_Classification/Processed_Data/Data_For_MultiModal_Learning/Data_Partition_Drug_Specific"
    / "NSC.125973-NSC.125973"  # params.drug_specific
)
# Which cv_<split_id> sub-folder to read the id lists from
split_id = 0

In [12]:
# Read the train / validation / test id lists of the selected CV split as ints
tr_id, vl_id, te_id = (
    cast_list(read_lines(str(splitdir / f"cv_{split_id}" / list_name)), int)
    for list_name in ("TrainList.txt", "ValList.txt", "TestList.txt")
)

# Update ids: keep only those present in the index column of `data`, sorted
index_col_name = "index"
tr_id, vl_id, te_id = (
    sorted(set(data[index_col_name]) & set(split_ids))
    for split_ids in (tr_id, vl_id, te_id)
)

In [13]:
# Keyword arguments shared by all three calls below: feature column names,
# their scalers and dtypes, plus the id column and the grouping column.
kwargs = dict(
    ge_cols=ge_cols,
    dd1_cols=dd1_cols,
    dd2_cols=dd2_cols,
    ge_scaler=ge_scaler,
    dd1_scaler=dd1_scaler,
    dd2_scaler=dd2_scaler,
    ge_dtype=cfg.GE_DTYPE,
    dd_dtype=cfg.DD_DTYPE,
    index_col_name=index_col_name,
    split_on=split_on,
)

# For each split, get the ge / dd1 / dd2 features and the metadata of its ids
tr_ge, tr_dd1, tr_dd2, tr_meta = split_data_and_extract_fea(data, ids=tr_id, **kwargs)
vl_ge, vl_dd1, vl_dd2, vl_meta = split_data_and_extract_fea(data, ids=vl_id, **kwargs)
te_ge, te_dd1, te_dd2, te_meta = split_data_and_extract_fea(data, ids=te_id, **kwargs)

# tr_meta.to_csv(outdir/"tr_meta.csv", index=False)
# vl_meta.to_csv(outdir/"vl_meta.csv", index=False)
# te_meta.to_csv(outdir/"te_meta.csv", index=False)

# Per-sample feature shapes, taken from the second dimension of the train features
ge_shape, dd_shape = (tr_ge.shape[1],), (tr_dd1.shape[1],)

# For every split: number of unique groups and samples per (ctype, Response)
for _split_title, _split_meta in (
    ("\nTrain:", tr_meta),
    ("\nValidation:", vl_meta),
    ("\nTest:", te_meta),
):
    print_fn(_split_title)
    print_fn(
        _split_meta.groupby(["ctype", "Response"])
        .agg({"grp_name": "nunique", "smp": "nunique"})
        .reset_index()
    )

# Make sure no sample id appears in more than one split.
# Each message names the pair of splits that the assertion actually checks.
assert set(tr_id).isdisjoint(vl_id), "Overlapping indices btw tr and vl"
assert set(tr_id).isdisjoint(te_id), "Overlapping indices btw tr and te"
assert set(vl_id).isdisjoint(te_id), "Overlapping indices btw vl and te"

# Print the size of each split and its share of the table.
# The denominator is the row count of the full `data` table, not the
# combined size of the three splits.
print_fn("")
for _ratio_fmt, _ratio_ids in (
    ("Train samples {} ({:.2f}%)", tr_id),
    ("Val   samples {} ({:.2f}%)", vl_id),
    ("Test  samples {} ({:.2f}%)", te_id),
):
    print_fn(_ratio_fmt.format(len(_ratio_ids), 100*len(_ratio_ids)/data.shape[0]))

# Unique values of the `split_on` column found in the metadata of each split
tr_grp_unq, vl_grp_unq, te_grp_unq = (
    set(split_meta[split_on].values) for split_meta in (tr_meta, vl_meta, te_meta)
)

print_fn("")
# How many `split_on` values are shared by each pair of splits
print_fn(f"Total intersects on {split_on} btw tr and vl: {len(tr_grp_unq & vl_grp_unq)}")
print_fn(f"Total intersects on {split_on} btw tr and te: {len(tr_grp_unq & te_grp_unq)}")
print_fn(f"Total intersects on {split_on} btw vl and te: {len(vl_grp_unq & te_grp_unq)}")
# How many distinct `split_on` values each split contains
print_fn(f"Unique {split_on} in tr: {len(tr_grp_unq)}")
print_fn(f"Unique {split_on} in vl: {len(vl_grp_unq)}")
print_fn(f"Unique {split_on} in te: {len(te_grp_unq)}")


Train:
                        ctype  Response  grp_name  smp
0          bladder/urothelial         0         1    5
1  digestive/gastrointestinal         0         4   22
2                 gynecologic         0         2   10
3                      kidney         0         1    5
4                        lung         1         1    6
5                    melanoma         0         2    7
6        sarcoma/mesothelioma         0         2   10
7                    squamous         0         5   31

Validation:
                          ctype  Response  grp_name  smp
0            bladder/urothelial         0         1    6
1    digestive/gastrointestinal         0         7   33
2  endocrine and neuroendocrine         0         1    6
3                        kidney         0         1    3
4                      melanoma         0         1    6
5          sarcoma/mesothelioma         0         3   13
6                    skin other         1         1    3
7                      squam