# Drop Ambiguous Samples

In [1]:
!which python

/Users/apartin/anaconda3/envs/p1_mac/bin/python


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from glob import glob

import sklearn
import numpy as np
import pandas as pd
from math import sqrt

from sklearn.metrics import confusion_matrix
# from sklearn.metrics import ConfusionMatrixDisplay  # error

TO_PLOT = False

In [3]:
runs_dir_name = 'out_cls_r0'
# runs_dir_name = 'out_lgbm_cls'
# runs_dir_name = 'out_lgbm_cls_811_v1'
file_path = Path.cwd()
print(file_path)

runs_dir_path = Path(file_path / '..' / runs_dir_name)
runs_dirs = [Path(p) for p in glob(str(runs_dir_path/'run_*'))]

/Users/apartin/Box Sync/projects/DrugResponseViz/notebooks


In [4]:
# Load cell metadata
cancer_types = pd.read_csv(file_path/'../data/combined_cancer_types', sep='\t', names=['CELL', 'CTYPE'])
print(cancer_types.shape)
display(cancer_types[:2])

(14590, 2)


Unnamed: 0,CELL,CTYPE
0,CCLE.ALLSIL,Acute_Lymphoblastic_Leukemia
1,CCLE.DND41,Acute_Lymphoblastic_Leukemia


In [5]:
def plot_target(data_to_plot, y_name, x_name, kind='box', hue_name=None, height=None, aspect=None):
    """ Plot function. """
    if hue_name:
        g = sns.catplot(data=data_to_plot, y=y_name, x=x_name, kind=kind, hue=hue_name, **args);
    else:
        g = sns.catplot(data=data_to_plot, y=y_name, x=x_name, kind=kind, **args);
    return g

# Aggregate predictions from all runs (shuffles/splits)

In [6]:
def reorg_cols(df, col_first:str):
    """
    Args:
        col_first : col name to put first
    """
    cols = df.columns.tolist()
    cols.remove(col_first)
    return df[[col_first] + cols]
    
def agg_preds_from_cls_runs(runs_dirs, phase='_te.csv', verbose=False):
    """ Aggregate predictions bootstraped ML trainings. """
    prd = []
    for i, dir_name in enumerate(runs_dirs):
        if '_tr.csv' in phase:
            prd_ = pd.read_csv(dir_name/'preds_tr.csv')
        elif '_vl.csv' in phase:
            prd_ = pd.read_csv(dir_name/'preds_vl.csv')
        elif '_te.csv' in phase:
            prd_ = pd.read_csv(dir_name/'preds_te.csv')
        
        # prd_te_['err'] = abs(prd_te_['y_true'] - prd_te_['y_pred'])      # add col 'err'
        prd_['run'] = str(dir_name).split(os.sep)[-1].split('_')[-1]  # add col 'run' identifier
        prd.append(prd_)  # append run data

        if verbose:
            if i%20==0:
                print(f'Processing {dir_name}')
            
    # Aggregate to df
    prd = pd.concat(prd, axis=0)
    
    # Reorganize cols
    prd = reorg_cols(prd, col_first='run').sort_values('run').reset_index(drop=True).reset_index().rename(columns={'index': 'idx'})
    return prd

In [7]:
# Concat preds from all runs      
prd_te_all = agg_preds_from_cls_runs(runs_dirs, phase='_te.csv')
prd_te_all.insert(loc=2, column='source', value=[s.split('.')[0].lower() for s in prd_te_all['CELL']]) # add 'source' column

print(prd_te_all.shape)
display(prd_te_all[:2])

# Add CTYPE columns
prd_te_all = pd.merge(prd_te_all, cancer_types, on='CELL')
prd_te_all = reorg_cols(prd_te_all, col_first='CTYPE')
display( prd_te_all[:2] )

(3691900, 8)


Unnamed: 0,idx,run,source,CELL,DRUG,AUC,y_true,y_pred
0,0,s000,ctrp,CTRP.ESS-1,CTRP.445,0.9245,0,0.001339
1,1,s000,ctrp,CTRP.HD-MY-Z,CTRP.182,0.4161,1,0.580708


Unnamed: 0,CTYPE,idx,run,source,CELL,DRUG,AUC,y_true,y_pred
0,Uterine_Corpus_Endometrial_Carcinoma,0,s000,ctrp,CTRP.ESS-1,CTRP.445,0.9245,0,0.001339
1,Uterine_Corpus_Endometrial_Carcinoma,32,s000,ctrp,CTRP.ESS-1,CTRP.147,0.9428,0,0.020743


# Ambiguous samples

Some samples contain ambiguous true labels (both 0 and 1)

In [8]:
dir_name = runs_dirs[0]
prd_tr_ = pd.read_csv(dir_name/'preds_tr.csv')
prd_vl_ = pd.read_csv(dir_name/'preds_vl.csv')
prd_te_ = pd.read_csv(dir_name/'preds_te.csv')
rsp = pd.concat([prd_tr_, prd_vl_, prd_te_], axis=0)
rsp = rsp.reset_index().rename(columns={'index': 'idx'})
rsp.insert(loc=1, column='source', value=[s.split('.')[0].lower() for s in rsp['CELL']]) # add 'source' column

print(rsp.shape)
display(rsp[:2])

(369193, 7)


Unnamed: 0,idx,source,CELL,DRUG,AUC,y_true,y_pred
0,0,ctrp,CTRP.SNU-81,CTRP.153,1.0,0,0.00089
1,1,gdsc,GDSC.MDA-MB-175-VII,GDSC.1241,1.0,0,0.000348


In [9]:
df = rsp.groupby(['CELL', 'DRUG']).agg({'y_true': np.unique}).reset_index()
# df = prd_te_all.groupby(['CELL', 'DRUG']).agg({'y_true': np.unique}).reset_index()

df.insert(loc=1, column='source', value=[s.split('.')[0].lower() for s in df['CELL']]) # add 'source' column
# print(display(df.dtypes))

print('\nSome samples contain ambiguous true labels (both 0 and 1).')
print(df.y_true.value_counts()[:4])

print('\nThe unique types')
print(np.unique([str(type(x)) for x in df.y_true]))

print('\nCreate col indicating the number of unique responses per sample.')
df['y_true_unq_vals'] = df.y_true.map(lambda x: len(x) if type(x)==np.ndarray else 1)

print('\nPrint bincount.')
print(df.y_true_unq_vals.value_counts())

print('\nExtract ambiguous samples.')
df_amb = df[ df.y_true_unq_vals > 1 ].reset_index(drop=True)
print(df_amb.shape)
display(df_amb[:2])


Some samples contain ambiguous true labels (both 0 and 1).
0    356293
1     12900
Name: y_true, dtype: int64

The unique types
["<class 'int'>"]

Create col indicating the number of unique responses per sample.

Print bincount.
1    369193
Name: y_true_unq_vals, dtype: int64

Extract ambiguous samples.
(0, 5)


Unnamed: 0,CELL,source,DRUG,y_true,y_true_unq_vals


In [10]:
def show_amb_samples(data, cell_name, drug_name):
    data = data.copy()
    return data[ (data.CELL==cell_name) & (data.DRUG==drug_name) ]

i = 0
kk = show_amb_samples(rsp, cell_name=df_amb.loc[i,'CELL'], drug_name=df_amb.loc[i,'DRUG'])
# kk['AUC'].value_counts()

KeyError: 0

In [11]:
kk

NameError: name 'kk' is not defined

In [21]:
# Find indices of all the ambiguous samples 
idx = np.array([False for i in range(rsp.shape[0])])
for i in range(df_amb.shape[0]):
    cell_name = df_amb.loc[i,'CELL']
    drug_name = df_amb.loc[i,'DRUG']
    idx_ = (rsp.CELL==cell_name) & (rsp.DRUG==drug_name)
    idx = idx | idx_

In [25]:
print('Total samples sue to ambiguous labels', sum(idx))
print('rsp', rsp.shape)
rsp_new = rsp[ ~idx ]
print('rsp_new', rsp_new.shape)
print((rsp.shape[0] - rsp_new.shape[0])/rsp.shape[0]*100)

Total samples sue to ambiguous labels 40016
rsp (529940, 7)
rsp_new (489924, 7)
0.07551043514360116
7.551043514360116
