In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import pathlib
import argparse
import datetime
from time import time
from pprint import pprint

import sklearn
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

In [2]:
# Notebook configuration.
# SEED is passed as `random_state` when the rows are shuffled after loading. A fixed
# integer makes that shuffle repeatable across kernel restarts (None re-seeds every run).
SEED = 0
# Passed as `test_size` to the scikit-learn splitters used for the 'none'/'cell'/'drug' splits.
test_size = 0.5
# Split strategy; the notebook handles 'none', 'cell', 'drug' and 'both'.
split_by = 'both'

In [3]:
def split_cell_drug(dff):
    """ Split drug and cell features.

    Parameters
    ----------
    dff : pd.DataFrame
        Feature table with string column names. Columns whose name contains
        'DD_' are treated as drug features and columns whose name contains
        'GE_' as cell features.

    Returns
    -------
    dd : pd.DataFrame
        The 'DD_' columns of `dff`.
    ge : pd.DataFrame
        The 'GE_' columns of `dff`.

    The shapes of both frames are printed as a side effect.
    """
    # Read the column names from the argument itself rather than from a
    # notebook-level `df`, so the function works on any frame it is given.
    dd_cols = [c for c in dff.columns if 'DD_' in c]
    ge_cols = [c for c in dff.columns if 'GE_' in c]
    # Selecting with a list of labels returns new DataFrame objects, so the
    # full table does not have to be copied up front.
    dd = dff[dd_cols]
    ge = dff[ge_cols]
    print('\ndd.shape', dd.shape)
    print('ge.shape', ge.shape)
    return dd, ge


def add_lbl_dup(dff, label_name='label', prffx='_'):
    """ Label unique rows. Add column indicating a unique row (label).

    Rows that are identical across all columns receive the same label.

    Parameters
    ----------
    dff : pd.DataFrame
        Table whose rows are to be labelled. It is not modified. The row index
        is used to restore the original row order, so it is expected to hold
        unique labels.
    label_name : str
        Name of the label column that is added.
    prffx : str
        String prepended to the integer id of each distinct row.

    Returns
    -------
    pd.DataFrame
        The rows of `dff` in their original order with `label_name` inserted
        as the first column. Each label is `prffx` followed by a 0-based
        integer id; ids are numbered in the sorted order of the distinct rows.
    """
    # Save the original row indexes in order to re-order rows after processing
    idx_org = dff.index.values

    # Sort rows so that identical rows end up next to each other
    dff = dff.sort_values(by=dff.columns.tolist())

    # True on the first occurrence of every distinct row. Kept as a separate
    # Series (not a helper column) so it cannot clash with columns of `dff`.
    is_first = ~dff.duplicated(keep='first')

    # Running count of first occurrences: every copy of a row shares the
    # 0-based id of the first occurrence that precedes it in sorted order.
    uid = is_first.cumsum() - 1

    labels = uid.map(lambda k: prffx + str(int(k))).to_numpy()
    dff.insert(loc=0, column=label_name, value=labels)

    return dff.reindex(idx_org)  # back to the original row ordering

In [4]:
# Load the dataset and shuffle its rows; the index is rebuilt so it runs 0..n-1
# in the shuffled order.
df = (
    pd.read_parquet('../uniq.top6.reg.parquet', engine='auto', columns=None)
    .sample(frac=1.0, axis=0, random_state=SEED)
    .reset_index(drop=True)
)

# Flag columns holding a single distinct (non-missing) value and keep the rest
col_idx = df.nunique(dropna=True).to_numpy() == 1  # True -> column is dropped
df = df.loc[:, ~col_idx]

print(df.shape)

(283152, 3765)


In [5]:
# Separate drug and cell features, then give every distinct drug / cell
# feature vector its own label ('d<k>' for drugs, 'c<k>' for cells).
dd, ge = split_cell_drug(dff=df)
dlb = add_lbl_dup(dd, label_name='dlb', prffx='d').loc[:, 'dlb']
clb = add_lbl_dup(ge, label_name='clb', prffx='c').loc[:, 'clb']

# Number of distinct drug labels, then number of distinct cell labels
for lbl in (dlb, clb):
    print(lbl.nunique(dropna=False))


dd.shape (283152, 2822)
ge.shape (283152, 942)
1748
748


In [None]:
# Determine split indices based on split method.
#
# Uses the per-row drug/cell labels `dlb` and `clb` computed in the labelling cell
# that precedes this one; they are not recomputed here because labelling is expensive.
#
# NOTE: `id_grp1`/`id_grp2` are arrays of integer row positions for the 'none', 'cell'
# and 'drug' splits, and boolean Series aligned with `df` for the 'both' split.
if split_by == 'none':
    # Random split
    cv = ShuffleSplit(n_splits=2, test_size=test_size, random_state=0)
    id_grp1, id_grp2 = next(cv.split(df))

elif split_by == 'cell':  # Strict split by cell
    cv = GroupShuffleSplit(n_splits=2, test_size=test_size, random_state=0)
    id_grp1, id_grp2 = next(cv.split(df, groups=clb))  # Split indexes

elif split_by == 'drug':  # Strict split by drug
    cv = GroupShuffleSplit(n_splits=2, test_size=test_size, random_state=0)
    id_grp1, id_grp2 = next(cv.split(df, groups=dlb))  # Split indexes

elif split_by == 'both':  # Strict split by both, cell and drug
    # TODO: integrate test_size into this type of split!
    # (cells and drugs are currently each divided in half)

    # Create cross-tab table with cell and drugs
    # (the values indicate the number of data points for each [drug, cell] combination)
    ctb = pd.concat([clb, dlb], axis=1)
    ctb['one'] = 1
    ctb = pd.pivot_table(ctb, index='clb', columns='dlb', values='one', aggfunc='sum', fill_value=0)
    ctb.columns.name = None
    ctb.index.name = None

    # Shuffle both cells and drugs. A single generator seeded with SEED drives both
    # shuffles, so the partition is repeatable whenever SEED is an integer.
    rng = np.random.RandomState(SEED)
    ctb = ctb.sample(frac=1.0, axis=0, random_state=rng)
    ctb = ctb.sample(frac=1.0, axis=1, random_state=rng)

    # Choose range and split data (disjoint sets in the cross-tab table):
    # the first half of the cells with the first half of the drugs, and the
    # remaining cells with the remaining drugs.
    r_mid = round(ctb.shape[0] / 2)
    c_mid = round(ctb.shape[1] / 2)
    t1 = ctb.iloc[:r_mid, :c_mid]
    t2 = ctb.iloc[r_mid:, c_mid:]

    # Get cell and drug labels for each data partition
    c1, d1 = t1.index.values, t1.columns.values
    c2, d2 = t2.index.values, t2.columns.values

    # Split indexes: a row belongs to a partition only if both its drug and its
    # cell belong to it; rows pairing a drug and a cell from different partitions
    # end up in neither group.
    id_grp1 = dlb.isin(d1) & clb.isin(c1)
    id_grp2 = dlb.isin(d2) & clb.isin(c2)

else:
    # Fail here instead of with a NameError on `id_grp1` further down
    raise ValueError(
        "split_by must be one of 'none', 'cell', 'drug' or 'both'; got {!r}".format(split_by))

In [1]:
# Split
# Prepend the cell and drug labels to the feature table. Label columns left behind by
# an earlier run of this cell are dropped first, so running the cell again does not
# produce duplicate 'clb'/'dlb' columns.
df = pd.concat([clb, dlb, df.drop(columns=['clb', 'dlb'], errors='ignore')], axis=1)

# `.loc` handles both selector types (integer arrays or boolean Series). The integer
# arrays are row positions, which coincide with the index labels because the index
# was reset to 0..n-1 after shuffling.
df1 = df.loc[id_grp1, :]
df2 = df.loc[id_grp2, :]

NameError: name 'pd' is not defined

In [None]:
# Count the cell labels and the drug labels that occur in both datasets
cell_intrsc = set(df1['clb']) & set(df2['clb'])
drug_intrsc = set(df1['dlb']) & set(df2['dlb'])
for title, shared in (('Cell intersection:', cell_intrsc),
                      ('Drug intersection:', drug_intrsc)):
    print(title, len(shared))

In [None]:
# Columns kept in the final datasets: the 'AUC1' column followed by every
# column whose name contains 'DD_' and then every column whose name contains 'GE_'
dd_cols = [name for name in df.columns if 'DD_' in name]
ge_cols = [name for name in df.columns if 'GE_' in name]
cols = ['AUC1', *dd_cols, *ge_cols]

In [None]:
# Keep only the selected columns in each dataset and renumber the rows from 0
df1, df2 = (
    part.loc[:, cols].reset_index(drop=True)
    for part in (df1, df2)
)