In [7]:
#!/home/was966/micromamba/envs/responder/bin/python
#sbatch --mem 64G -c 12 -t 100:00:00 -p priority  ./ctct_run_all.py

import os
from tqdm import tqdm
from itertools import chain
import pandas as pd
import numpy as np
import random, torch
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'white', font_scale=1.3)
import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


import sys
sys.path.insert(0, '/home/was966/Research/mims-conceptor/')
from baseline.immnue_score import immnue_score_methods
from conceptor.utils import plot_embed_with_label,plot_performance, score, score2


def onehot(S):
    """One-hot encode a categorical Series, keeping missing entries as NaN rows.

    Parameters
    ----------
    S : pd.Series
        Categorical values; missing entries (NaN/None) are allowed.

    Returns
    -------
    pd.DataFrame
        Float frame with one column per observed category, indexed like ``S``.
        Rows whose input value was missing are NaN in every column. Columns
        are ordered by descending category count.

    Raises
    ------
    AssertionError
        If ``S`` is not a pandas Series.
    """
    assert isinstance(S, pd.Series), 'Input type should be pd.Series'
    # Cast to float first: recent pandas returns boolean dummy columns, and
    # writing NaN into a boolean column is an incompatible-dtype assignment.
    dfd = pd.get_dummies(S, dummy_na=True).astype(float)
    # Positional mask of the rows that were missing in the input; positional
    # rather than label-based so duplicate index labels are handled row by row.
    missing = dfd[np.nan].to_numpy(dtype=bool)
    dfd = dfd.drop(columns=[np.nan])
    dfd.loc[missing, :] = np.nan
    # Most frequent category first (NaN rows are skipped by sum()).
    cols = dfd.sum().sort_values(ascending=False).index.tolist()
    return dfd[cols]

In [8]:
import sys
sys.path.insert(0, '/home/was966/Research/mims-conceptor/')
from baseline.immnue_score import immnue_score_methods
from conceptor.utils import plot_embed_with_label,plot_performance, score

# Load the patient annotation table and the TPM table from the shared data
# directory. NOTE(review): read_pickle unpickles arbitrary objects, so these
# files must come from a trusted source.
data_path = '../../../../paper/00_data/'
df_label, df_tpm = [
    pd.read_pickle(os.path.join(data_path, table_name))
    for table_name in ('ITRP.PATIENT.TABLE', 'ITRP.TPM.TABLE')
]
# Bare expression kept from the original cell; it is not the last statement,
# so nothing is displayed.
df_tpm.shape, df_label.shape

# One-hot response labels, built from the patient table loaded above.
df_task = onehot(df_label['response_label'])

# Per-cohort patient counts drive both the display labels and the ordering.
cohort_sizes = df_label.groupby('cohort').size()
size = cohort_sizes.index + "\n(n = " + cohort_sizes.astype(str) + ")"
cohorts = cohort_sizes.sort_values().index.tolist()
#cohorts

In [9]:
# Read the three stratification tables (cancer, drug, sequencer), each indexed
# by its first column.
df_c2c, df_d2d, df_s2s = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../stratification/cancer2cancer.csv',
        '../stratification/drug2drug.csv',
        '../stratification/sequencer2sequencer.csv',
    )
]

# From here on df_label and cohorts refer to the stacked stratification tables.
df_label = pd.concat([df_c2c, df_d2d, df_s2s])
cohorts = df_label['stratified_cohort'].unique().tolist()

# stratified cohort name -> its 'stratified_cohort_rgc' value.
cohort_rgc = (
    df_label[['stratified_cohort', 'stratified_cohort_rgc']]
    .drop_duplicates()
    .set_index('stratified_cohort')['stratified_cohort_rgc']
    .to_dict()
)

In [10]:
# stratified cohort name -> ICI target used to configure the feature extractor.
cohort_target_map = (
    df_label[['stratified_cohort', 'ICI_target_map']]
    .drop_duplicates()
    .set_index('stratified_cohort')['ICI_target_map']
    .to_dict()
)
# stratified cohort name -> cancer type used to configure the feature extractor.
cohort_cancer_map = (
    df_label[['stratified_cohort', 'cancer_type']]
    .drop_duplicates()
    .set_index('stratified_cohort')['cancer_type']
    .to_dict()
)

def cohort_to_cohort(cohorts):
    """Pair every cohort with all the other cohorts in the same list.

    Returns a list of ``(cohort, remaining_cohorts)`` tuples, one per element
    of ``cohorts`` and in the same order, where ``remaining_cohorts`` is the
    input with that single element left out.
    """
    pairs = []
    for pos, held_out in enumerate(cohorts):
        others = cohorts[:pos] + cohorts[pos + 1:]
        pairs.append((held_out, others))
    return pairs
# train_test_cohorts = cohort_to_cohort(cohorts)

In [11]:
# Stratified cohorts sharing the same (domain, group) form one transfer set.
transfer_groups = (
    df_label
    .groupby(['domain', 'group'])['stratified_cohort']
    .unique()
    .apply(lambda x: x.tolist())
)

# Three parallel lists: entry i of domains/groups labels the
# (train cohort, test cohorts) pair stored at train_test_cohorts[i].
train_test_cohorts, domains, groups = [], [], []
for (domain, group), transfer_cohots in transfer_groups.items():
    transfer_pairs = cohort_to_cohort(transfer_cohots)
    n_pairs = len(transfer_pairs)
    train_test_cohorts.extend(transfer_pairs)
    domains.extend([domain] * n_pairs)
    groups.extend([group] * n_pairs)

In [None]:
# Cross-cohort transfer benchmark. For every scoring method, fit a logistic
# regression on one stratified cohort and evaluate it on each of the other
# cohorts in the same (domain, group) set, saving per-pair figures plus the
# raw predictions and aggregated metrics as TSV files.
for mode in immnue_score_methods.keys():

    print('Evaluation on Model %s' % mode)

    work_dir = './F2F/F2F_%s' % (mode)
    # exist_ok replaces the separate existence check and tolerates the
    # directory being created concurrently.
    os.makedirs(work_dir, exist_ok=True)

    # The extractor factory depends only on the method, so look it up once.
    Extractor = immnue_score_methods[mode]

    res = []
    for (train_cohort, test_cohorts), domain, group in zip(train_test_cohorts, domains, groups):

        ## Get data for this cohort
        cohort_idx = df_label[df_label['stratified_cohort'] == train_cohort].index
        cohort_X = df_tpm.loc[cohort_idx]
        cohort_y = df_task.loc[cohort_idx]

        ## Get features for specific method
        # The extractor is configured from the TRAIN cohort's cancer type and
        # drug target and is reused unchanged on every test cohort below.
        cohort_target = cohort_target_map[train_cohort]
        cohort_cancer_type = cohort_cancer_map[train_cohort]
        E = Extractor(cancer_type=cohort_cancer_type, drug_target=cohort_target)
        cohort_dfx = E(cohort_X)
        # NOTE(review): df_task rows are NaN where the response label was
        # missing; this assumes the stratified cohorts contain only labelled
        # patients -- confirm.
        cohort_dfy = cohort_y['R']

        # Scaling statistics come from the training cohort only.
        data_scaler = StandardScaler()
        train_X = data_scaler.fit_transform(cohort_dfx)
        train_y = cohort_dfy.values

        # Only C is actually searched; the other entries are fixed settings.
        param_grid = {'penalty':['l2'], 'max_iter':[int(1e10)], 'solver':['lbfgs'],
                      'C':np.arange(0.1, 1, 0.1), 'class_weight':['balanced'] }
        model = LogisticRegression()

        gcv = GridSearchCV(model, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1).fit(train_X, train_y)
        best_C = gcv.best_params_['C']

        for test_cohort in test_cohorts:
            test_cohort_idx = df_label[df_label['stratified_cohort'] == test_cohort].index
            test_cohort_X = df_tpm.loc[test_cohort_idx]
            test_cohort_y = df_task.loc[test_cohort_idx]

            test_cohort_dfx = E(test_cohort_X)
            test_cohort_dfy = test_cohort_y['R']
            test_X = data_scaler.transform(test_cohort_dfx)

            pred_prob = gcv.best_estimator_.predict_proba(test_X)

            # Columns are the integers 0 and 1, one per predict_proba column.
            pred_testy = pd.DataFrame(pred_prob, index = test_cohort_dfy.index)

            pred_testy['domain'] = domain
            pred_testy['group'] = group
            pred_testy['train_cohort'] = train_cohort
            pred_testy['test_cohort'] = test_cohort
            pred_testy['test_cohort_rgc'] = cohort_rgc[test_cohort]

            pred_testy['best_C'] = best_C
            pred_testy['mode'] = mode

            dfp = test_cohort_y.join(pred_testy)

            # Predicted class is whichever probability column is larger.
            y_true, y_prob, y_pred = dfp['R'], dfp[1], dfp[[0, 1]].idxmax(axis=1)
            fig = plot_performance(y_true, y_prob, y_pred)
            fig.suptitle('train: %s, test: %s' % (train_cohort, test_cohort), fontsize=16)
            # '/' in cohort names would be read as a path separator.
            fig.savefig(os.path.join(work_dir, 'CTCT_train_%s_test_%s.jpg' % (train_cohort.replace('/', ':'), test_cohort.replace('/', ':'))))
            # Release the figure once it is on disk; otherwise one open figure
            # accumulates per (mode, train, test) combination.
            plt.close(fig)
            res.append(dfp)

    dfs = pd.concat(res)
    # One metric tuple per (train cohort, test cohort) pair.
    dfp = dfs.groupby(['domain','group','train_cohort', 'test_cohort', 'mode', 'best_C', 'test_cohort_rgc']).apply(lambda x:score2(x['R'], x[1], x[[0, 1]].idxmax(axis=1)))

    #roc, prc, f1, acc, mcc
    # Presumably score2 returns the five metrics in this order -- confirm
    # against conceptor.utils.
    dfp = dfp.apply(pd.Series)
    dfp.columns = ['ROC', 'PRC', 'F1', 'ACC', 'MCC']
    dfp = dfp.reset_index()

    dfs.to_csv(os.path.join(work_dir, 'source_performance.tsv'), sep='\t')
    dfp.to_csv(os.path.join(work_dir, 'metric_performance.tsv'), sep='\t')