In [1]:
import sys
import os
import numpy as np
import pandas as pd
from qnorm import quantile_normalize

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import explained_variance_score


In [3]:
#Compute quantile-normalized accuracy metrics for half-life time courses
#
#For every model folder, this cell:
# 1. reads the target table from the fold-0 'acc.txt' and attaches 'source' / 'file' metadata,
# 2. groups recount3 tracks (SRA project + first description token) and keeps groups with more than one track,
# 3. for each fold, quantile-normalizes predictions and targets within each group and subtracts the
#    per-row (gene) mean across the tracks of that group,
# 4. writes per-track Pearson / Spearman / explained-variance scores to 'acc_hl.txt' in the fold's test folder.

model_folders = [
    'models',
]

testg_suffixes = [
    '-0',
]

#zip() silently truncates to the shorter list, so make a folder/suffix mismatch loud
if len(model_folders) != len(testg_suffixes) :
    raise ValueError('model_folders and testg_suffixes must have the same length')

#Load target metadata once (it does not depend on the model folder)
targets_df_metadata = pd.read_csv('data/hg38/targets.txt', sep='\t', index_col=0)[['identifier', 'source', 'file']].drop_duplicates().copy().reset_index(drop=True)
metadata_dict = targets_df_metadata.set_index(['identifier']).to_dict(orient='index')

#Loop over model folders
for model_folder, testg_suffix in zip(model_folders, testg_suffixes) :

    print('-- model_folder = ' + str(model_folder) + ' --')

    #Load targets dataframe (the track list is taken from fold 0)
    targets_df = pd.read_csv(model_folder + '/f0c0/testg' + testg_suffix + '/acc.txt', index_col=0, sep='\t')[['identifier', 'description']]

    #Append metadata (raises KeyError if a track is missing from the metadata table)
    targets_df['source'] = [metadata_dict[identifier]['source'] for identifier in targets_df['identifier']]
    targets_df['file'] = [metadata_dict[identifier]['file'] for identifier in targets_df['identifier']]

    #Create unique group identifiers

    #Default: each stranded track is its own group
    targets_df['group_identifier'] = targets_df['identifier']

    #Create groups for half-life time courses (SRA project + Cell type)
    #The project is the path component following 'recount3/' in the file path; the second part is
    #the first space/underscore-delimited token of the description, after its last ':'
    is_recount3 = targets_df['source'] == 'recount3'
    if is_recount3.any() :
        targets_df.loc[is_recount3, 'group_identifier'] = [
            file_path.split("recount3/")[1].split("/")[0] + ':' + description.replace('_', ' ').split(' ')[0].split(':')[-1]
            for file_path, description in zip(targets_df.loc[is_recount3, 'file'], targets_df.loc[is_recount3, 'description'])
        ]

    #Filter on half-life experiments only (with more than 1 time-point)
    group_sizes = targets_df.groupby('group_identifier')['identifier'].count()
    hl_set = sorted(group_sizes[group_sizes > 1].index.tolist())
    targets_df = targets_df.loc[(targets_df['source'] == 'recount3') & (targets_df['group_identifier'].isin(hl_set))].copy().reset_index(drop=True)

    cols = targets_df['identifier'].values.tolist()
    groups = targets_df['group_identifier'].unique().tolist()
    group_identifiers = targets_df['group_identifier'].values.tolist()

    #Map each group to its track identifiers (in targets_df order)
    #A boolean mask is used instead of a string-built query so group names containing quotes are safe
    group_dict = {}
    for group in groups :
        group_dict[group] = targets_df.loc[targets_df['group_identifier'] == group, 'identifier'].values.tolist()

    print("len(targets_df) = " + str(len(targets_df)))

    #Read predictions and targets for each fold

    fold_index = [0, 1]
    cross_index = [0] #not used below: the cross index is fixed to 'c0' in the paths

    #Not populated in this cell; kept so the notebook namespace is unchanged
    cov_pred_dict = {}
    cov_true_dict = {}
    fold_dict = {}

    #Loop over folds
    for fold_ix in fold_index :

        print("Processing fold = " + str(fold_ix))

        fold_dir = model_folder + "/f" + str(fold_ix) + "c0/testg" + testg_suffix

        #Load predictions
        df_pred = pd.read_csv(fold_dir + "/gene_preds.tsv", sep='\t')
        df_pred = df_pred.rename(columns={'Unnamed: 0' : 'gene_id'})

        #Load targets
        df_true = pd.read_csv(fold_dir + "/gene_targets.tsv", sep='\t')
        df_true = df_true.rename(columns={'Unnamed: 0' : 'gene_id'})

        #Metrics compare the two tables row by row, so they must list the same genes in the same order
        if not df_pred['gene_id'].equals(df_true['gene_id']) :
            raise ValueError('gene_id mismatch between gene_preds.tsv and gene_targets.tsv in ' + fold_dir)

        #Loop over groups and calculate quantile-normalized specificity performances
        #Normalized columns are collected per identifier and assembled once after the loop,
        #which avoids copying the growing dataframes after every inserted column
        pred_norm_cols = {'gene_id' : df_pred['gene_id']}
        true_norm_cols = {'gene_id' : df_true['gene_id']}
        for group in groups :

            group_cols = group_dict[group]

            y_pred = np.array(df_pred[group_cols].values, dtype='float32')
            y_true = np.array(df_true[group_cols].values, dtype='float32')

            #Quantile-normalize and subtract mean (taken across the tracks of the group, per row)
            y_pred_norm = quantile_normalize(y_pred, ncpus=2)
            y_pred_norm = y_pred_norm - y_pred_norm.mean(axis=-1, keepdims=True)
            y_true_norm = quantile_normalize(y_true, ncpus=2)
            y_true_norm = y_true_norm - y_true_norm.mean(axis=-1, keepdims=True)

            for identifier_i, identifier in enumerate(group_cols) :
                pred_norm_cols[identifier] = y_pred_norm[:, identifier_i]
                true_norm_cols[identifier] = y_true_norm[:, identifier_i]

        #Assemble with an explicit column order matching 'cols'; groups need not be contiguous in
        #targets_df, so group order alone would not line up with the identifier labels used below
        df_pred_new = pd.DataFrame(pred_norm_cols, columns=['gene_id'] + cols)
        df_true_new = pd.DataFrame(true_norm_cols, columns=['gene_id'] + cols)

        df_pred = df_pred_new
        df_true = df_true_new

        #Accuracy stats (columns selected by name so that column ti always corresponds to cols[ti])
        gene_preds = np.array(df_pred[cols].values, dtype='float32')
        gene_targets = np.array(df_true[cols].values, dtype='float32')

        acc_npearsonr = []
        acc_nspearmanr = []
        acc_nr2 = []
        for ti in range(len(cols)) :

            nr_ti = pearsonr(gene_targets[:, ti], gene_preds[:, ti])[0]
            acc_npearsonr.append(nr_ti)

            nrs_ti = spearmanr(gene_targets[:, ti], gene_preds[:, ti])[0]
            acc_nspearmanr.append(nrs_ti)

            #Note: stored as 'r2_norm' but computed with explained_variance_score
            nr2_ti = explained_variance_score(gene_targets[:, ti], gene_preds[:, ti])
            acc_nr2.append(nr2_ti)

        acc_df = pd.DataFrame(
            {
                "identifier": cols,
                "group_identifier": group_identifiers,
                "pearsonr_norm": acc_npearsonr,
                "spearmanr_norm": acc_nspearmanr,
                "r2_norm": acc_nr2,
                "description": targets_df['description'].values.tolist(),
            }
        )
        acc_df.to_csv(fold_dir + '/acc_hl.txt', sep="\t")


-- model_folder = models --
len(targets_df) = 235
Processing fold = 0
Processing fold = 1
-- model_folder = models_ft --
len(targets_df) = 235
Processing fold = 0
Processing fold = 1
