In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

from consolidated_runs import run_simulations

from Pearson.pearson import Pearson

sys.path.append(os.getcwd())

In [5]:
import random

def sample_with_proportion(total, targets, proportion):
    total_sample = 10
    num_special = int(total_sample * proportion)
    special = np.random.choice(targets, num_special, replace=False)

    remaining = list(set(total) - set(targets))
    normal = np.random.choice(remaining, total_sample - num_special, replace=False)

    final = np.concatenate([special, normal])
    np.random.shuffle(final)
    return final

In [None]:
datasets = range(1, 4) # DS3
run_simulations(datasets,
            sergio=True,
            saucie=True, 
            scScope=True, 
            deepImpute=True, 
            magic=True, 
            genie=False,
            arboreto=False,
            pearson=False,
            roc=False,
            precision_recall_k=False
        )

In [19]:
imp_dir = os.path.join(os.getcwd(), 'imputations')

def run_pearson_ranking(proportion):
    print("Running Pearson ranking tests with proportion", proportion)
    for i in range(1, 4):
        gt_file = None
        if i == 1:
            gt_file = './SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/gt_GRN.csv'
        elif i == 2:
            gt_file = './SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/gt_GRN.csv'
        elif i == 3:
            gt_file = 'SERGIO/data_sets/De-noised_1200G_9T_300cPerT_6_DS3/gt_GRN.csv'
        
        # For now, just load clean data, save as csv, and load it back in
        methods = ['Clean', 'Noisy', 'SAUCIE', 'scScope', 'DeepImpute', 'MAGIC']
        file_name = ''
        save_name = ''
        for method in methods:
            if method == 'Clean':
                file_name = f'DS6_clean.npy'
                save_name = f'DS6_clean.csv'
            elif method == 'Noisy':
                file_name = f'DS6_45.npy'
                save_name = f'DS6_45.csv'
            elif method == 'SAUCIE':
                file_name = f'yhat_SAUCIE.npy'
                save_name = f'yhat_SAUCIE.csv'
            elif method == 'scScope':
                file_name = f'yhat_scScope.npy'
                save_name = f'yhat_scScope.csv'
            elif method == 'DeepImpute':
                file_name = f'yhat_deepImpute.npy'
                save_name = f'yhat_deepImpute.csv'
            elif method == 'MAGIC':
                file_name = f'yhat_MAGIC_t_auto.npy'
                save_name = f'yhat_MAGIC_t_auto.csv'
            load_dir = os.path.join(imp_dir, f'DS{i}')
            if not os.path.exists(os.path.join(imp_dir, f'DS{i}', save_name)):
                npfile = np.load(load_dir + '/' + file_name)
                df = pd.DataFrame(npfile)
                df.to_csv(load_dir + '/' + save_name, index=False)
            clean_df = pd.read_csv(os.path.join(load_dir, save_name))
            
            # Read ground truth
            gt = pd.read_csv(gt_file, header=None)
            confirmed_genes = gt[0].unique()

            # Run Pearson on clean and noisy data
            pearson = Pearson(np.transpose(clean_df), '')
            p_values = pearson.values
            np.fill_diagonal(p_values, 0)
            pearson = pd.DataFrame(p_values, index=pearson.columns, columns=pearson.columns)

            total = 0
            for r in range(50):
                sampled = sample_with_proportion(clean_df.index.tolist(), confirmed_genes, proportion)
            # Take subset of pearson data with only the sampled genes
                p_subset = pearson.loc[sampled]
            #print(p_subset)
            # Rank the expression pairs by the absolute value of the pearson correlation
                melted = pd.melt(p_subset.reset_index(), id_vars=['index'], value_vars=p_subset.columns)
                melted.columns = ['input', 'target', 'correlation']
                melted_sorted = melted.sort_values(by='correlation', ascending=False)
                ranked_list = list(melted_sorted.itertuples(index=False, name=None))[:10]
            # Calculate the proportion of confirmed genes in the top k of the ranked pairs
                top_10 = [(input, target) for input, target, _ in ranked_list]
                true_pairs = set(tuple(x) for x in gt.values)
                matches = sum(1 for pair in top_10 if pair in true_pairs)
                prop = matches / len(top_10)
                total += prop

            print(f"DS{i}", method, total / 50)

In [20]:
run_pearson_ranking(0.1)
run_pearson_ranking(0.2)
run_pearson_ranking(0.3)

Running Pearson ranking tests with proportion 0.1
DS1 Clean 0.196
DS1 Noisy 0.004
DS1 SAUCIE 0.002
DS1 scScope 0.0
DS1 DeepImpute 0.0
DS1 MAGIC 0.0
DS2 Clean 0.308
DS2 Noisy 0.0
DS2 SAUCIE 0.0
DS2 scScope 0.0
DS2 DeepImpute 0.0
DS2 MAGIC 0.0
DS3 Clean 0.14799999999999996
DS3 Noisy 0.004
DS3 SAUCIE 0.0
DS3 scScope 0.0
DS3 DeepImpute 0.0
DS3 MAGIC 0.002
Running Pearson ranking tests with proportion 0.2
DS1 Clean 0.44000000000000017
DS1 Noisy 0.04
DS1 SAUCIE 0.004
DS1 scScope 0.0
DS1 DeepImpute 0.0
DS1 MAGIC 0.0
DS2 Clean 0.4000000000000001
DS2 Noisy 0.002
DS2 SAUCIE 0.0
DS2 scScope 0.002
DS2 DeepImpute 0.0
DS2 MAGIC 0.0
DS3 Clean 0.3459999999999999
DS3 Noisy 0.004
DS3 SAUCIE 0.0
DS3 scScope 0.0
DS3 DeepImpute 0.006000000000000001
DS3 MAGIC 0.0
Running Pearson ranking tests with proportion 0.3
DS1 Clean 0.4660000000000001
DS1 Noisy 0.09
DS1 SAUCIE 0.006000000000000001
DS1 scScope 0.0
DS1 DeepImpute 0.0
DS1 MAGIC 0.0
DS2 Clean 0.41600000000000004
DS2 Noisy 0.006000000000000001
DS2 SAUCIE 0