In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

from consolidated_runs import run_simulations

from Pearson.pearson import Pearson

sys.path.append(os.getcwd())

Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
import random

def sample_with_proportion(total, targets, proportion, total_sample=10):
    """Draw a shuffled sample containing a fixed share of ``targets``.

    Parameters
    ----------
    total : sequence
        All candidate items.
    targets : sequence
        The "special" items; ``proportion`` of the sample is drawn from these.
    proportion : float
        Fraction of the sample (between 0 and 1) taken from ``targets``.
    total_sample : int, optional
        Size of the returned sample. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``total_sample`` items in random order: ``floor(total_sample * proportion)``
        drawn without replacement from ``targets`` and the rest drawn without
        replacement from the items of ``total`` that are not in ``targets``.

    Raises
    ------
    ValueError
        If ``proportion`` is outside [0, 1], ``total_sample`` is negative, or
        either pool is too small to sample from without replacement (the
        latter is raised by ``np.random.choice``).
    """
    if not 0 <= proportion <= 1:
        raise ValueError(f"proportion must be between 0 and 1, got {proportion}")
    if total_sample < 0:
        raise ValueError(f"total_sample must be non-negative, got {total_sample}")

    # Floor with a tiny tolerance: products such as 100 * 0.29 evaluate to
    # 28.999999999999996 in binary floating point and would otherwise lose one item.
    num_special = int(total_sample * proportion + 1e-9)
    special = np.random.choice(targets, num_special, replace=False)

    # Keep the non-target pool in the order given by ``total`` (deduplicated)
    # instead of relying on set iteration order, so a seeded run is repeatable.
    target_set = set(targets)
    remaining = [item for item in dict.fromkeys(total) if item not in target_set]
    normal = np.random.choice(remaining, total_sample - num_special, replace=False)

    final = np.concatenate([special, normal])
    np.random.shuffle(final)
    return final

In [None]:
# Data set ids 1, 2 and 3 are passed to the simulation driver.
datasets = range(1, 4)

# Stage switches forwarded to run_simulations as keyword arguments:
# the first five are switched on, the remaining five are switched off.
run_simulations(
    datasets,
    **{
        'sergio': True,
        'saucie': True,
        'scScope': True,
        'deepImpute': True,
        'magic': True,
        'genie': False,
        'arboreto': False,
        'pearson': False,
        'roc': False,
        'precision_recall_k': False,
    },
)

In [5]:
# For each data set directory, print the shape of the stored expression array
# and, if it is not there yet, write that shape to 'expr_shape.csv' beside it.
imp_dir = os.path.join(os.getcwd(), 'imputations')
print(imp_dir)
for i in range(1, 4):
    load_dir = os.path.join(imp_dir, f'DS{i}')
    save_name = 'DS6_expr.npy'

    # Load the array once; the shape export below reuses it instead of
    # reading the same file from disk a second time.
    expr = np.load(os.path.join(load_dir, save_name))
    print(expr.shape)

    file_name = 'expr_shape.csv'
    shape_path = os.path.join(load_dir, file_name)
    if not os.path.exists(shape_path):
        shap = expr.shape
        print(shap)
        # One row, one column per array dimension.
        df = pd.DataFrame([shap])
        df.to_csv(shape_path, index=False)

/Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations
(9, 100, 300)
(9, 400, 300)
(9, 1200, 300)


In [None]:
# Root directory holding one 'DS<i>' sub-directory of saved arrays per data set.
imp_dir = os.path.join(os.getcwd(), 'imputations')

def run_pearson_ranking(proportion, n_repeats=50, top_k=10):
    """Score how often top-ranked Pearson pairs are ground-truth edges.

    For each data set (1..3) and each method's saved matrix, the matrix is
    converted to CSV on first use, read back, and passed (transposed) to the
    project's ``Pearson`` helper. Then, ``n_repeats`` times, a gene sample is
    drawn with ``sample_with_proportion``, the correlation rows of the sampled
    genes are flattened into (input, target, correlation) triples, and the
    ``top_k`` highest-correlation pairs are compared with the ground-truth
    pairs. The mean hit fraction is printed per data set and method.

    Parameters
    ----------
    proportion : float
        Passed to ``sample_with_proportion`` as the share of sampled genes
        taken from the first column of the ground-truth file.
    n_repeats : int, optional
        Number of random samples averaged per method (default 50).
    top_k : int, optional
        Number of top-ranked pairs inspected per sample (default 10).

    Raises
    ------
    ValueError
        If ``n_repeats`` or ``top_k`` is smaller than 1.
    """
    if n_repeats < 1 or top_k < 1:
        raise ValueError("n_repeats and top_k must both be at least 1")

    print("Running Pearson ranking tests with proportion", proportion)

    # Ground-truth edge list per data set id.
    gt_files = {
        1: './SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/gt_GRN.csv',
        2: './SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/gt_GRN.csv',
        3: 'SERGIO/data_sets/De-noised_1200G_9T_300cPerT_6_DS3/gt_GRN.csv',
    }
    # File stem per method: '<stem>.npy' is the saved array, '<stem>.csv' its CSV copy.
    method_stems = {
        'Clean': 'DS6_clean',
        'Noisy': 'DS6_45',
        'SAUCIE': 'yhat_SAUCIE',
        'scScope': 'yhat_scScope',
        'DeepImpute': 'yhat_deepImpute',
        'MAGIC': 'yhat_MAGIC_t_auto',
    }

    for i, gt_file in gt_files.items():
        load_dir = os.path.join(imp_dir, f'DS{i}')

        # The ground truth depends only on the data set, so it is read once
        # here instead of once per method and once per repeat.
        gt = pd.read_csv(gt_file, header=None)
        confirmed_genes = gt[0].unique()
        true_pairs = set(tuple(x) for x in gt.values)

        for method, stem in method_stems.items():
            csv_path = os.path.join(load_dir, stem + '.csv')
            if not os.path.exists(csv_path):
                # First use: convert the saved array to CSV so it can be read back below.
                npfile = np.load(os.path.join(load_dir, stem + '.npy'))
                pd.DataFrame(npfile).to_csv(csv_path, index=False)
            expr_df = pd.read_csv(csv_path)

            # Pearson is a project helper; its result is used here as a square
            # frame through .values and .columns. The diagonal is zeroed so a
            # gene paired with itself cannot rank at the top.
            pearson = Pearson(np.transpose(expr_df), '')
            p_values = pearson.values
            np.fill_diagonal(p_values, 0)
            pearson = pd.DataFrame(p_values, index=pearson.columns, columns=pearson.columns)

            genes = expr_df.index.tolist()
            total = 0
            for _ in range(n_repeats):
                sampled = sample_with_proportion(genes, confirmed_genes, proportion)

                # Keep only the correlation rows of the sampled genes and
                # flatten them into one (input, target, correlation) row per pair.
                p_subset = pearson.loc[sampled]
                melted = pd.melt(p_subset.reset_index(), id_vars=['index'], value_vars=p_subset.columns)
                melted.columns = ['input', 'target', 'correlation']

                # NOTE(review): pairs are ranked by the signed correlation, not its
                # absolute value, so strongly negative pairs never reach the top -
                # confirm this is intended.
                top_rows = melted.sort_values(by='correlation', ascending=False).head(top_k)
                top_pairs = [
                    (regulator, target)
                    for regulator, target, _corr in top_rows.itertuples(index=False, name=None)
                ]

                # Fraction of the inspected pairs that are ground-truth edges.
                matches = sum(1 for pair in top_pairs if pair in true_pairs)
                total += matches / len(top_pairs)

            print(f"DS{i}", method, total / n_repeats)

In [48]:
# Repeat the ranking test for each sampling proportion, in increasing order.
for _proportion in (0.1, 0.2, 0.3):
    run_pearson_ranking(_proportion)

Running Pearson ranking tests with proportion 0.1
DS1 Clean 0.17999999999999997
DS1 Noisy 0.042
DS1 SAUCIE 0.0
DS1 scScope 0.0
DS1 DeepImpute 0.0
DS1 MAGIC 0.0
DS2 Clean 0.18799999999999997
DS2 Noisy 0.002
DS2 SAUCIE 0.0
DS2 scScope 0.002
DS2 DeepImpute 0.002
DS2 MAGIC 0.0
DS3 Clean 0.16999999999999996
DS3 Noisy 0.0
DS3 SAUCIE 0.004
DS3 scScope 0.0
DS3 DeepImpute 0.002
DS3 MAGIC 0.0
Running Pearson ranking tests with proportion 0.2
DS1 Clean 0.374
DS1 Noisy 0.05199999999999999
DS1 SAUCIE 0.004
DS1 scScope 0.0
DS1 DeepImpute 0.0
DS1 MAGIC 0.0
DS2 Clean 0.468
DS2 Noisy 0.006000000000000001
DS2 SAUCIE 0.0
DS2 scScope 0.0
DS2 DeepImpute 0.006000000000000001
DS2 MAGIC 0.0
DS3 Clean 0.31600000000000006
DS3 Noisy 0.0
DS3 SAUCIE 0.004
DS3 scScope 0.002
DS3 DeepImpute 0.002
DS3 MAGIC 0.002
Running Pearson ranking tests with proportion 0.3
DS1 Clean 0.57
DS1 Noisy 0.06
DS1 SAUCIE 0.002
DS1 scScope 0.0
DS1 DeepImpute 0.0
DS1 MAGIC 0.0
DS2 Clean 0.496
DS2 Noisy 0.02
DS2 SAUCIE 0.0
DS2 scScope 0.00

In [3]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import random

def _gene_key(label):
    """Map a gene label to a canonical key: an int for integer-valued labels, else a str."""
    try:
        as_float = float(label)
    except (TypeError, ValueError):
        return str(label)
    if as_float.is_integer():
        return int(as_float)
    return str(label)


def run_pearson_edge_sampling(proportion, threshold=0.5, n_repeats=50):
    """Estimate the share of sampled gene pairs that are confirmed, correlated edges.

    For every data set (1..3) and method, the file
    ``imputations/DS<i>/DS<i>_<method>.csv`` is read with its columns treated
    as genes. Missing files are reported and skipped. ``n_repeats`` times, a
    random ``proportion`` of all ordered pairs of distinct genes is drawn, and
    the fraction of drawn pairs that are both listed in the ground-truth file
    and have ``abs(pearson r) > threshold`` is averaged and printed.

    Parameters
    ----------
    proportion : float
        Fraction of all ordered gene pairs drawn in each repeat.
    threshold : float, optional
        Minimum absolute Pearson correlation for a confirmed edge to count
        (default 0.5).
    n_repeats : int, optional
        Number of random draws averaged per file (default 50).

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    print("Running Pearson edge sampling tests with proportion", proportion)
    imp_dir = os.path.join(os.getcwd(), 'imputations')

    # Ground-truth edge list per data set id.
    gt_files = {
        1: './SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/gt_GRN.csv',
        2: './SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/gt_GRN.csv',
        3: 'SERGIO/data_sets/De-noised_1200G_9T_300cPerT_6_DS3/gt_GRN.csv',
    }
    methods = ['Clean', 'Noisy', 'SAUCIE', 'scScope', 'DeepImpute', 'MAGIC']

    for i, gt_file in gt_files.items():
        load_dir = os.path.join(imp_dir, f'DS{i}')
        # Read lazily so a data set with no method files never touches its ground truth.
        confirmed_edges = None

        for method in methods:
            file_name = f'DS{i}_{method}.csv'  # Adjusted for simplicity
            data_file_path = os.path.join(load_dir, file_name)

            if not os.path.exists(data_file_path):
                print(f"File {data_file_path} does not exist. Skipping.")
                continue

            clean_df = pd.read_csv(data_file_path)
            if confirmed_edges is None:
                gt = pd.read_csv(gt_file, header=None)
                # read_csv yields string column labels ('0', '1', ...) while the
                # header-less ground truth yields numeric ids; normalising both
                # sides makes membership tests able to succeed.
                confirmed_edges = {tuple(_gene_key(v) for v in row) for row in gt.values}

            # Canonical gene key -> actual column label in clean_df.
            column_of = {_gene_key(col): col for col in clean_df.columns}
            genes = list(column_of)
            all_possible_edges = [(src, dst) for src in genes for dst in genes if src != dst]
            num_edges_to_sample = int(len(all_possible_edges) * proportion)
            if num_edges_to_sample == 0:
                # Nothing to draw: report it instead of dividing by zero below.
                print(f"DS{i} {method}: no edges to sample at proportion {proportion}. Skipping.")
                continue

            # Only confirmed edges can ever count as a match, so the correlation
            # is computed once per confirmed edge rather than for every sampled
            # edge in every repeat.
            significant_edges = set()
            for edge in confirmed_edges:
                if len(edge) != 2:
                    continue
                gene1, gene2 = edge
                if gene1 == gene2 or gene1 not in column_of or gene2 not in column_of:
                    continue
                corr, _ = pearsonr(clean_df[column_of[gene1]], clean_df[column_of[gene2]])
                if abs(corr) > threshold:
                    significant_edges.add(edge)

            total = 0
            for _ in range(n_repeats):
                sampled_edges = random.sample(all_possible_edges, num_edges_to_sample)
                matches = sum(1 for edge in sampled_edges if edge in significant_edges)
                total += matches / len(sampled_edges)

            print(f"DS{i} {method}: Proportion of significant confirmed edges = {total / n_repeats}")

# Demo run: draw one tenth of all ordered gene pairs in each repeat.
proportion = 0.1
run_pearson_edge_sampling(proportion=proportion)


Running Pearson edge sampling tests with proportion 0.1
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS1/DS1_Clean.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS1/DS1_Noisy.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS1/DS1_SAUCIE.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS1/DS1_scScope.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS1/DS1_DeepImpute.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS1/DS1_MAGIC.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS2/DS2_Clean.csv does not exist. Skipping.
File /Users/joshuaweiner/Desktop/Folders/Projects/zero_imputation/imputations/DS2/

In [5]:
from parallel_utils import process_iteration
import concurrent
from concurrent.futures import ProcessPoolExecutor

def edge_finding_experiment():
    """Rank how strongly one chosen regulator/target pair shifts Pearson correlations.

    For data sets 2 and 3, a baseline correlation frame is built from the
    saved 'DS6_clean.npy' matrix. Fifty ``process_iteration`` jobs are then
    run in a pool of four worker processes. Each job returns a correlation
    frame, a chosen (regulator, target) pair, a temporary target-file path and
    a file-name suffix. For each finished job, the absolute difference between
    the baseline and the returned correlations in the regulator's row is
    sorted in descending order; the position of the chosen target in that
    ordering and the difference at that position are recorded and running
    means are printed. The job's temporary files are deleted afterwards.
    """
    print("Running edge finding experiment")

    # (interaction file, master-regulator file) per data set id. Data set 1 is
    # configured but not run by the loop below.
    # NOTE(review): confirm that skipping data set 1 is intentional.
    sergio_inputs = {
        1: ('./SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/Interaction_cID_4.txt',
            './SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/Regs_cID_4.txt'),
        2: ('./SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/Interaction_cID_5.txt',
            './SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/Regs_cID_5.txt'),
        3: ('./SERGIO/data_sets/De-noised_1200G_9T_300cPerT_6_DS3/Interaction_cID_6.txt',
            './SERGIO/data_sets/De-noised_1200G_9T_300cPerT_6_DS3/Regs_cID_6.txt'),
    }
    imp_dir = os.path.join(os.getcwd(), 'imputations')

    for i in range(2, 4):
        target_file, regs_path = sergio_inputs[i]
        load_dir = os.path.join(imp_dir, f'DS{i}')

        reg_df = pd.read_csv(regs_path, header=None)
        master_regs = [int(m) for m in reg_df[0].values]

        # Baseline correlations from the unmodified clean matrix, with the
        # diagonal zeroed. Built before any job is submitted.
        clean_df = pd.DataFrame(np.load(os.path.join(load_dir, "DS6_clean.npy")))
        baseline = Pearson(np.transpose(clean_df), '')
        p_values = baseline.values
        np.fill_diagonal(p_values, 0)
        true_pearson = pd.DataFrame(p_values, index=baseline.columns, columns=baseline.columns)

        ranks = []
        correlations = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            # Iterations 1..50; each job receives its own '_iter<n>' file suffix.
            futures = [
                executor.submit(
                    process_iteration, iteration, target_file, regs_path,
                    master_regs, load_dir, imp_dir, i, f'_iter{iteration}',
                )
                for iteration in tqdm(range(1, 51))
            ]

            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
                pearson, chosen_pair, temp_target, f_ext = future.result()
                regulator_row = pearson.loc[chosen_pair[0]]
                true_regulator_row = true_pearson.loc[chosen_pair[0]]

                # Largest deviation from the baseline first.
                abs_diff = np.abs(true_regulator_row - regulator_row)
                sorted_row = abs_diff.sort_values(ascending=False)

                # Position of the chosen target in that ordering (0 = largest change).
                rank_target = sorted_row.index.get_loc(chosen_pair[1])
                ranks.append(rank_target)
                rank_value = sorted_row.iloc[rank_target]
                correlations.append(rank_value)

                print(chosen_pair[0], chosen_pair[1], rank_target, rank_value)
                # Report the number of finished jobs; the submit-loop counter is
                # stale here and always held its final value.
                print("Current iteration:", len(ranks), f"Mean rankings for DS{i} added edge:", np.mean(ranks), np.mean(correlations))

                # Delete the per-iteration files: the temporary target file and
                # the four arrays carrying this job's suffix (presumably written
                # by process_iteration - verify against parallel_utils).
                os.remove(temp_target)
                for stem in ("DS6_clean", "DS6_clean_counts", "DS6_noisy", "DS6_expr"):
                    os.remove(os.path.join(load_dir, f"{stem}{f_ext}.npy"))

        # Summary is labelled with the data set id, not the iteration counter.
        print(f"Mean rankings for DS{i} added edge:", np.mean(ranks), np.mean(correlations))


In [None]:
# Run the edge-finding experiment; it prints its progress and summary statistics.
edge_finding_experiment()