In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os, sys

import parallel_utils

In [13]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import random
from time import time
import concurrent
from concurrent.futures import ProcessPoolExecutor
from consolidated_runs import run_sergio

def new_expected_value_experiment(dataset_id, last_iteration=0, fixed_edge=False, add_edge=True, multiple_edges=False, clean='clean', max_iteration=50, max_workers=4):
    """Run the mean-difference worker for a range of iterations on one SERGIO dataset.

    One ``parallel_utils.new_mean_process_iteration`` task is submitted to a
    process pool for every iteration in ``[last_iteration, max_iteration]``
    except iteration 0, which is always skipped.

    Parameters
    ----------
    dataset_id : int
        Which data set to use; must be 1, 2 or 3.
    last_iteration : int, default 0
        First iteration index to run (inclusive).
    fixed_edge : bool, default False
        Accepted for backward compatibility; it has no effect on the
        returned data.
    add_edge, multiple_edges, clean
        Forwarded unchanged to the worker.
    max_iteration : int, default 50
        Last iteration index to run (inclusive).
    max_workers : int, default 4
        Size of the process pool.

    Returns
    -------
    dict
        ``{'outlier': ..., 'clean': ...}`` holding the two values returned by
        the worker. NOTE: only the result of the task that *completes last*
        is kept; results of earlier tasks are overwritten. The dict is empty
        when no iteration was scheduled.

    Raises
    ------
    ValueError
        If ``dataset_id`` is not one of the supported data sets.
    """
    # dataset_id -> (data directory, cID suffix of the Interaction/Regs files)
    dataset_dirs = {
        1: ('./SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1', 4),
        2: ('./SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2', 5),
        3: ('./SERGIO/data_sets/De-noised_1200G_9T_300cPerT_6_DS3', 6),
    }
    if dataset_id not in dataset_dirs:
        raise ValueError(
            f"Unsupported dataset_id {dataset_id!r}; expected one of {sorted(dataset_dirs)}"
        )
    data_dir, cid = dataset_dirs[dataset_id]
    target_file = f'{data_dir}/Interaction_cID_{cid}.txt'
    regs_path = f'{data_dir}/Regs_cID_{cid}.txt'

    # The first column of the regs file lists the master regulator gene ids.
    reg_df = pd.read_csv(regs_path, header=None)
    master_regs = [int(m) for m in reg_df[0].values]

    imp_dir = os.path.join(os.getcwd(), 'imputations')
    load_dir = os.path.join(imp_dir, f'DS{dataset_id}')

    # Make sure the experiment output directory exists.
    experiment_dir = os.path.join(os.getcwd(), f'experiments/mean_diffs/DS{dataset_id}')
    os.makedirs(experiment_dir, exist_ok=True)

    # NOTE: per-iteration rank bookkeeping and temp-file cleanup are currently
    # disabled; this function only collects the worker's expression data.
    expression_data = {}
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                parallel_utils.new_mean_process_iteration,
                iteration, target_file, regs_path, master_regs, load_dir,
                add_edge, multiple_edges, imp_dir, dataset_id,
                f"_iter{iteration}", clean,
            )
            for iteration in range(last_iteration, max_iteration + 1)
            if iteration != 0  # iteration 0 is never scheduled
        ]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            outlier_data, clean_data = future.result()
            # Each completed task overwrites the previous one (see docstring).
            expression_data['outlier'] = outlier_data
            expression_data['clean'] = clean_data

    return expression_data

In [15]:
# Dataset 1, starting at iteration 50; clean='noisy' is forwarded to the worker unchanged.
expr_data = new_expected_value_experiment(
    dataset_id=1,
    last_iteration=50,
    fixed_edge=False,
    add_edge=True,
    multiple_edges=False,
    clean='noisy',
)


Instructions for updating:
non-resource variables are not supported in the long term
100%|██████████| 1/1 [00:59<00:00, 59.12s/it]

[array([[ 7.84347711,  5.78539982,  5.80987028, ...,  9.96206991,
         2.93807874,  2.78190142],
       [ 9.72084081,  7.10428275,  7.59982298, ...,  9.11452937,
         6.88983018,  3.24929217],
       [ 3.0376921 ,  8.26604346,  7.93135829, ...,  6.80055262,
         5.91573095,  0.14497627],
       ...,
       [ 1.93831437,  3.04892539,  3.24838684, ...,  3.25929844,
         4.66533143,  5.07182495],
       [ 7.41574606, 10.24888311, 10.99711575, ..., 10.22679923,
        11.47417608,  5.31580104],
       [ 0.52933615,  1.12641495,  3.70405288, ...,  6.00343536,
         5.48598163,  6.38136609]]), array([[ 3.06307467,  0.93689649,  3.55288582, ...,  4.36567648,
         2.20811767,  0.57059189],
       [ 3.12200844,  5.08498604,  3.96626846, ...,  3.69881905,
         1.7007656 ,  6.09078086],
       [ 2.04053385,  1.38981799,  0.35962964, ...,  0.31887798,
         3.17280706,  2.55038907],
       ...,
       [ 4.68449581,  2.09981183,  2.4011877 , ...,  3.45229956,
        


