<a id='home'></a>
### purpose

create random sets of SNP files for 3 reps from each of 225 simulation seeds

### outline

1. [get simulation seeds](#seeds)

get the list of SNP files for each set of replicates; these files are subset in the next step (N = 500, 5 000, 10 000, and 20 000 loci)

1. [select random sets of loci from each seed](#random)

randomly select N loci (N = 500, 5 000, 10 000, and 20 000) from each SNP file and write each subset to a new file

In [1]:
# The unqualified helpers used throughout this notebook (dt, op, fs, makedir, get_client,
# pbar, watch_async, wrap_defaultdict, unwrap_dictionary, formatclock, session_info, ...)
# are not imported anywhere else, so they are expected to come from this star import.
from pythonimports import *

import MVP_summary_functions as mvp

t0 = dt.now()  # notebook timer

# Connect to an already-running cluster identified by cluster_id/profile.
# NOTE(review): lview/dview are presumably ipyparallel load-balanced and direct views, and
# the cluster_id looks specific to one cluster session - confirm and update before re-running.
lview, dview = get_client(cluster_id='1705931649-lrg1', profile='lotterhos')

# Record provenance: the cell output shows the current commits and the session environment.
mvp.latest_commit()
session_info.show()

36 36
#########################################################
Today:	January 22, 2024 - 16:09:32 EST
python version: 3.8.5
conda env: mvp_env

Current commit of [1mpythonimports[0m:
[33mcommit 419895d157c97717f835390196c13cf973d25eba[m  
Merge: e20434f 1e09b6c  
Author: Brandon Lind <lind.brandon.m@gmail.com>

Current commit of [94m[1mMVP_offsets[0m[0m:
[33mcommit c5bc403582e5bafc1036be8cd2a4bb0b4d154623[m  
Author: Brandon Lind <lind.brandon.m@gmail.com>  
Date:   Mon Dec 18 14:38:19 2023 -0500
#########################################################



<a id ='seeds'></a>
# get seeds

get a list of SNP files representing 3 reps of each simulation level (225 simulation levels total)

[top](#home)

In [2]:
# Training-file directories for the three replicate runs; the paths differ only in the
# run name, so they are built from one template.
src_dirs = [
    f'/work/lotterhos/MVP-Offsets/{run}/gradient_forests/training/training_files'
    for run in (
        'run_20220919_0-225',
        'run_20220919_225-450',
        'run_20220919_450-675',
    )
]

In [3]:
# Map each replicate-set name (basename of the run directory) to the source SNP files
# found in that run's training directory, and report how many were found.
src_files = defaultdict(list)
for d in src_dirs:
    run_dir = d.partition('/gradient_fore')[0]  # everything before the gradient_forests part
    rep = op.basename(run_dir)
    matches = fs(d, endswith='ind_all.txt')
    src_files[rep] = matches
    print(rep, len(matches))

run_20220919_0-225 225
run_20220919_225-450 225
run_20220919_450-675 225


<a id='random'></a>
# select random sets of loci from each seed

[top](#home)

In [4]:
# create directories
# Top-level output directory for the subsetted SNP files.
# NOTE(review): makedir presumably creates the directory if missing and returns its path - confirm.
# NOTE(review): `outerdir` is not referenced by the other cells shown here; subset_SNPs
# writes beneath this same path string on its own.

outerdir = makedir('/work/lotterhos/brandon/ind_runtimes')

In [5]:
def subset_SNPs(rep, f, outdir='/work/lotterhos/brandon/ind_runtimes',
                set_nums=('00500', '05000', '10000', '20000'), seed=None):
    """From a file containing all loci, draw random subsets of loci and write each to a new file.

    One output file is written per entry of `set_nums`, at
    `{outdir}/{rep}/{set_num}/gradient_forests/training/training_files/{basename of f}`.
    Each subset is drawn independently (subsets are not nested). The output is tab-delimited
    with the selected loci as columns followed by a final `index` column holding the row labels.

    Parameters
    ----------
    rep : str
        - the name for the set of replicates - eg run_20220919_0-225 or run_20220919_225-450
    f : file path
        - path to the GF-ready input file for MVP_gf_training_script.R containing all loci;
          must be tab-delimited and contain a column named `index`
    outdir : str
        - root directory beneath which the subset files are written
    set_nums : iterable of str
        - number of loci per subset, as (zero-padded) strings; each string is also used
          verbatim as a directory name
    seed : int or None
        - seed for the random draw; pass a value to make the selected loci reproducible.
          None (default) seeds from system entropy, so each call draws different loci

    Returns
    -------
    rep : str
        same as input argument
    dsts : collections.defaultdict(list)
        key = set_num (number of random loci), value = list (of length 1) of file paths to new SNP file

    Notes
    -----
    If a subset size exceeds the number of loci in `f`, all loci are written and a
    warning is issued (the output directory is still named after the requested size).
    """
    # imports are local because the function is shipped to remote engines
    import os
    import random
    import warnings
    from collections import defaultdict

    import pandas as pd

    df = pd.read_table(f, index_col='index')
    loci = df.columns.tolist()

    # private generator: reproducible when seeded, and leaves the global `random` state untouched
    rng = random.Random(seed)

    basename = os.path.basename(f)

    dsts = defaultdict(list)
    for set_num in set_nums:
        n_requested = int(set_num)
        n_loci = min(n_requested, len(loci))
        if n_loci < n_requested:
            warnings.warn(
                f'{f} has only {len(loci)} loci; writing all of them for set_num = {set_num}'
            )

        # random selection without replacement, in random column order
        loci_df = df[rng.sample(loci, n_loci)].copy()

        # carry the row labels along as an ordinary (last) column
        loci_df['index'] = loci_df.index.tolist()

        dst_dir = f'{outdir}/{rep}/{set_num}/gradient_forests/training/training_files'
        # exist_ok: many parallel jobs target the same directory, so creation must tolerate
        # another job having created it first
        os.makedirs(dst_dir, exist_ok=True)

        dst = f'{dst_dir}/{basename}'
        loci_df.to_csv(dst, index=False, header=True, sep='\t')

        dsts[set_num].append(dst)

    return rep, dsts

In [6]:
# Submit one subsetting job per source SNP file to the cluster view.
jobs = []
for rep, files in src_files.items():
    for f in files:
        jobs.append(lview.apply_async(subset_SNPs, rep, f))

# NOTE(review): watch_async presumably blocks until all jobs finish while showing the
# progress bar seen in the cell output - confirm in pythonimports.
watch_async(jobs)

[1m
Watching 675 jobs ...[0m


100%|███████████████| 675/675 [07:30<00:00,  1.50it/s]


In [7]:
# Gather the per-job results into one nested mapping:
# dsts[rep][set_num] -> list of destination file paths across all jobs.
dsts = wrap_defaultdict(list, 2)
for j in pbar(jobs):
    rep, dst_files = j.r  # each job returns (rep, {set_num: [path]})
    for set_num in dst_files:
        files = dst_files[set_num]
        dsts[rep][set_num] += files

100%|███████████████| 675/675 [00:00<00:00, 46855.59it/s]


In [8]:
# Report the number of files written for each replicate set and subset size.
for rep, by_set in dsts.items():
    for set_num, files in by_set.items():
        print(rep, set_num, len(files))

run_20220919_0-225 00500 225
run_20220919_0-225 05000 225
run_20220919_0-225 10000 225
run_20220919_0-225 20000 225
run_20220919_225-450 00500 225
run_20220919_225-450 05000 225
run_20220919_225-450 10000 225
run_20220919_225-450 20000 225
run_20220919_450-675 00500 225
run_20220919_450-675 05000 225
run_20220919_450-675 10000 225
run_20220919_450-675 20000 225


In [9]:
# Elapsed time since `t0` was set in the first cell, formatted for display.
formatclock(dt.now() - t0)

'0-00:07:32'

In [10]:
# Sanity check: every destination path returned by the jobs must exist on disk.
# unwrap_dictionary(dsts) is consumed here as ((rep, set_num), files) pairs.
for (rep, set_num), files in unwrap_dictionary(dsts):
    for f in pbar(files):
        assert op.exists(f)
    
# Display the last path checked as an example; this relies on `f` leaking from the loop above.
f

100%|███████████████| 225/225 [00:00<00:00, 1160.21it/s]
100%|███████████████| 225/225 [00:00<00:00, 1046.78it/s]
100%|███████████████| 225/225 [00:00<00:00, 955.15it/s] 
100%|███████████████| 225/225 [00:00<00:00, 789.08it/s]
100%|███████████████| 225/225 [00:00<00:00, 887.07it/s]
100%|███████████████| 225/225 [00:00<00:00, 906.58it/s]
100%|███████████████| 225/225 [00:00<00:00, 865.13it/s]
100%|███████████████| 225/225 [00:00<00:00, 1075.84it/s]
100%|███████████████| 225/225 [00:00<00:00, 1416.64it/s]
100%|███████████████| 225/225 [00:00<00:00, 1282.78it/s]
100%|███████████████| 225/225 [00:00<00:00, 1533.16it/s]
100%|███████████████| 225/225 [00:00<00:00, 1307.89it/s]


'/work/lotterhos/brandon/ind_runtimes/run_20220919_450-675/20000/gradient_forests/training/training_files/1231768_Rout_Gmat_sample_maf-gt-p01_GFready_ind_all.txt'