this notebook best viewed here: https://nbviewer.jupyter.org/github/brandonlind/BURT/blob/master/002_trim_and_map.ipynb

In [1]:
from pythonimports import *

def qsub(shfiles:list, sleep=0, printing=False) -> list:
    """From a list of .sh shfiles, sbatch them and return associated jobid in a list."""
    
    if isinstance(shfiles, list) is False:
        assert isinstance(shfiles, str)
        shfiles = [shfiles]
    pids = []
    failcount = 0
    for sh in nb(shfiles):
        os.chdir(os.path.dirname(sh))
        try:
            pid = subprocess.check_output([shutil.which('qsub'), sh]).decode('utf-8').replace("\n", "").split()[2]
        except subprocess.CalledProcessError as e:
            failcount += 1
            if failcount == 10:
                print('!!!REACHED FAILCOUNT LIMIT OF 10!!!')
                return pids
            continue
        if printing is True:
            print('sbatched %s' % sh)
        pids.append(pid)
        time.sleep(sleep)
    return pids

In [2]:
DIR = op.realpath('/home/lindb/eckertlab/BURT')
DIR

'/gpfs_fs/home/eckertlab/BURT'

In [3]:
tandmdir = makedir(op.join(DIR, '002_trim_and_map'))

In [4]:
tandmdir

'/gpfs_fs/home/eckertlab/BURT/002_trim_and_map'

# trim and map to de novo assembly

- dDocent was used to create an de novo assembly from trimmed reads
- dDocent documentation refers to trimmomatic, but uses fastp. So I'll re-trim and map. I can also handle read groups during mapping this way, dDocent does not account for different read groups across samples.

#### create datatable.txt

- datatable.txt is used as input to start the varscan_pipeline - github.com/brandonlind/varscan_pipeline
- I am using the first few steps of this pipeline to trim, map, and mark duplicates __only__
- I've updated the pipeline to handle SGE scheduler (it was slurm-only before)

In [5]:
denovodir = makedir(op.join(tandmdir, 'denovo_mapping'))

In [6]:
fqs = pd.read_table('/gpfs_fs/home/eckertlab/BURT/seq/burt_fastq_data.txt')
fqs.head()

Unnamed: 0,fastq,sample_name,species,state,ind,popn,spp_popn,run,lib,seq_center,md5,RGPL,RGSM,RGLB,RGID,flowcell_lane,RGPU,num_reads
0,/home/cfriedline/eckertlab/projects/burt/seq/1...,T-AR-2-7,T,AR,7,T-AR-2,AR-2,160520,Burt0,NARF,1ae81d40ba5b1497e7ba9f5733c494b4,ILLUMINA,T-AR-2-7,Burt0,T-AR-2-7.160520.Burt0,C7WRGACXX.1,C7WRGACXX.1.T-AR-2-7,1277586
1,/home/cfriedline/eckertlab/projects/burt/seq/1...,T-TX-1-19,T,TX,19,T-TX-1,TX-1,160520,Burt0,NARF,eb991e8800c31b632dfc0f4279e9e44a,ILLUMINA,T-TX-1-19,Burt0,T-TX-1-19.160520.Burt0,C7WRGACXX.1,C7WRGACXX.1.T-TX-1-19,2798860
2,/home/cfriedline/eckertlab/projects/burt/seq/1...,G-SC-1-3,G,SC,3,G-SC-1,SC-1,160520,Burt0,NARF,e5cf36460b50c6467a49f4b157bb16c1,ILLUMINA,G-SC-1-3,Burt0,G-SC-1-3.160520.Burt0,C7WRGACXX.1,C7WRGACXX.1.G-SC-1-3,2312577
3,/home/cfriedline/eckertlab/projects/burt/seq/1...,P-LA-2-10,P,LA,10,P-LA-2,LA-2,160520,Burt0,NARF,a1fab3ce615d708f53eb836f13fbba7c,ILLUMINA,P-LA-2-10,Burt0,P-LA-2-10.160520.Burt0,C7WRGACXX.1,C7WRGACXX.1.P-LA-2-10,1583884
4,/home/cfriedline/eckertlab/projects/burt/seq/1...,T-AL-1-2,T,AL,2,T-AL-1,AL-1,160520,Burt0,NARF,16efa4d02874e6f5cf747e40bdbfb0dd,ILLUMINA,T-AL-1-2,Burt0,T-AL-1-2.160520.Burt0,C7WRGACXX.1,C7WRGACXX.1.T-AL-1-2,1321261


In [7]:
fqs['fastq'].tolist()

['/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/T-AR-2-7.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/T-TX-1-19.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/G-SC-1-3.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/P-LA-2-10.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/T-AL-1-2.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/P-LA-1-7.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/P-AL-2-5.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/P-LA-1-2.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/T-TX-1-21.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/P-GA-2-1.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/P-GA-2-12.R1.fastq.gz',
 '/home/cfriedline/eckertlab/projects/burt/seq/160520/Burt0/E-FL-1-14.R1.fastq.gz',

In [8]:
# get unique basenames names for all fastqs
bcounts = Counter()
bnames = {}
for fastq in fqs['fastq']:
    bname = op.basename(fastq).replace(".R1.fastq.gz", "_R1.fastq.gz")
    bcounts[bname] += 1
    if not bname in list(bnames.values()):
        bnames[fastq] = bname
    else:
        bname = bname.replace("R1.fastq.gz", "%s_R1.fastq.gz" % bcounts[bname])
    bnames[fastq] = bname
len(bnames), luni(bnames.values())

(1709, 1709)

In [9]:
# read in datatable.txt template, create blank template
dt = pd.read_table('/home/lindb/g/varscan_pipeline/datatable.txt')
dt = pd.DataFrame(columns=dt.columns)
display(dt)  # display blank df to show col names

for line in nb(fqs.index):
    row = nrow(dt)
    dt.loc[row, 'sample_name'] = fqs.loc[line, 'sample_name']
    dt.loc[row, 'library_name'] = fqs.loc[line, 'lib']
    dt.loc[row, 'file_name_r1'] = bnames[fqs.loc[line, 'fastq']]
    dt.loc[row, 'rgid'] = fqs.loc[line, 'RGID']
    dt.loc[row, 'rglb'] = fqs.loc[line, 'RGLB']
    dt.loc[row, 'rgpu'] = fqs.loc[line, 'RGPU']
    dt.loc[row, 'rgsm'] = dt.loc[row, 'sample_name']
dt['pool_name'] = 'BURT'
dt['ploidy'] = 2
dt['file_name_r2'] = np.nan
dt['adaptor_1'] = np.nan
dt['adaptor_2'] = np.nan
dt['ref'] = '/home/lindb/eckertlab/BURT/refs/dDocent_denovo_assembly/reference_pseudo.fasta'
dt['rgpl'] = 'ILLUMINA'
dt.head()

Unnamed: 0,sample_name,library_name,pool_name,ploidy,file_name_r1,file_name_r2,adaptor_1,adaptor_2,ref,rgid,rglb,rgpl,rgpu,rgsm


100%|██████████| 1709/1709 [00:17<00:00, 95.75it/s]


Unnamed: 0,sample_name,library_name,pool_name,ploidy,file_name_r1,file_name_r2,adaptor_1,adaptor_2,ref,rgid,rglb,rgpl,rgpu,rgsm
0,T-AR-2-7,Burt0,BURT,2,T-AR-2-7_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,T-AR-2-7.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.T-AR-2-7,T-AR-2-7
1,T-TX-1-19,Burt0,BURT,2,T-TX-1-19_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,T-TX-1-19.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.T-TX-1-19,T-TX-1-19
2,G-SC-1-3,Burt0,BURT,2,G-SC-1-3_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,G-SC-1-3.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.G-SC-1-3,G-SC-1-3
3,P-LA-2-10,Burt0,BURT,2,P-LA-2-10_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,P-LA-2-10.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.P-LA-2-10,P-LA-2-10
4,T-AL-1-2,Burt0,BURT,2,T-AL-1-2_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,T-AL-1-2.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.T-AL-1-2,T-AL-1-2


In [10]:
# look at an example with multiple files
dt[dt['sample_name']=='G-NC-1-4']

Unnamed: 0,sample_name,library_name,pool_name,ploidy,file_name_r1,file_name_r2,adaptor_1,adaptor_2,ref,rgid,rglb,rgpl,rgpu,rgsm
613,G-NC-1-4,Burt6,BURT,2,G-NC-1-4_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,G-NC-1-4.160520.Burt6,Burt6,ILLUMINA,C7WRGACXX.7.G-NC-1-4,G-NC-1-4
1093,G-NC-1-4,Burt6,BURT,2,G-NC-1-4_2_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,G-NC-1-4.160525.Burt6,Burt6,ILLUMINA,C7WP4ACXX.7.G-NC-1-4,G-NC-1-4
1694,G-NC-1-4,BURT_13,BURT,2,G-NC-1-4_3_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,G-NC-1-4.Novogene1.BURT_13,BURT_13,ILLUMINA,HKFKHBBXX.5.G-NC-1-4,G-NC-1-4


In [11]:
denovodir

'/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping'

In [12]:
dt.to_csv(op.join(denovodir, 'datatable.txt'), sep='\t', index=False)

In [13]:
# make symlinks to parentdir
for src in nb(fqs['fastq']):
    dst = op.join(denovodir, bnames[src])
    os.symlink(src, dst)

100%|██████████| 1709/1709 [00:00<00:00, 689767.66it/s]


In [None]:
# run 00_start but comment out the qsub command in 01_trim.py

In [14]:
pklload(op.join(denovodir, 'pipeline_start_command.pkl'))

Namespace(email='lindb@vcu.edu', email_options=['fail'], maf=None, paralogs=False, parentdir='/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping', repeats=False, sge=True, translate=True)

# chain up the .sh files

godel only allows 200 job submissions, so I'm going to create chains of jobs so that once one job finishes mapping it submits a trim job.

#### first run each python script to create the sh files for the next step (through marking duplicates)

I went into the pipeline and commented out the part where it submits the jobs so that I could run each python script

In [16]:
# get the trimming shfiles to create the mapping shfiles
shdir = op.join(denovodir, 'BURT/shfiles/01_trimmed_shfiles')
trimshfiles = fs(shdir, endswith='.sh')
len(trimshfiles)

1135

In [49]:
# write python cmds to a file, cat this file to GNU parallel in terminal window
catfile = op.join(denovodir, 'BURT/01_catfile.txt')
with open(catfile, 'w') as o:
    cmds = []
    for sh in trimshfiles:
        text = read(sh)
        cmds.append(text[-3])
    o.write("%s" % '\n'.join(cmds))
cmds[0]

'python $HOME/pipeline/02_bwa-map_view_sort_index_flagstat.py /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping E-AL-1-1'

In [50]:
catfile

'/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/01_catfile.txt'

In [51]:
# cat catfile in terminal

In [52]:
cmds[-1]

'python $HOME/pipeline/02_bwa-map_view_sort_index_flagstat.py /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping T-VA-4-9'

In [56]:
# get the mapping shfiles to create the mark-dups shfiles
shdir = op.join(denovodir, 'BURT/shfiles/02_bwa_shfiles')
mapshfiles = fs(shdir, endswith='.sh')
len(mapshfiles)

1135

In [57]:
read(mapshfiles[0])[-2]

'#python $HOME/pipeline/03_mark_build.py /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT E-AL-1-1'

In [58]:
# write python cmds to a file, cat this file to GNU parallel in terminal window
catfile = op.join(denovodir, 'BURT/02_catfile.txt')
with open(catfile, 'w') as o:
    cmds = []
    for sh in mapshfiles:
        text = read(sh)
        cmds.append(text[-2][1:])
    o.write("%s" % '\n'.join(cmds))
cmds[0]

'python $HOME/pipeline/03_mark_build.py /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT E-AL-1-1'

#### now go through and submit new trim jobs after marking dups finishes in another job

In [59]:
dupdir = op.join(denovodir, 'BURT/shfiles/03_mark_build_shfiles')
dupshfiles = fs(dupdir, endswith='.sh')
len(dupshfiles)

1135

In [60]:
# identify chains of jobs so that once one job finishes (trim + map) it submits the next trim job in the chain
chains = {}
lencount = 1
itercount = 0
for sh in nb(dupshfiles):
    if len(chains) < 150:
        chains[sh] = []
    else:
        for shstart in chains:
            if len(chains[shstart]) < lencount:
                sh = sh.replace("/03_mark_build_shfiles/", "/01_trimmed_shfiles/").replace("-mark.sh", "-trim.sh")
                assert op.exists(sh)
                chains[shstart].append(sh)
                itercount += 1
                if itercount % 150 == 0 and itercount > 0:
                    itercount = 0
                    lencount += 1
                break

100%|██████████| 1135/1135 [00:01<00:00, 911.10it/s]


In [61]:
sh

'/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-VA-4-9-trim.sh'

In [62]:
# make sure it makes sense
allsh = []
count = len(chains)
for sh,lst in chains.items():
    count += len(lst)
    allsh.append(sh)
    allsh.extend(lst)
count, len(dupshfiles), len(allsh), luni(allsh)

(1135, 1135, 1135, 1135)

In [63]:
# look at the first chain of job submissions
k0 = keys(chains)[0]
chains[k0]

['/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-E-SC-1-9-trim.sh',
 '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-G-VA-1-11-trim.sh',
 '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-P-AL-5-3-trim.sh',
 '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-P-LA-3-12-trim.sh',
 '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-AL-3-6-trim.sh',
 '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-GA-3-10-trim.sh',
 '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-TX-1-3-trim.sh']

In [64]:
# add in commands to qsub the next shfile once the last shfile is finished mapping
count = 0
for sh,lst in chains.items():
    lastsh = sh
    for nextsh in lst:
        if count == 0:
            print(op.basename(lastsh), ' will submit ', op.basename(nextsh))  # for visualization
        # get the text from the sh file, add in qsub commands
        text = read(lastsh)
        if '# submit next job' not in text:
            text.append('# submit next job')
            text.append('cd %s' % op.dirname(nextsh))
            text.append('qsub %s' % op.basename(nextsh))
        # write the text back to file
        with open(lastsh, 'w') as o:
            o.write("%s" % '\n'.join(text))
        # replace lastsh by changing nextsh from trim to mark
        lastsh = nextsh.replace("/01_trimmed_shfiles/", "/03_mark_build_shfiles/").replace("-trim.sh", "-mark.sh")
    count += 1

BURT-E-AL-1-1-mark.sh  will submit  BURT-E-SC-1-9-trim.sh
BURT-E-SC-1-9-mark.sh  will submit  BURT-G-VA-1-11-trim.sh
BURT-G-VA-1-11-mark.sh  will submit  BURT-P-AL-5-3-trim.sh
BURT-P-AL-5-3-mark.sh  will submit  BURT-P-LA-3-12-trim.sh
BURT-P-LA-3-12-mark.sh  will submit  BURT-T-AL-3-6-trim.sh
BURT-T-AL-3-6-mark.sh  will submit  BURT-T-GA-3-10-trim.sh
BURT-T-GA-3-10-mark.sh  will submit  BURT-T-TX-1-3-trim.sh


In [65]:
# go back through trim and mapping jobs and replace python script command with qsub command
# so that the python scripts don't overwrite the changes I've made to the sh files
dirs = ['/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles',
        '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/02_bwa_shfiles']
for d in dirs:
    print(d)
    shfiles = fs(d, endswith='.sh')
    # fore each sh file, comment python cmd and add qsub command for next phase
    for sh in nb(shfiles):
        text = read(sh)
        for i,line in enumerate(text):
            if line.startswith('python'):
                text[i] = "#%s" % line
        if '01_trimmed_shfiles' in d:
            nextsh = sh.replace('/01_trimmed_shfiles/', '/02_bwa_shfiles/').replace('-trim.sh', '-bwa.sh')
        elif '02_bwa_shfiles' in d:
            nextsh = sh.replace('/02_bwa_shfiles/', '/03_mark_build_shfiles/').replace('-bwa.sh', '-mark.sh')
        else:
            print('continuing')
            continue
        assert op.exists(nextsh)
        text.extend(['', '# submit next newest job', 'cd %s' % op.dirname(nextsh)])
        text.append('qsub %s' % op.basename(nextsh))
        with open(sh, 'w') as o:
            o.write("%s" % '\n'.join(text))

  2%|▏         | 27/1135 [00:00<00:04, 263.73it/s]

/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles


100%|██████████| 1135/1135 [00:03<00:00, 330.48it/s]
  3%|▎         | 36/1135 [00:00<00:03, 358.64it/s]

/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/02_bwa_shfiles


100%|██████████| 1135/1135 [00:03<00:00, 343.32it/s]


In [66]:
text

['#!/bin/bash',
 '#$ -cwd',
 '#$ -j y',
 '#$ -N BURT-T-VA-4-9-bwa',
 '#$ -l mem_free=55000M',
 '#$ -pe smp_2 32',
 '#$ -S /bin/bash',
 '#$ -M lindb@vcu.edu',
 '#$ -m a',
 '',
 '# get RGID and RGPU',
 'RGID=T-VA-4-9.Novogene1.BURT_12',
 'RGPU=T-VA-4-9.Novogene1.BURT_12',
 '',
 '# map, sam to bam, sort by coordinate, index',
 '',
 'bwa mem -t 32 -M -R "@RG\\tID:$RGID\\tSM:T-VA-4-9\\tPL:ILLUMINA\\tLB:BURT_12\\tPU:$RGPU" /home/lindb/eckertlab/BURT/refs/dDocent_denovo_assembly/reference_pseudo.fasta /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/01_trimmed/T-VA-4-9_R1_trimmed.fastq.gz  > /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/02a_samfiles/T-VA-4-9_R1R2_trimmed.sam',
 '',
 '',
 '',
 'samtools view -@ 32 -q 20 -F 0x0004  -Sb /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/02a_samfiles/T-VA-4-9_R1R2_trimmed.sam > /gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/02b_bamfiles/T-VA-4-9_R1R2_trimmed.bam',
 'samtools s

In [67]:
len(chains)

150

In [69]:
# add in source ~/.bash_profile before bash commands in shfiles - because godel is stupid
dirs = ['/home/lindb/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles',
        '/home/lindb/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/02_bwa_shfiles',
        '/home/lindb/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/03_mark_build_shfiles']
for i,d in enumerate(dirs):
    print(d)
    shfiles = fs(d, endswith='.sh')
    for sh in nb(shfiles):
        text = read(sh)
        text.insert(10, 'source $HOME/.bash_profile')
        with open(sh, 'w') as o:
            o.write("%s" % '\n'.join(text))
text

  8%|▊         | 91/1135 [00:00<00:01, 904.66it/s]

/home/lindb/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/01_trimmed_shfiles


100%|██████████| 1135/1135 [00:01<00:00, 743.23it/s]
  7%|▋         | 80/1135 [00:00<00:01, 796.57it/s]

/home/lindb/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/02_bwa_shfiles


100%|██████████| 1135/1135 [00:01<00:00, 808.77it/s]
  6%|▌         | 66/1135 [00:00<00:01, 655.15it/s]

/home/lindb/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/shfiles/03_mark_build_shfiles


100%|██████████| 1135/1135 [00:01<00:00, 568.70it/s]


['#!/bin/bash',
 '#$ -cwd',
 '#$ -j y',
 '#$ -l mem_free=30000M',
 '#$ -N BURT-T-VA-4-9-mark',
 '#$ -S /bin/bash',
 '#$ -M lindb@vcu.edu',
 '#$ -m a',
 '',
 '# remove dups',
 'source $HOME/.bash_profile',
 '',
 '',
 'export _JAVA_OPTIONS="-Xms256m -Xmx27g"',
 'picard MarkDuplicates I=/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/02c_sorted_bamfiles/T-VA-4-9_R1R2_trimmed_sorted.bam O=/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/T-VA-4-9_rd.bam MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 M=/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/T-VA-4-9_rd_dupstat.txt REMOVE_DUPLICATES=true',
 '',
 '# Build bam index for GATK',
 'java -jar $EBROOTPICARD/picard.jar BuildBamIndex I=/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/T-VA-4-9_rd.bam',
 '',
 '',
 '# get more dup stats',
 '',
 'samto

In [71]:
??qsub

In [73]:
# submit the sh files that kick off the rest of the commands
pids = []
for sh in chains.keys():
    sh = sh.replace('/03_mark_build_shfiles/', '/01_trimmed_shfiles/').replace("-mark.sh", "-trim.sh")
    assert op.exists(sh)
    pids.extend(qsub(sh))

100%|██████████| 1/1 [00:00<00:00,  4.63it/s]
100%|██████████| 1/1 [00:00<00:00, 35.89it/s]
100%|██████████| 1/1 [00:00<00:00, 31.85it/s]
100%|██████████| 1/1 [00:00<00:00, 35.82it/s]
100%|██████████| 1/1 [00:00<00:00, 22.72it/s]
100%|██████████| 1/1 [00:00<00:00, 24.21it/s]
100%|██████████| 1/1 [00:00<00:00, 35.48it/s]
100%|██████████| 1/1 [00:00<00:00, 28.12it/s]
100%|██████████| 1/1 [00:00<00:00, 35.84it/s]
100%|██████████| 1/1 [00:00<00:00, 27.60it/s]
100%|██████████| 1/1 [00:00<00:00, 34.97it/s]
100%|██████████| 1/1 [00:00<00:00, 27.75it/s]
100%|██████████| 1/1 [00:00<00:00, 37.37it/s]
100%|██████████| 1/1 [00:00<00:00, 28.33it/s]
100%|██████████| 1/1 [00:00<00:00, 34.83it/s]
100%|██████████| 1/1 [00:00<00:00, 28.92it/s]
100%|██████████| 1/1 [00:00<00:00, 36.93it/s]
100%|██████████| 1/1 [00:00<00:00, 28.54it/s]
100%|██████████| 1/1 [00:00<00:00, 38.76it/s]
100%|██████████| 1/1 [00:00<00:00, 30.90it/s]
100%|██████████| 1/1 [00:00<00:00, 35.76it/s]
100%|██████████| 1/1 [00:00<00:00,

In [74]:
pids

['2703115',
 '2703116',
 '2703117',
 '2703118',
 '2703119',
 '2703120',
 '2703121',
 '2703122',
 '2703123',
 '2703124',
 '2703125',
 '2703126',
 '2703127',
 '2703128',
 '2703129',
 '2703130',
 '2703131',
 '2703132',
 '2703133',
 '2703134',
 '2703135',
 '2703136',
 '2703137',
 '2703138',
 '2703139',
 '2703140',
 '2703141',
 '2703142',
 '2703143',
 '2703144',
 '2703145',
 '2703146',
 '2703147',
 '2703148',
 '2703149',
 '2703150',
 '2703151',
 '2703152',
 '2703153',
 '2703154',
 '2703155',
 '2703156',
 '2703157',
 '2703158',
 '2703159',
 '2703160',
 '2703161',
 '2703162',
 '2703163',
 '2703164',
 '2703165',
 '2703166',
 '2703167',
 '2703168',
 '2703169',
 '2703170',
 '2703171',
 '2703172',
 '2703173',
 '2703174',
 '2703175',
 '2703176',
 '2703177',
 '2703178',
 '2703179',
 '2703180',
 '2703181',
 '2703182',
 '2703183',
 '2703184',
 '2703185',
 '2703186',
 '2703187',
 '2703188',
 '2703189',
 '2703190',
 '2703191',
 '2703192',
 '2703193',
 '2703194',
 '2703195',
 '2703196',
 '2703197',
 '27

# <center> trim and map to loblolly reference genome </center>

In [4]:
# read in the previous table used to map to de novo assembly
DIR = '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping'
tablefile = op.join(op.dirname(DIR), 'denovo_mapping/datatable.txt')
dt = pd.read_table(tablefile)
dt.head()

Unnamed: 0,sample_name,library_name,pool_name,ploidy,file_name_r1,file_name_r2,adaptor_1,adaptor_2,ref,rgid,rglb,rgpl,rgpu,rgsm
0,T-AR-2-7,Burt0,BURT,2,T-AR-2-7_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,T-AR-2-7.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.T-AR-2-7,T-AR-2-7
1,T-TX-1-19,Burt0,BURT,2,T-TX-1-19_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,T-TX-1-19.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.T-TX-1-19,T-TX-1-19
2,G-SC-1-3,Burt0,BURT,2,G-SC-1-3_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,G-SC-1-3.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.G-SC-1-3,G-SC-1-3
3,P-LA-2-10,Burt0,BURT,2,P-LA-2-10_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,P-LA-2-10.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.P-LA-2-10,P-LA-2-10
4,T-AL-1-2,Burt0,BURT,2,T-AL-1-2_R1.fastq.gz,,,,/home/lindb/eckertlab/BURT/refs/dDocent_denovo...,T-AL-1-2.160520.Burt0,Burt0,ILLUMINA,C7WRGACXX.1.T-AL-1-2,T-AL-1-2


In [5]:
# change the reference and save
dt['ref'] = '/home/lindb/eckertlab/BURT/refs/lobololly_stitched_ref/Pita2_stitched_2/pita2_stitch_v2.fa'
dt.to_csv(tablefile.replace("denovo_mapping", "loblolly_mapping"), sep='\t', index=False)

In [6]:
# get symlinks to fastqfiles
oldfastq = {}
for f in fs(op.join(op.dirname(DIR), 'denovo_mapping'), endswith='fastq.gz'):
    oldfastq[op.realpath(f)] = op.basename(f)
print(len(oldfastq))
# put symlinks in new dir
for src,bname in oldfastq.items():
    dst = op.join(DIR, bname)
    os.symlink(src, dst)

1709


In [7]:
# match command from de novo run
pklload(op.join(op.dirname(DIR), 'denovo_mapping/pipeline_start_command.pkl'))

Namespace(email='lindb@vcu.edu', email_options=['fail'], maf=None, paralogs=False, parentdir='/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/denovo_mapping', repeats=False, sge=True, translate=True)

```bash
# run in command line
python 00_start-pipeline.py -p /home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping -e lindb@vcu.edu -n fail --sge --translate
```

# chain up the .sh files

godel only allows 200 job submissions, so I'm going to create chains of jobs so that once one job finishes mapping it submits a trim job.

#### first run each python script to create the sh files for the next step (through marking duplicates)

I went into the pipeline and commented out the part where it submits the jobs so that I could run each python script

In [10]:
lobdir = DIR
lobdir

'/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping'

In [11]:
# get the trimming shfiles to create the mapping shfiles
shdir = op.join(lobdir, 'BURT/shfiles/01_trimmed_shfiles')
trimshfiles = fs(shdir, endswith='.sh')
len(trimshfiles)

1135

In [12]:
# write python cmds to a file, cat this file to GNU parallel in terminal window
catfile = op.join(lobdir, 'BURT/01_catfile.txt')
with open(catfile, 'w') as o:
    cmds = []
    for sh in trimshfiles:
        text = read(sh)
        cmds.append(text[-3])
    o.write("%s" % '\n'.join(cmds))
cmds[0]

'python $HOME/pipeline/02_bwa-map_view_sort_index_flagstat.py /home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping E-AL-1-1'

In [13]:
catfile

'/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/01_catfile.txt'

In [14]:
# cat catfile in terminal

In [15]:
# get the mapping shfiles to create the mark-dups shfiles
shdir = op.join(lobdir, 'BURT/shfiles/02_bwa_shfiles')
mapshfiles = fs(shdir, endswith='.sh')
len(mapshfiles)

1135

In [16]:
# write python cmds to a file, cat this file to GNU parallel in terminal window
catfile = op.join(lobdir, 'BURT/02_catfile.txt')
with open(catfile, 'w') as o:
    cmds = []
    for sh in mapshfiles:
        text = read(sh)
        cmds.append(text[-2][1:])
    o.write("%s" % '\n'.join(cmds))
cmds[0]

'python $HOME/pipeline/03_mark_build.py /home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT E-AL-1-1'

#### now go through and submit new trim jobs after marking dups finishes in another job

In [17]:
dupdir = op.join(lobdir, 'BURT/shfiles/03_mark_build_shfiles')
dupshfiles = fs(dupdir, endswith='.sh')
len(dupshfiles)

1135

In [18]:
# identify chains of jobs so that once one job finishes (trim + map) it submits the next trim job in the chain
chains = {}
lencount = 1
itercount = 0
for sh in nb(dupshfiles):
    if len(chains) < 150:
        chains[sh] = []
    else:
        for shstart in chains:
            if len(chains[shstart]) < lencount:
                sh = sh.replace("/03_mark_build_shfiles/", "/01_trimmed_shfiles/").replace("-mark.sh", "-trim.sh")
                assert op.exists(sh)
                chains[shstart].append(sh)
                itercount += 1
                if itercount % 150 == 0 and itercount > 0:
                    itercount = 0
                    lencount += 1
                break

100%|██████████| 1135/1135 [00:01<00:00, 797.20it/s]


In [19]:
len(keys(chains))

150

In [20]:
# make sure it makes sense
allsh = []
count = len(chains)
for sh,lst in chains.items():
    count += len(lst)
    allsh.append(sh)
    allsh.extend(lst)
count, len(dupshfiles), len(allsh), luni(allsh)

(1135, 1135, 1135, 1135)

In [21]:
# look at the first chain of job submissions
k0 = keys(chains)[0]
chains[k0]

['/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-E-SC-1-9-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-G-VA-1-11-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-P-AL-5-3-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-P-LA-3-12-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-AL-3-6-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-GA-3-10-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-TX-1-3-trim.sh']

In [22]:
# add in commands to qsub the next shfile once the last shfile is finished mapping
count = 0
for sh,lst in chains.items():
    lastsh = sh
    for nextsh in lst:
        if count == 0:
            print(op.basename(lastsh), ' will submit ', op.basename(nextsh))  # for visualization
        # get the text from the sh file, add in qsub commands
        text = read(lastsh)
        if '# submit next job' not in text:
            text.append('# submit next job')
            text.append('cd %s' % op.dirname(nextsh))
            text.append('qsub %s' % op.basename(nextsh))
        # write the text back to file
        with open(lastsh, 'w') as o:
            o.write("%s" % '\n'.join(text))
        # replace lastsh by changing nextsh from trim to mark
        lastsh = nextsh.replace("/01_trimmed_shfiles/", "/03_mark_build_shfiles/").replace("-trim.sh", "-mark.sh")
    count += 1

BURT-E-AL-1-1-mark.sh  will submit  BURT-E-SC-1-9-trim.sh
BURT-E-SC-1-9-mark.sh  will submit  BURT-G-VA-1-11-trim.sh
BURT-G-VA-1-11-mark.sh  will submit  BURT-P-AL-5-3-trim.sh
BURT-P-AL-5-3-mark.sh  will submit  BURT-P-LA-3-12-trim.sh
BURT-P-LA-3-12-mark.sh  will submit  BURT-T-AL-3-6-trim.sh
BURT-T-AL-3-6-mark.sh  will submit  BURT-T-GA-3-10-trim.sh
BURT-T-GA-3-10-mark.sh  will submit  BURT-T-TX-1-3-trim.sh


In [23]:
# go back through trim and mapping jobs and replace python script command with qsub command
# so that the python scripts don't overwrite the changes I've made to the sh files
dirs = ['/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles',
        '/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/02_bwa_shfiles']
for d in dirs:
    print(d)
    shfiles = fs(d, endswith='.sh')
    # fore each sh file, comment python cmd and add qsub command for next phase
    for sh in nb(shfiles):
        text = read(sh)
        for i,line in enumerate(text):
            if line.startswith('python'):
                text[i] = "#%s" % line
        if '01_trimmed_shfiles' in d:
            nextsh = sh.replace('/01_trimmed_shfiles/', '/02_bwa_shfiles/').replace('-trim.sh', '-bwa.sh')
        elif '02_bwa_shfiles' in d:
            nextsh = sh.replace('/02_bwa_shfiles/', '/03_mark_build_shfiles/').replace('-bwa.sh', '-mark.sh')
        else:
            print('continuing')
            continue
        assert op.exists(nextsh)
        text.extend(['', '# submit next newest job', 'cd %s' % op.dirname(nextsh)])
        text.append('qsub %s' % op.basename(nextsh))
        with open(sh, 'w') as o:
            o.write("%s" % '\n'.join(text))

  0%|          | 4/1135 [00:00<00:29, 38.95it/s]

/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles


100%|██████████| 1135/1135 [00:04<00:00, 250.72it/s]
  3%|▎         | 38/1135 [00:00<00:02, 374.78it/s]

/gpfs_fs/home/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/02_bwa_shfiles


100%|██████████| 1135/1135 [00:03<00:00, 337.16it/s]


In [24]:
# add in source ~/.bash_profile before bash commands in shfiles - because godel is stupid
dirs = ['/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles',
        '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/02_bwa_shfiles',
        '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/03_mark_build_shfiles']
for i,d in enumerate(dirs):
    print(d)
    shfiles = fs(d, endswith='.sh')
    for sh in nb(shfiles):
        text = read(sh)
        text.insert(10, 'source $HOME/.bash_profile')
        with open(sh, 'w') as o:
            o.write("%s" % '\n'.join(text))
text

  7%|▋         | 82/1135 [00:00<00:01, 819.65it/s]

/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles


100%|██████████| 1135/1135 [00:01<00:00, 760.29it/s]
  9%|▊         | 97/1135 [00:00<00:01, 960.90it/s]

/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/02_bwa_shfiles


100%|██████████| 1135/1135 [00:01<00:00, 877.24it/s]
  7%|▋         | 78/1135 [00:00<00:01, 772.00it/s]

/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/03_mark_build_shfiles


100%|██████████| 1135/1135 [00:02<00:00, 495.14it/s]


['#!/bin/bash',
 '#$ -cwd',
 '#$ -j y',
 '#$ -l mem_free=30000M',
 '#$ -N BURT-T-VA-4-9-mark',
 '#$ -S /bin/bash',
 '#$ -M lindb@vcu.edu',
 '#$ -m a',
 '',
 '# remove dups',
 'source $HOME/.bash_profile',
 '',
 '',
 'export _JAVA_OPTIONS="-Xms256m -Xmx27g"',
 'picard MarkDuplicates I=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/02c_sorted_bamfiles/T-VA-4-9_R1R2_trimmed_sorted.bam O=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/T-VA-4-9_rd.bam MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 M=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/T-VA-4-9_rd_dupstat.txt REMOVE_DUPLICATES=true',
 '',
 '# Build bam index for GATK',
 'java -jar $EBROOTPICARD/picard.jar BuildBamIndex I=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/T-VA-4-9_rd.bam',
 '',
 '',
 '# get more dup stats',
 '',
 'samto

In [25]:
read(shfiles[0])

['#!/bin/bash',
 '#$ -cwd',
 '#$ -j y',
 '#$ -l mem_free=30000M',
 '#$ -N BURT-E-AL-1-1-mark',
 '#$ -S /bin/bash',
 '#$ -M lindb@vcu.edu',
 '#$ -m a',
 '',
 '# remove dups',
 'source $HOME/.bash_profile',
 '',
 '',
 'export _JAVA_OPTIONS="-Xms256m -Xmx27g"',
 'picard MarkDuplicates I=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/02c_sorted_bamfiles/E-AL-1-1_R1R2_trimmed_sorted.bam O=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/E-AL-1-1_rd.bam MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 M=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/E-AL-1-1_rd_dupstat.txt REMOVE_DUPLICATES=true',
 '',
 '# Build bam index for GATK',
 'java -jar $EBROOTPICARD/picard.jar BuildBamIndex I=/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/E-AL-1-1_rd.bam',
 '',
 '',
 '# get more dup stats',
 '',
 'samto

In [27]:
k0 = keys(chains)[0]
print(k0)
chains[k0]

/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/03_mark_build_shfiles/BURT-E-AL-1-1-mark.sh


['/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-E-SC-1-9-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-G-VA-1-11-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-P-AL-5-3-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-P-LA-3-12-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-AL-3-6-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-GA-3-10-trim.sh',
 '/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/shfiles/01_trimmed_shfiles/BURT-T-TX-1-3-trim.sh']

In [28]:
# submit the sh files that kick off the rest of the commands
pids = []
for sh in chains.keys():
    sh = sh.replace('/03_mark_build_shfiles/', '/01_trimmed_shfiles/').replace("-mark.sh", "-trim.sh")
    assert op.exists(sh)
    pids.extend(qsub(sh))

100%|██████████| 1/1 [00:00<00:00,  4.63it/s]
100%|██████████| 1/1 [00:00<00:00, 27.17it/s]
100%|██████████| 1/1 [00:00<00:00, 30.66it/s]
100%|██████████| 1/1 [00:00<00:00, 33.55it/s]
100%|██████████| 1/1 [00:00<00:00, 32.64it/s]
100%|██████████| 1/1 [00:00<00:00, 27.04it/s]
100%|██████████| 1/1 [00:00<00:00, 31.93it/s]
100%|██████████| 1/1 [00:00<00:00, 29.94it/s]
100%|██████████| 1/1 [00:00<00:00, 30.17it/s]
100%|██████████| 1/1 [00:00<00:00, 30.08it/s]
100%|██████████| 1/1 [00:00<00:00, 31.47it/s]
100%|██████████| 1/1 [00:00<00:00, 30.66it/s]
100%|██████████| 1/1 [00:00<00:00, 31.69it/s]
100%|██████████| 1/1 [00:00<00:00, 29.46it/s]
100%|██████████| 1/1 [00:00<00:00, 31.37it/s]
100%|██████████| 1/1 [00:00<00:00, 30.71it/s]
100%|██████████| 1/1 [00:00<00:00, 29.30it/s]
100%|██████████| 1/1 [00:00<00:00, 31.49it/s]
100%|██████████| 1/1 [00:00<00:00, 27.82it/s]
100%|██████████| 1/1 [00:00<00:00, 31.89it/s]
100%|██████████| 1/1 [00:00<00:00, 31.27it/s]
100%|██████████| 1/1 [00:00<00:00,

In [29]:
len(pids)

150

In [30]:
pids

['2706561',
 '2706562',
 '2706563',
 '2706564',
 '2706565',
 '2706566',
 '2706567',
 '2706568',
 '2706569',
 '2706570',
 '2706571',
 '2706572',
 '2706573',
 '2706574',
 '2706575',
 '2706576',
 '2706577',
 '2706578',
 '2706579',
 '2706580',
 '2706581',
 '2706582',
 '2706583',
 '2706584',
 '2706585',
 '2706586',
 '2706587',
 '2706588',
 '2706589',
 '2706590',
 '2706591',
 '2706592',
 '2706593',
 '2706594',
 '2706595',
 '2706596',
 '2706597',
 '2706598',
 '2706599',
 '2706600',
 '2706601',
 '2706602',
 '2706603',
 '2706604',
 '2706605',
 '2706606',
 '2706607',
 '2706608',
 '2706609',
 '2706610',
 '2706611',
 '2706612',
 '2706613',
 '2706614',
 '2706615',
 '2706616',
 '2706617',
 '2706618',
 '2706619',
 '2706620',
 '2706621',
 '2706622',
 '2706623',
 '2706624',
 '2706625',
 '2706626',
 '2706627',
 '2706628',
 '2706629',
 '2706630',
 '2706631',
 '2706632',
 '2706633',
 '2706634',
 '2706635',
 '2706636',
 '2706637',
 '2706638',
 '2706639',
 '2706640',
 '2706641',
 '2706642',
 '2706643',
 '27

#### see what didn't finish

In [5]:
lobdir

'/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping'

In [6]:
markfiles = fs(op.join(lobdir, 'BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles'), endswith='.bam')
len(markfiles)

1127

In [8]:
markfiles[0]

'/home/lindb/eckertlab/BURT/002_trim_and_map/loblolly_mapping/BURT/03_dedup_rg_filtered_indexed_sorted_bamfiles/E-AL-1-10_rd.bam'

In [14]:
samps = uni(pd.read_table(op.join(lobdir, 'datatable.txt'))['sample_name'])
found = {}
for samp in nb(samps):
    samp = samp+'_rd.bam'
    found[samp] = []
    for mark in markfiles:
        if samp == op.basename(mark):
            found[samp].append(mark)
for samp,marklst in found.items():
    assert len(marklst) <= 1

100%|██████████| 1135/1135 [00:07<00:00, 154.17it/s]


In [15]:
for samp,marklst in found.items():
    if len(marklst) == 0:
        print(samp)

G-VA-2-2_rd.bam
E-FL-1-10_rd.bam
E-SC-3-1_rd.bam
T-TX-2-21_rd.bam
T-AL-4-8_rd.bam
P-LA-4-13_rd.bam
P-FL-1-2_rd.bam
T-GA-4-11_rd.bam


In [None]:
E-FL-1-10_rd.bam -> E-SC-3-1_rd.bam -> G-VA-2-2_rd.bam

In [None]:
G-VA-2-2_rd.bam -> P-FL-1-2_rd.bam -> P-LA-4-13_rd.bam -> T-AL-4-8_rd.bam -> T-GA-4-11_rd.bam -> T-TX-2-21_rd.bam