In [2]:
import glob
import os
import yaml
import pandas as pd


In [3]:
def open_sample_sheet(sample_sheet_fp, lanes=False):
    """Read in an IGM sample sheet and return a pandas DF with primary data table.

    Parameters
    ----------
    sample_sheet_fp : str
        Path to the Excel sample sheet.
    lanes : list-like, scalar, False or None, optional
        Lane value(s) to keep, matched against the 'Lane' column. A single
        lane may be given as a bare scalar. Any falsy value (the default
        ``False``, ``None``, ``0``, or an empty collection) disables
        filtering and returns every row.

    Returns
    -------
    pandas.DataFrame
        The parsed table, restricted to the requested lanes if any were given.
        Row labels are preserved from the parsed sheet (not reset).

    Raises
    ------
    KeyError
        If lanes are requested but the sheet has no 'Lane' column.
    """
    # The layout offsets are fixed: 18 leading rows are skipped and header=1
    # is handed to pandas to locate the column-name row.
    # NOTE(review): presumably this matches the IGM template preamble -- confirm
    # if the template changes.
    sample_sheet = pd.read_excel(sample_sheet_fp, skiprows=18, header=1)

    # Normalise `lanes` to a plain list. Testing truthiness directly would
    # raise for NumPy arrays / Series, and `.isin` rejects bare scalars.
    if pd.api.types.is_list_like(lanes):
        wanted_lanes = list(lanes)
    elif lanes:
        wanted_lanes = [lanes]
    else:
        wanted_lanes = []

    if wanted_lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['Lane'].isin(wanted_lanes)]

    return sample_sheet

In [21]:
def get_read(sample, seq_dir, read):
    """Return the path of the one FASTQ file for a sample and read direction.

    Files directly inside `seq_dir` are matched against the glob pattern
    ``{sample}_*_{read}_*.fastq.gz``.

    Parameters
    ----------
    sample : str
        Sample name that prefixes the file name.
    seq_dir : str
        Directory to search.
    read : str
        Read label embedded in the file name (e.g. 'R1' or 'R2').

    Returns
    -------
    str
        Path of the single matching file.

    Raises
    ------
    ValueError
        If no file matches, or if more than one file matches.
    """
    pattern = os.path.join(seq_dir, "{0}_*_{1}_*.fastq.gz".format(sample, read))
    # glob returns results in arbitrary order; sort so error messages are stable.
    matches = sorted(glob.glob(pattern))

    if len(matches) == 1:
        return matches[0]
    if not matches:
        # Report the empty case separately so it is not described as "too many".
        raise ValueError('No reads found matching: {}'.format(pattern))
    raise ValueError('Too many reads found: {}'.format(matches))

In [39]:
# Run configuration: which sample sheet to parse, which lanes to keep, and where
# the sequencing run's files live.

# Path to the sample sheet workbook (relative to the notebook's working directory).
sample_sheet_fp = './2016_08_02_Knight_Sample_Sheet_NexteraXT_Katz_EMP5001-4_KF_Ext._Test_HiSeq-2.xls'
# Lane values to keep; open_sample_sheet filters rows on the 'Lane' column with these.
lanes = [5,6,7,8]
# Directory later passed to get_read to locate each sample's *.fastq.gz files.
seq_dir = '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX'

# Parse the sheet and keep only rows whose 'Lane' is in `lanes`.
sample_sheet = open_sample_sheet(sample_sheet_fp, lanes=lanes)

# Bare last expression so the notebook renders the filtered table.
sample_sheet

Unnamed: 0,Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
192,5,Vibrio,,,,,,,
193,5,Negative,,,,,,,
194,5,Blanton_host_associated_6,,,,,,,
195,5,Mayer33_sediment_2,,,,,,,
196,5,Bowen74_sed15,,,,,,,
197,5,BH_H1,,,,,,,
198,5,Negative,,,,,,,
199,5,Vibrio,,,,,,,
200,5,Negative,,,,,,,
201,5,NP_LO_7,,,,,,,


In [26]:
# Forwarded as the `default_flow_style` argument of every yaml.dump call in this
# notebook; False requests block-style YAML (one key per line, nested by indentation)
# instead of inline flow style.
default_flow_style = False

Make samples dictionary
------

In [23]:
# Settings stored alongside every sample: the adapter FASTA shipped in the
# Trimmomatic install directory and the quality-encoding label.
adaptor = '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa'
phred = 'phred33'

# Map each Sample_ID to its forward/reverse read files plus the shared settings.
# A Sample_ID that appears on several sheet rows can only occupy one dict key, so
# iterate over the distinct IDs (kept in first-occurrence order): the resulting
# dict is the same as a per-row build, but each sample's files are looked up once
# instead of once per row.
# get_read raises ValueError unless exactly one file matches a sample/read pair.
samples_pe = {sample_id:
                  {'forward': get_read(sample_id, seq_dir, 'R1'),
                   'reverse': get_read(sample_id, seq_dir, 'R2'),
                   'adaptor': adaptor,
                   'phred': phred
                  } for sample_id in sample_sheet['Sample_ID'].unique()
             }
samples_pe

{'sample1': {'adaptor': '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa',
  'forward': './example/reads/Run1/sample1_S312_R1_L001.fastq.gz',
  'phred': 'phred33',
  'reverse': './example/reads/Run1/sample1_S312_R2_L001.fastq.gz'},
 'sample2': {'adaptor': '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa',
  'forward': './example/reads/Run1/sample2_S521_R1_L001.fastq.gz',
  'phred': 'phred33',
  'reverse': './example/reads/Run1/sample2_S521_R2_L001.fastq.gz'}}

In [37]:
# Top-level sections of the config file, in the order they are written out.
# Each entry is dumped as its own YAML mapping and the pieces are concatenated,
# so the per-sample entries end up as top-level keys next to the run settings.
config_sections = [
    {'TMP_DIR_ROOT': '/localscratch'},
    {'RUN': 'test2'},
    samples_pe,
    {'software':
         {'trimmomatic': 'java -jar /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/trimmomatic-0.36.jar',
          'gzip': 'pigz'}
    },
    {'trimmomatic_params': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33'},
]

# Serialise every section with the notebook-wide flow-style setting and join them.
config_str = ''.join(
    yaml.dump(section, default_flow_style=default_flow_style)
    for section in config_sections
)

print(config_str)

TMP_DIR_ROOT: /localscratch
RUN: test2
sample1:
  adaptor: /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa
  forward: ./example/reads/Run1/sample1_S312_R1_L001.fastq.gz
  phred: phred33
  reverse: ./example/reads/Run1/sample1_S312_R2_L001.fastq.gz
sample2:
  adaptor: /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa
  forward: ./example/reads/Run1/sample2_S521_R1_L001.fastq.gz
  phred: phred33
  reverse: ./example/reads/Run1/sample2_S521_R2_L001.fastq.gz
software:
  gzip: pigz
  trimmomatic: java -jar /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/trimmomatic-0.36.jar
trimmomatic_params: LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33

