In [1]:
import os,re
from collections import defaultdict
from igf_data.illumina.samplesheet import SampleSheet
import pandas as pd

In [2]:
# Inputs for the fastq collection run in the cells below.
fastq_dir = '../../test_dir/test9_collect_fastq/nextseq_test/fastq/1_16'  # directory tree searched for fastq files
samplesheet_filename = 'SampleSheet.csv'            # samplesheet file name looked up under fastq_dir
seqrun_igf_id = '171006_NB501820_0009_AHTGYKAFXX'   # copied unchanged into every output record
model_name = 'NEXTSEQ'                              # copied unchanged into every output record

In [3]:
def get_fastq_and_samplesheet(fastq_dir, samplesheet_filename):
    """
    Walk a directory tree and collect the samplesheet and the fastq files in it.

    :param fastq_dir: Directory that is searched recursively
    :param samplesheet_filename: Exact file name of the samplesheet to look for
    :returns: A tuple of (samplesheet path, list of R1 fastq paths,
              list of R2 fastq paths); the R2 list is empty when no R2 file is found
    :raises ValueError: If R2 files exist but their count differs from the R1 count,
                        if more than one samplesheet is found, or if none is found
    """
    r1_fastq_regex=re.compile(r'\S+_R1_\d+\.fastq(\.gz)?', re.IGNORECASE)
    r2_fastq_regex=re.compile(r'\S+_R2_\d+\.fastq(\.gz)?', re.IGNORECASE)

    samplesheet_list=list()
    r1_fastq_list=list()
    r2_fastq_list=list()

    for root, dirs, files in os.walk(top=fastq_dir, topdown=True):
        if samplesheet_filename in files:
            samplesheet_list.append(os.path.join(root,samplesheet_filename))
        for file in files:
            # fullmatch, not match: a prefix match would also accept sidecar
            # files such as 'x_R1_001.fastq.gz.md5' and skew the R1/R2 counts
            if r1_fastq_regex.fullmatch(file):
                r1_fastq_list.append(os.path.join(root,file))
            elif r2_fastq_regex.fullmatch(file):
                r2_fastq_list.append(os.path.join(root,file))

    # R2 is optional, but when present it must pair up with R1
    if len(r2_fastq_list) > 0 and len(r1_fastq_list) != len(r2_fastq_list):
        raise ValueError('R1 {0} and R2 {1}'.format(len(r1_fastq_list),len(r2_fastq_list)))

    if len(samplesheet_list) > 1:
        raise ValueError('Found more than one samplesheet file for fastq dir {0}'.format(fastq_dir))

    # fail with a clear message instead of an IndexError on samplesheet_list[0]
    if len(samplesheet_list) == 0:
        raise ValueError('No samplesheet file {0} found for fastq dir {1}'.format(samplesheet_filename,fastq_dir))

    return samplesheet_list[0], r1_fastq_list, r2_fastq_list

In [4]:
def link_fastq_file_to_sample(sample_name,r1_fastq_list, r2_fastq_list):
    """
    Group the fastq files belonging to one sample by lane and read type.

    File base names are expected to look like
    <sample_name>_S<digits>_L<lane digits>_R1_<digits>.fastq[.gz] (or _R2_),
    compared case-insensitively.

    :param sample_name: Sample name used as the literal prefix of the file name
    :param r1_fastq_list: List of R1 fastq file paths to search
    :param r2_fastq_list: List of R2 fastq file paths to search, may be empty
    :returns: A dictionary keyed by lane id (a string without zero padding,
              e.g. 'L010' gives '10'); each value is a dictionary with the key
              'R1' and / or 'R2' holding the matching file path. A later file
              for the same lane and read type overwrites an earlier one.
    """
    sample_files=defaultdict(dict)
    for read_type, fastq_list in (('R1',r1_fastq_list),('R2',r2_fastq_list)):
        # re.escape keeps regex metacharacters in the sample name literal;
        # raw strings avoid invalid escape sequence warnings for \d
        read_regex=re.compile(
            re.escape(sample_name)+r'_S\d+_L(\d+)_'+read_type+r'_\d+\.fastq(\.gz)?',
            re.IGNORECASE)
        for fastq_file in fastq_list:
            m=read_regex.fullmatch(os.path.basename(fastq_file))
            if m:
                # int() drops only the zero padding; strip('0') would also
                # remove trailing zeros and turn lane 10 into lane 1
                lane_id=str(int(m.group(1)))
                sample_files[lane_id][read_type]=fastq_file
    return sample_files

In [5]:
def collect_fastq_and_sample_info(fastq_dir,samplesheet_filename,seqrun_igf_id,model_name):
    """
    Build one record per sample and lane from a fastq directory and its samplesheet.

    :param fastq_dir: Directory passed to get_fastq_and_samplesheet
    :param samplesheet_filename: Samplesheet file name passed to get_fastq_and_samplesheet
    :param seqrun_igf_id: Value stored under 'seqrun_igf_id' in every record
    :param model_name: Value stored under 'model_name' in every record
    :returns: A list of dictionaries with the keys sample_id, sample_name,
              project_name, lane_id, seqrun_igf_id, model_name and one key per
              read type returned by link_fastq_file_to_sample for that lane
    """
    samplesheet_path, r1_files, r2_files = \
        get_fastq_and_samplesheet(fastq_dir, samplesheet_filename)
    sheet = SampleSheet(infile=samplesheet_path)
    records = []
    for entry in sheet._data:
        name = entry['Sample_Name']
        identifier = entry['Sample_ID']
        project = entry['Sample_Project']
        per_lane = link_fastq_file_to_sample(name, r1_files, r2_files)
        for lane_id, reads in per_lane.items():
            # one record per sample per lane
            record = dict(
                sample_id=identifier,
                sample_name=name,
                project_name=project,
                lane_id=lane_id,
                seqrun_igf_id=seqrun_igf_id,
                model_name=model_name,
            )
            # only one file is kept per lane per read type
            record.update(reads)
            records.append(record)
    return records

In [6]:
# Collect one record per sample per lane using the inputs configured above.
fastq_files_list = collect_fastq_and_sample_info(
    fastq_dir=fastq_dir,
    samplesheet_filename=samplesheet_filename,
    seqrun_igf_id=seqrun_igf_id,
    model_name=model_name,
)

In [7]:
# Preview the first five collected records as a table.
pd.DataFrame(data=fastq_files_list).head(n=5)

Unnamed: 0,R1,R2,lane_id,model_name,project_name,sample_id,sample_name,seqrun_igf_id
0,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,3,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX
1,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,2,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX
2,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,1,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX
3,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,4,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX
4,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,3,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007143_QXT,ctrl_81_G9_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX
