In [1]:
import os,re
from collections import defaultdict
from igf_data.illumina.samplesheet import SampleSheet
import pandas as pd

In [2]:
# Run configuration: every value a reader might need to change lives in this cell.
samplesheet_filename = 'SampleSheet.csv'              # file name searched for under fastq_dir
seqrun_igf_id = '171006_NB501820_0009_AHTGYKAFXX'     # sequencing run identifier
model_name = 'NEXTSEQ'                                # stored as 'platform_name' in each record
flowcell_id = 'HTGYKAFXX'                             # stored as 'flowcell_id' in each record
fastq_dir = '../../test_dir/test9_collect_fastq/nextseq_test/fastq/1_16'   # root directory walked for fastq files

In [3]:
def get_fastq_and_samplesheet(fastq_dir, samplesheet_filename):
    """
    Walk a fastq output directory and collect the samplesheet and the fastq files.

    :param fastq_dir: Root directory that is searched recursively
    :param samplesheet_filename: Exact file name of the samplesheet to look for
    :returns: A tuple of (samplesheet path, list of R1 fastq paths, list of R2 fastq paths)
    :raises ValueError: If no samplesheet or more than one samplesheet is found,
                        or if R2 files exist but their count differs from the R1 count
    """
    r1_fastq_regex = re.compile(r'\S+_R1_\d+\.fastq(\.gz)?', re.IGNORECASE)
    r2_fastq_regex = re.compile(r'\S+_R2_\d+\.fastq(\.gz)?', re.IGNORECASE)

    samplesheet_list = []
    r1_fastq_list = []
    r2_fastq_list = []

    for root, _dirs, files in os.walk(top=fastq_dir, topdown=True):
        if samplesheet_filename in files:
            samplesheet_list.append(os.path.join(root, samplesheet_filename))
        for file_name in files:
            # fullmatch: the whole file name must be a fastq name, so sidecar
            # files such as '<name>_R1_001.fastq.gz.md5' are not collected
            if r1_fastq_regex.fullmatch(file_name):
                r1_fastq_list.append(os.path.join(root, file_name))
            elif r2_fastq_regex.fullmatch(file_name):
                r2_fastq_list.append(os.path.join(root, file_name))

    # Single-end runs have no R2 files; only paired runs need matching counts
    if len(r2_fastq_list) > 0 and len(r1_fastq_list) != len(r2_fastq_list):
        raise ValueError('R1 {0} and R2 {1}'.format(len(r1_fastq_list), len(r2_fastq_list)))

    if len(samplesheet_list) > 1:
        raise ValueError('Found more than one samplesheet file for fastq dir {0}'.format(fastq_dir))

    # Fail with a clear message instead of an IndexError on samplesheet_list[0]
    if len(samplesheet_list) == 0:
        raise ValueError('No samplesheet file {0} found for fastq dir {1}'.format(samplesheet_filename, fastq_dir))

    return samplesheet_list[0], r1_fastq_list, r2_fastq_list

In [4]:
def link_fastq_file_to_sample(sample_name, r1_fastq_list, r2_fastq_list):
    """
    Group the fastq files belonging to one sample by lane and read type.

    File base names are expected to look like
    '<sample_name>_S<n>_L<lane>_R1_<n>.fastq[.gz]' (or R2); matching is
    case-insensitive. If several files match the same lane and read type,
    the last one in the list wins.

    :param sample_name: Sample name used as the literal file name prefix
    :param r1_fastq_list: List of R1 fastq paths
    :param r2_fastq_list: List of R2 fastq paths, may be empty
    :returns: A defaultdict of the form {lane_id: {'R1': path, 'R2': path}},
              where lane_id is the lane number as a string without zero padding
    """
    sample_files = defaultdict(dict)
    # The sample name is data, not a pattern: escape regex metacharacters
    sample_prefix = re.escape(sample_name)

    for read_type, fastq_list in (('R1', r1_fastq_list), ('R2', r2_fastq_list)):
        read_regex = re.compile(
            sample_prefix + r'_S\d+_L(\d+)_' + read_type + r'_\d+\.fastq(\.gz)?',
            re.IGNORECASE)
        for fastq_path in fastq_list:
            matched = read_regex.fullmatch(os.path.basename(fastq_path))
            if matched:
                # int() drops only the zero padding ('001' -> '1', '010' -> '10');
                # strip('0') would also remove trailing zeros
                lane_id = str(int(matched.group(1)))
                sample_files[lane_id][read_type] = fastq_path
    return sample_files

In [5]:
def collect_fastq_and_sample_info(fastq_dir, samplesheet_filename, seqrun_igf_id, model_name,
                                  flowcell_id=None):
    """
    Build one record per sample per lane from a fastq directory and its samplesheet.

    :param fastq_dir: Root directory holding the samplesheet and the fastq files
    :param samplesheet_filename: File name of the samplesheet to look for
    :param seqrun_igf_id: Sequencing run id, copied to every record
    :param model_name: Stored as 'platform_name' in every record
    :param flowcell_id: Flowcell id, copied to every record. When omitted, the
                        module-level name 'flowcell_id' is used, which is what
                        this function always read before the parameter existed
    :returns: A list of dictionaries with the keys sample_id, sample_name,
              project_name, lane_number, seqrun_igf_id, platform_name,
              flowcell_id, description and one key per read type (R1, R2)
    :raises ValueError: If no flowcell id is passed and no module-level
                        'flowcell_id' is defined
    """
    if flowcell_id is None:
        # Backward-compatible fallback to the notebook-level variable
        flowcell_id = globals().get('flowcell_id')
        if flowcell_id is None:
            raise ValueError('flowcell_id is required: pass it as an argument or define it at module level')

    (samplesheet_file, r1_fastq_list, r2_fastq_list) = \
        get_fastq_and_samplesheet(fastq_dir, samplesheet_filename)
    samplesheet_data = SampleSheet(infile=samplesheet_file)
    fastq_files_list = []
    # NOTE(review): relies on the private attribute SampleSheet._data; it is
    # iterated here as rows indexable by samplesheet column name - confirm
    # whether the class offers a public accessor
    for row in samplesheet_data._data:
        sample_name = row['Sample_Name']
        sample_id = row['Sample_ID']
        project_name = row['Sample_Project']
        description = row['Description']
        sample_files = link_fastq_file_to_sample(sample_name, r1_fastq_list, r2_fastq_list)
        for lane, lane_files in sample_files.items():
            fastq_info = {'sample_id': sample_id,
                          'sample_name': sample_name,
                          'project_name': project_name,
                          'lane_number': lane,
                          'seqrun_igf_id': seqrun_igf_id,
                          'platform_name': model_name,
                          'flowcell_id': flowcell_id,
                          'description': description
                          }
            fastq_info.update(lane_files)           # allowing only one file per lane per read type
            fastq_files_list.append(fastq_info)     # adding entries per sample per lane
    return fastq_files_list

In [6]:
# Collect one record per sample per lane for the configured sequencing run
fastq_files_list = collect_fastq_and_sample_info(
    fastq_dir=fastq_dir,
    samplesheet_filename=samplesheet_filename,
    seqrun_igf_id=seqrun_igf_id,
    model_name=model_name,
)

In [7]:
# Preview the first two collected records as a table
pd.DataFrame(fastq_files_list).head(n=2)

Unnamed: 0,R1,R2,description,flowcell_id,lane_number,platform_name,project_name,sample_id,sample_name,seqrun_igf_id
0,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,,HTGYKAFXX,4,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX
1,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,,HTGYKAFXX,3,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX


In [11]:
def _has_value(value):
    """Return True when value is not None, not NaN/NA and not otherwise empty (falsy)."""
    if value is None:
        return False
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)


def calculate_experiment_and_run_igf_id(data, restricted_list):
    """
    Derive library, experiment and run ids and the library strategy for one record.

    The library id is the record's description when one is present and not
    listed in restricted_list; otherwise it is the sample id. The input is
    never modified; a new Series is returned.

    :param data: A pandas Series or a dictionary with the keys sample_id,
                 platform_name, flowcell_id and lane_number, and optionally
                 description, R1 and R2
    :param restricted_list: Description values that must not be used as library id
    :returns: A pandas Series holding the input fields plus library_igf_id,
              experiment_igf_id, run_igf_id and library_strategy
    """
    # Work on a private copy so the caller's Series is left untouched
    data = data.copy() if isinstance(data, pd.Series) else pd.Series(data)
    restricted_list = restricted_list if restricted_list is not None else ()

    # A missing samplesheet description may arrive as '', None or NaN; NaN is
    # truthy in Python, so it needs an explicit check
    description = data.get('description')
    if _has_value(description) and description not in restricted_list:
        library_id = description                                      # reassign library id
    else:
        library_id = data.sample_id                                   # keep library id same as sample id

    experiment_id = '{0}_{1}'.format(library_id, data.platform_name)  # calculate experiment id
    data['library_igf_id'] = library_id
    data['experiment_igf_id'] = experiment_id

    run_igf_id = '{0}_{1}_{2}'.format(experiment_id, data.flowcell_id, data.lane_number)
    data['run_igf_id'] = run_igf_id                                   # calculate run id

    # In a DataFrame that mixes single and paired rows the absent R2 is NaN,
    # not None, so both reads are checked for a real value
    if _has_value(data.get('R1')) and _has_value(data.get('R2')):
        library_strategy = 'PAIRED'
    else:
        library_strategy = 'SINGLE'
    data['library_strategy'] = library_strategy
    return data

In [12]:
def build_exp_run_and_collection(fastq_files_list, restricted_list=None):
    """
    Convert the collected fastq records to a DataFrame with the derived id columns.

    :param fastq_files_list: A list of record dictionaries, one per sample per lane
    :param restricted_list: Description values that must not be used as library id;
                            defaults to ['10X'] when omitted
    :returns: A pandas DataFrame with one row per record, as returned by
              calculate_experiment_and_run_igf_id for each row
    """
    # None sentinel instead of a mutable list default shared between calls
    if restricted_list is None:
        restricted_list = ['10X']
    dataframe = pd.DataFrame(fastq_files_list)
    # axis=1 hands each row to the function as a Series
    return dataframe.apply(calculate_experiment_and_run_igf_id,
                           axis=1,
                           args=(restricted_list,))

In [13]:
# Add library, experiment and run ids to the collected fastq records
dataframe = build_exp_run_and_collection(fastq_files_list=fastq_files_list)

In [14]:
# Preview the first two rows including the derived id columns
dataframe.head(n=2)

Unnamed: 0,R1,R2,description,flowcell_id,lane_number,platform_name,project_name,sample_id,sample_name,seqrun_igf_id,library_igf_id,experiment_igf_id,run_igf_id,library_strategy
0,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,,HTGYKAFXX,4,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX,IGF0007142_QXT,IGF0007142_QXT_NEXTSEQ,IGF0007142_QXT_NEXTSEQ_HTGYKAFXX_4,PAIRED
1,../../test_dir/test9_collect_fastq/nextseq_tes...,../../test_dir/test9_collect_fastq/nextseq_tes...,,HTGYKAFXX,3,NEXTSEQ,ferrer_t2dnoncod-hybcap,IGF0007142_QXT,ctrl_91_H7_MP3913_QXT,171006_NB501820_0009_AHTGYKAFXX,IGF0007142_QXT,IGF0007142_QXT_NEXTSEQ,IGF0007142_QXT_NEXTSEQ_HTGYKAFXX_3,PAIRED
