I would like to QC all of the sequenced barcodes, i.e. see how many errors occur in the barcode sequences. However, the barcode counting script is computationally expensive, so I first downsample the barcode reads by taking 100k reads from each FASTQ file.

In [58]:
ls fastq

[0m[01;36mHYA__combined__20210323_cortex_phu_dv_etssb_1-1_S1_R2.BCONLY.fastq.gz[0m
[01;36mHYA__combined__20210323_cortex_phu_dv_etssb_1-2_S2_R2.BCONLY.fastq.gz[0m
[01;36mHYA__combined__20210323_cortex_phu_dv_etssb_1-3_S3_R2.BCONLY.fastq.gz[0m
[01;36mHYA__combined__20210323_cortex_phu_dv_etssb_1-4_S4_R2.BCONLY.fastq.gz[0m
[01;36mHYA__combined__20210323_cortex_phu_dv_etssb_1-5_S5_R2.BCONLY.fastq.gz[0m
[01;36mHYR__666332__20210412_mouse_cortex_sa_S1_R2_001.fastq.gz[0m
[01;36mHYR__c47998__20210412_mouse_cortex_sb_S2_R2_001.fastq.gz[0m


In [14]:
module load mawk

# Extract the barcode sub-sequences from every record of a gzipped FASTQ file.
#
# Odd-numbered lines (read name and "+" lines) are copied unchanged. On
# even-numbered lines (sequence and quality lines) only the characters at
# positions 1-10, 21-30 and 41-50 (1-based, inclusive) are kept and
# concatenated. This assumes 4-line FASTQ records.
#
# Arguments:
#   $1 - gzip-compressed input FASTQ file.
#   $2 - output FASTQ file; written uncompressed by mawk itself.
# Returns:
#   0 on success, 1 on a usage error, a missing input file, or when either
#   the decompression or the mawk stage of the pipeline fails.
split_fastq (){
    if [ "${#}" -lt 2 ] ; then
        printf 'Usage: split_fastq input_fastq_file output_fastq_file\n' >&2;
        return 1;
    fi

    local input_fastq_filename="${1}";
    local output_fastq_split1_filename="${2}";
    local split1_start=1;
    local split1_end=10;
    local split2_start=21;
    local split2_end=30;
    local split3_start=41;
    local split3_end=50;

    if [ ! -e "${input_fastq_filename}" ] ; then
        printf 'Error: FASTQ file "%s" could not be found.\n' "${input_fastq_filename}" >&2;
        return 1;
    fi

    if [ -z "${output_fastq_split1_filename}" ] ; then
        printf 'Error: Output FASTQ filename is empty.\n' >&2;
        return 1;
    fi

    zcat "${input_fastq_filename}" \
      | mawk \
            -v "split1_start=${split1_start}" \
            -v "split1_end=${split1_end}" \
            -v "split2_start=${split2_start}" \
            -v "split2_end=${split2_end}" \
            -v "split3_start=${split3_start}" \
            -v "split3_end=${split3_end}" \
            -v "output_fastq_split1_filename=${output_fastq_split1_filename}" \
            '
            BEGIN {
                split1_length = split1_end - split1_start + 1;
                split2_length = split2_end - split2_start + 1;
                split3_length = split3_end - split3_start + 1;
            }
            {
                if (NR % 2 == 1) {
                    # Read name or "+" line.
                    print $0 > output_fastq_split1_filename;
                } else {
                    # Sequence or quality line.
                    print substr($0, split1_start, split1_length) substr($0, split2_start, split2_length) substr($0, split3_start, split3_length) > output_fastq_split1_filename;
                }
            }';

    # Without pipefail the pipeline status is only that of mawk, so a failing
    # zcat (e.g. corrupt gzip) would go unnoticed. Inspect every stage instead
    # of changing shell options of the calling (interactive) shell.
    local -a pipeline_status=("${PIPESTATUS[@]}");

    # Check mawk first: if it dies early, zcat merely reports a broken pipe.
    if [ "${pipeline_status[1]}" -ne 0 ] ; then
        printf 'Error: Writing "%s" failed.\n' "${output_fastq_split1_filename}" >&2;
        return 1;
    fi

    if [ "${pipeline_status[0]}" -ne 0 ] ; then
        printf 'Error: Decompressing "%s" failed.\n' "${input_fastq_filename}" >&2;
        return 1;
    fi

    return 0;
}

In [15]:
# Extract the barcode-only reads of one sample with split_fastq.
# The output name ends in .fastq (not .fastq.gz) because split_fastq writes
# its output uncompressed. The job is started in the background so the
# notebook stays responsive; check on it with `jobs`.
fastq=fastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.fastq.gz
newname="${fastq%.fastq.gz}.BCONLY.fastq"
printf '%s\n' "${newname}"
split_fastq "${fastq}" "${newname}" &

fastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.BCONLY.fastq
[1] 11062


In [16]:
jobs

[1]+  Running                 split_fastq $fastq $newname &


In [6]:
ls fastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.fastq.gz

[0m[01;36mfastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.fastq.gz[0m


In [67]:
ls fastq/*BCONLY*

[0m[01;36mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-1_S1_R2.BCONLY.fastq.gz[0m
[01;36mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-2_S2_R2.BCONLY.fastq.gz[0m
[01;36mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-3_S3_R2.BCONLY.fastq.gz[0m
[01;36mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-4_S4_R2.BCONLY.fastq.gz[0m
[01;36mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-5_S5_R2.BCONLY.fastq.gz[0m
fastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.BCONLY.fastq
fastq/HYR__c47998__20210412_mouse_cortex_sb_S2_R2_001.BCONLY.fastq


In [2]:
module load seqtk

In [19]:
# Downsample every barcode-only FASTQ to 100000 reads with seqtk.
# seqtk writes uncompressed FASTQ to stdout, so the stream is gzip-compressed
# here: the barcode correction step later globs for
# fastq/*BCONLY.SUB.fastq.gz and would find nothing if the subsampled files
# were written as plain *.SUB.fastq.
for fastq in fastq/*BCONLY.fastq.gz
do
    # Skip the literal pattern when the glob matches no file.
    [ -e "${fastq}" ] || continue
    newname="${fastq%.fastq.gz}.SUB.fastq.gz"
    printf '%s\n' "${newname}"
    seqtk sample "${fastq}" 100000 | gzip -c > "${newname}"
done

fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-1_S1_R2.BCONLY.SUB.fastq.gz
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-2_S2_R2.BCONLY.SUB.fastq.gz
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-3_S3_R2.BCONLY.SUB.fastq.gz
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-4_S4_R2.BCONLY.SUB.fastq.gz
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-5_S5_R2.BCONLY.SUB.fastq.gz
fastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.BCONLY.SUB.fastq.gz
fastq/HYR__c47998__20210412_mouse_cortex_sb_S2_R2_001.BCONLY.SUB.fastq.gz


Now run the barcode correction script to get the distribution of errors in the barcodes.

In [20]:
cat /staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit/correct_barcode_in_fastq.sh
# this was written by Gert Hulselmans

#!/bin/bash

set -eo pipefail



correct_barcode_in_fastq () {
    local bc_whitelist_filename="${1}";
    local fastq_with_raw_bc_filename="${2}";
    local fastq_with_corrected_bc_filename="${3}";
    local max_mismatches="${4:-1}";
    local min_frac_bcs_to_find="${5:-0.5}";

    if [ ${#@} -lt 3 ] ; then
        printf 'Usage: correct_barcode_in_fastq bc_whitelist_file fastq_with_raw_bc_file fastq_with_corrected_bc_file [max_mismatches] [min_frac_bcs_to_find]\n';
        return 1;
    fi

    if [ ! -e "${bc_whitelist_filename}" ] ; then
        printf 'Error: Barcode whitelist file "%s" could not be found.\n' "${bc_whitelist_filename}" >&2;
        return 1;
    fi

    if [ ! -e "${fastq_with_raw_bc_filename}" ] ; then
        printf 'Error: FASTQ file with raw barcodes "%s" could not be found.\n' "${fastq_with_raw_bc_filename}" >&2;
        return 1;
    fi

    local first_barcode='';

    # Read first barcode from barcode whitelist file.
    if [ "${bc_whitelist_filename%.gz}"

In [22]:
ls fastq/*BCONLY.SUB.fastq.gz

[0m[01;31mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-1_S1_R2.BCONLY.SUB.fastq.gz[0m
[01;31mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-2_S2_R2.BCONLY.SUB.fastq.gz[0m
[01;31mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-3_S3_R2.BCONLY.SUB.fastq.gz[0m
[01;31mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-4_S4_R2.BCONLY.SUB.fastq.gz[0m
[01;31mfastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-5_S5_R2.BCONLY.SUB.fastq.gz[0m
[01;31mfastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.BCONLY.SUB.fastq.gz[0m
[01;31mfastq/HYR__c47998__20210412_mouse_cortex_sb_S2_R2_001.BCONLY.SUB.fastq.gz[0m


In [33]:
module load pigz
export PATH=$PATH:/staging/leuven/stg_00002/lcb/ghuls/software/seq/bin/

In [35]:
# Run the barcode correction script on every subsampled barcode FASTQ.
# max_mismatches and min_frac_bcs_to_find are passed as the 4th and 5th
# argument; the script's correct_barcode_in_fastq function defaults them to
# 1 and 0.5 when they are omitted.
bc_whitelist_file=/lustre1/project/stg_00002/lcb/fderop/data/20220125_hydrop_data_analysis/resources/full_barcodes_no_underscore_REVCOMP.txt
max_mismatches=3
min_frac_bcs_to_find=0.1

for fastq_with_raw_bc_file in fastq/*BCONLY.SUB.fastq.gz
do
    # Skip the literal pattern when the glob matches no file.
    [ -e "${fastq_with_raw_bc_file}" ] || continue
    printf '%s\n' "${fastq_with_raw_bc_file}"
    fastq_with_corrected_bc_file="${fastq_with_raw_bc_file%.fastq.gz}.CORR.fastq.gz"
    # One background job per sample; monitor them with `jobs`.
    /staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit/correct_barcode_in_fastq.sh \
        "${bc_whitelist_file}" \
        "${fastq_with_raw_bc_file}" \
        "${fastq_with_corrected_bc_file}" \
        "${max_mismatches}" \
        "${min_frac_bcs_to_find}" &
done

fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-1_S1_R2.BCONLY.SUB.fastq.gz
[1] 3269
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-2_S2_R2.BCONLY.SUB.fastq.gz
[2] 3270
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-3_S3_R2.BCONLY.SUB.fastq.gz
[3] 3271
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-4_S4_R2.BCONLY.SUB.fastq.gz
[4] 3272
fastq/HYA__combined__20210323_cortex_phu_dv_etssb_1-5_S5_R2.BCONLY.SUB.fastq.gz
[5] 3273
fastq/HYR__666332__20210412_mouse_cortex_sa_S1_R2_001.BCONLY.SUB.fastq.gz
[6] 3274
fastq/HYR__c47998__20210412_mouse_cortex_sb_S2_R2_001.BCONLY.SUB.fastq.gz
[7] 3276


In [40]:
jobs

[6]-  Running                 /staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit/correct_barcode_in_fastq.sh $bc_whitelist_file $fastq_with_raw_bc_file $fastq_with_corrected_bc_file $max_mismatches $min_frac_bcs_to_find &
[7]+  Running                 /staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit/correct_barcode_in_fastq.sh $bc_whitelist_file $fastq_with_raw_bc_file $fastq_with_corrected_bc_file $max_mismatches $min_frac_bcs_to_find &


At higher Hamming distances (larger max_mismatches), barcode collisions will start to occur.

In [49]:
module load pigz/2.6-GCCcore-6.4.0

In [51]:
# Re-run the barcode correction for a single sample in the background.
# NOTE(review): this relies on bc_whitelist_file, fastq_with_raw_bc_file,
# fastq_with_corrected_bc_file, max_mismatches and min_frac_bcs_to_find still
# being set from an earlier cell — confirm they hold the intended sample.
/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit/correct_barcode_in_fastq.sh \
    "${bc_whitelist_file}" \
    "${fastq_with_raw_bc_file}" \
    "${fastq_with_corrected_bc_file}" \
    "${max_mismatches}" \
    "${min_frac_bcs_to_find}" &

[1] 15414
