# Tag Reads

This script links each read for person0 and person1 with the presence or absence of STRs.

In [169]:
print('')
print('READ TAGGING')

[1] ""
[1] "READ TAGGING"


# Load Modules

In [170]:
print('  loading libraries')

#################################################################################
# Libraries
#################################################################################
library(Rsamtools, verbose = FALSE)
library(dplyr)
library(ggplot2)

[1] "  loading libraries"


# Set Parameters

In [171]:
print('  setting parameters')

run_number = 'run1'
chr = 'chr11'
datadir = paste0('/mnt/aretian/genomics/nanopore/',run_number)

P0bam <- paste0(datadir,'/strspy/input/',run_number,'_', chr,'_','person0.bam')
P0ind <- paste0(datadir,'/strspy/input/',run_number,'_', chr,'_','person0.bam.bai')
P1bam <- paste0(datadir,'/strspy/input/',run_number,'_', chr,'_','person1.bam')
P1ind <- paste0(datadir,'/strspy/input/',run_number,'_', chr,'_','person1.bam.bai')

Person0Bam <- BamFile(file = P0bam, index = P0ind)
Person1Bam <- BamFile(file = P1bam, index = P1ind)
samples <- c(Person0Bam, Person1Bam)

[1] "  setting parameters"


# Load data

In [172]:
print('  loading data')

vcf_filepath <- paste0(datadir,'/',run_number,'_',chr,'_person_full.txt')
vcf <- read.table(vcf_filepath, sep = '\t', stringsAsFactors = FALSE)
# x <- readLines(vcf_filepath)

# Set column names
names <- c('name', 'count', 'chr', 'start', 'end', 'motif','str')
colnames(vcf) <- names
vcf$strname <- paste0('str', seq(1:nrow(vcf)))

[1] "  loading data"


# Helper Functions

In [173]:
get_samplename <- function(sample){
    samplename <- unlist(strsplit(sample$`.->path`, '/'))[grep('.bam', unlist(strsplit(sample$`.->path`, '/')))]
    samplename <- unlist(strsplit(samplename, '\\.'))[1] # changed to reflect current bam paths
    return(samplename)
}

# init_reads: This function creates a dataframe with all reads that overlap a given str
# inputs: str: str row number in the vcf, sample, vcf
# output: dataframe with reads attached to a given str
init_reads <- function(str, sample, vcf) {
    
    # get sample name
    samplename <- get_samplename(sample)
    
    str_chr <- vcf[str, 'chr']
    str_start <- vcf[str, 'start']
    str_end <- vcf[str, 'end']
    str_id <- vcf[str, 'name']
    str_pattern <- vcf[str, 'str']
    motif <- vcf[str, 'motif']

    # isolate reads covering that area 
    gr <- GRanges(seqnames = chr,
                  ranges = IRanges(start = str_start, end = str_end))
    params <- ScanBamParam(which = gr, what = scanBamWhat())
    aln <- scanBam(sample, param = params)
    
    # handle cases with 0 reads
    if (length(aln[[1]]$pos) == 0){
        reads <- NA
    } else {
        reads <- data.frame('read_id'=aln[[1]]$qname,
                            'str_id' = str_id,
                            'samplename' = samplename,
                            'startpos' = aln[[1]]$pos, 
                            'str_start'= 0,
                            'str_end' = 0,
                            'read' = aln[[1]]$seq,
                            'str' = str_pattern,
                            'motif' = motif,
                            'has_str' = 0)

        reads$str_start <- abs(reads$startpos - str_start)
        reads$str_end <- abs(reads$startpos - str_end)
    }
    
    return(reads)
}

# update reads function updates reads to include str sequences and read categorizations
update_reads <- function(reads, vcf, str) {

    # for each row (read) in reads
    for (read in c(1:nrow(reads))) {
            
        # Find STR in the reads
        if (grepl(reads[read,'motif'], reads[read,'read'])) {
            reads[read,'has_str'] <- 1
        }
    }

    return(reads)
}

# Implementation

In [174]:
# for each STR
for (str in c(1:nrow(vcf))) { #
                                 
    data <- as_tibble(data.frame('read_id'='',
                                 'str_id' = '',
                                 'samplename' = '',
                                 'startpos' = 0, 
                                 'str_start'= 0,
                                 'str_end' = 0,
                                 'read' = '',
                                 'str' = '',
                                 'motif' = '',
                                 'has_str' = 0))
    data <- data[-1,]
    
    # for each sample
    for (sample in samples) { # samples = c(Person0Bam, Person1Bam)

        # Get all reads that cover the VCF
        reads <- init_reads(str = str, sample = sample, vcf = vcf)
        if (is.null(nrow(reads))) {
#             meets_threshold <- FALSE
            break
        }
        
        # Get reads that contain the STR pattern
        reads <- update_reads(reads = reads, vcf = vcf, str = str)
        
        # remove reads without motif
        reads <- reads[reads$has_str == 1,]
        
        # update summary dataframe
        data <- rbind(data, reads)
    }

#     data <- data %>% mutate(uid = paste0(sample, '-', readgroup))
    data <- data %>% mutate(read_id = paste0(samplename, '-', read_id))
    data <- data %>% select(read_id, str_id, startpos)

    
    # save each str df (only if all samples met the threshold)
#     if (meets_threshold) {
#        write.csv(data, paste0('/mnt/aretian/genomics/data/str_pipeline/output/', disease, '-test/', fid, '/str', str, '.csv'), row.names = FALSE)
#     }
    write.csv(data, paste0(datadir,'/strs/str', str, '.csv'), row.names = FALSE)

    # time / str tracker:
    print(paste0('str', str, ': ', Sys.time()))
}

[1] "str1: 2021-07-15 01:35:53"
[1] "str2: 2021-07-15 01:35:53"
[1] "str3: 2021-07-15 01:35:53"
[1] "str4: 2021-07-15 01:35:53"
[1] "str5: 2021-07-15 01:35:53"
[1] "str6: 2021-07-15 01:35:54"
[1] "str7: 2021-07-15 01:35:54"
[1] "str8: 2021-07-15 01:35:54"
[1] "str9: 2021-07-15 01:35:54"
[1] "str10: 2021-07-15 01:35:54"
[1] "str11: 2021-07-15 01:35:54"
[1] "str12: 2021-07-15 01:35:55"
[1] "str13: 2021-07-15 01:35:55"
[1] "str14: 2021-07-15 01:35:55"
[1] "str15: 2021-07-15 01:35:55"
[1] "str16: 2021-07-15 01:35:55"
[1] "str17: 2021-07-15 01:35:55"
[1] "str18: 2021-07-15 01:35:55"
[1] "str19: 2021-07-15 01:35:55"
[1] "str20: 2021-07-15 01:35:55"
[1] "str21: 2021-07-15 01:35:55"
[1] "str22: 2021-07-15 01:35:55"
[1] "str23: 2021-07-15 01:35:55"
[1] "str24: 2021-07-15 01:35:55"
[1] "str25: 2021-07-15 01:35:56"
[1] "str26: 2021-07-15 01:35:56"
[1] "str27: 2021-07-15 01:35:56"
[1] "str28: 2021-07-15 01:35:56"
[1] "str29: 2021-07-15 01:35:56"
[1] "str30: 2021-07-15 01:35:56"
[1] "str31: 2021-07

# Concatenate all results

In [175]:
input = paste0(datadir,'/strs/str[[:digit:]]*.csv')
output = paste0(datadir,'/',run_number,'_',chr,'_tagged_reads.csv')

mycmd = paste0("awk 'FNR==1 && NR!=1{next;}{print}' ", input, ' > ', output)
# system(mycmd, intern=TRUE)
system(mycmd)

# EXTRA CODE

In [4]:
print('  loading data')

run_number = 'run1'
chr = 'chr11'
datadir = '/mnt/aretian/genomics/nanopore/'

# regions = data.frame('disease' = c('sickle', 'cystic', 'spinal1', 'spinal2', 'thal1', 'thal2', 'thal3'),
#                     'start' = c(5227002, 117559590, 70924941, 70049523, 176680, 172876, 5225464),
#                     'end' = c(5227002, 117559590, 70953015, 70077595, 177522, 173710, 5227071))

# args = commandArgs(trailingOnly=TRUE)

# THESE ARGS COME FROM STEP2.sh they need to change
#fid = args[1] #
# fid = 'IBS001'
# print(paste0('family id was read in as: ', fid))#
# chr = 17 #args[2] # keep this
# print(paste0('chr was read in as: ', chr))
# d = 'spinal1' #args[3] # keep this
# print(paste0('disease was read in as: ', d))

# s = regions %>% filter(disease == d) %>% select(start) %>% pull
# e = regions %>% filter(disease == d) %>% select(end) %>% pull

# fbam <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/bams/father.', chr, '.bam') 
# find <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/bams/father.', chr, '.bam.bai')
# mbam <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/bams/mother.', chr, '.bam')
# mind <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/bams/mother.', chr, '.bam.bai')
# cbam <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/bams/child.', chr, '.bam')
# cind <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/bams/child.', chr, '.bam.bai')

P0bam <- paste0(datadir,run_number,'/strspy/input/',run_number,'_', chr,'_','person0.bam')
P0ind <- paste0(datadir,run_number,'/strspy/input/',run_number,'_', chr,'_','person0.bam.bai')
P1bam <- paste0(datadir,run_number,'/strspy/input/',run_number,'_', chr,'_','person1.bam')
P1ind <- paste0(datadir,run_number,'/strspy/input/',run_number,'_', chr,'_','person1.bam.bai')

# fatherBam <- BamFile(file = fbam, index = find)
# motherBam <- BamFile(file = mbam, index = mind)
# childBam <- BamFile(file = cbam, index = cind)
Person0Bam <- BamFile(file = P0bam, index = P0ind)
Person1Bam <- BamFile(file = P1bam, index = P1ind)

# vcf_filepath <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/IBS001/hipstr/IBS001.chr5.vcf') # OLD
# vcf_filepath <- paste0('/mnt/aretian/genomics/data/str_pipeline/data/', fid, '/hipstr/', fid, '.', chr, '.vcf')
person = 'person0'
vcf_filepath <- paste0('/mnt/aretian/genomics/nanopore/',run_number,'/',run_number,'_',chr,'_person_full.txt')

vcf <- read.table(vcf_filepath, sep = '\t', stringsAsFactors = FALSE)

# Get the right column names for the vcf dataframe
x <- readLines(vcf_filepath)
# colnames <- which(grepl('^#CHROM', x))
# names <- unlist(strsplit(x[colnames], '\t'))
# names[1] <- 'CHROM'
# names[2] <- 'START'
names <- c('name', 'count', 'chr', 'start', 'end', 'motif','str')
colnames(vcf) <- names

# disease <- d
# chr <- chr
# start <- s - 10000000
# end <- e + 10000000

vcf$strname <- paste0('str', seq(1:nrow(vcf)))

#################################################################################
# Set sample and disease specific variables 
#################################################################################
# read_threshold <- 30
samples <- c(Person0Bam, Person1Bam)


In [50]:
print('  creating functions')
#################################################################################
# Functions for VCF Filtering: 
# remove homopolymers, define end points, and isolate top 50 most frequent STRs
#################################################################################

# helper function for filter_VCF; remove homopolymers
pull_period <- function(x){
    y <- unlist(strsplit(x, ';'))[grep('PERIOD', unlist(strsplit(x, ';')))]
    y <- unlist(strsplit(y, '='))[2]
    return(y)
}

# helper function for filter_VCF; add end points
pull_end <- function(x) {
    y <- unlist(strsplit(x, ';'))[grep('END', unlist(strsplit(x, ';')))]
    y <- unlist(strsplit(y, '='))[2]
    return(y)
}

# vcf = vcf file, n = number of top most frequent STRs, startrange = start of range of interest, endrange = end of range of interest
filter_VCF <- function(vcf, n, startrange, endrange) {

    # remove homopolymers
    vcf$period <- lapply(vcf$INFO, pull_period)
    vcf <- vcf[vcf$period != '1',]
    
    # add end points
    vcf$END <- as.numeric(lapply(vcf$INFO, pull_end))
    
    # filter by range of interest
    vcf <- vcf[(vcf$START > startrange) & (vcf$END < endrange),]
    
    # define top n most frequent reference sequences
    freqs <- as.data.frame(table(vcf$REF))
    freqs <- freqs[order(freqs$Freq, decreasing = TRUE),]
    freqs <- head(freqs, n = n)

    # filter vcf to reflect top n most frequent reference sequences
    vcf <- vcf[vcf$REF %in% freqs$Var1,]
    
    # return filtered vcf
    return(vcf)
}

[1] "  creating functions"


In [82]:
print('  processing STRs')
#################################################################################
# IMPLEMENTATION
# input = VCF file and BAM files. VCF is only to be used for determining reference sequence locations. 
#################################################################################

# filter vcf
vcf <- filter_VCF(vcf = vcf, n = 50, startrange = start, endrange = end)

#######################################################################################
# rename and save STRs
vcf$strname <- paste0('str', seq(1:nrow(vcf))) 
vcf$period <- as.integer(vcf$period)
# write.csv(vcf, paste0('/mnt/aretian/genomics/data/str_pipeline/output/', disease, '/', fid, '/str_mappings.csv'), row.names = FALSE) # UNCOMMENT
# write.csv(vcf, paste0('/mnt/aretian/genomics/data/str_pipeline/output/', disease, '-test/', fid, '/str_mappings.csv'), row.names = FALSE) # UNCOMMENT

#######################################################################################

[1] "  processing STRs"


ERROR: Error in `$<-.data.frame`(`*tmp*`, "period", value = list()): replacement has 0 rows, data has 1367
