In [2]:
import re
import os
import pysam
import subprocess
import numpy as np
import pandas as pd
from collections import OrderedDict
from pprint import pprint

## Checking for right trimmed reads

In [6]:
def getFilePath(path = '../../analysis/trimmed_seqs', fileName = 'err-r.txt'):
    '''
    This function takes a path and returns a list containing, for every immediate subfolder of that
    path, the absolute path of the 'BBDuk' output file expected inside that subfolder.

    Arguments:
    path     -- folder whose immediate subfolders are the sample folders
    fileName -- name of the file appended to every sample folder path (default: 'err-r.txt')

    Returns:
    A list of absolute paths, one per immediate subfolder of 'path', sorted by subfolder name.
    The files themselves are not checked for existence. If 'path' does not exist or is not a
    folder, an empty list is returned.
    '''
    # The first step of os.walk() already lists the immediate subfolders of 'path'. Using it instead
    # of counting path separators keeps the result independent of how 'path' is spelled (relative
    # depth, trailing separator) and of the folder the notebook is executed from.
    try:
        root, subfolders, _ = next(os.walk(path))
    except StopIteration:
        # os.walk() yields nothing when 'path' cannot be listed
        return []
    root = os.path.abspath(root)
    # Sorting makes the order of the samples reproducible between runs and machines
    return [os.path.join(root, subfolder, fileName) for subfolder in sorted(subfolders)]

In [None]:
# Defining paths to stat files
trimmedPath = '../../analysis/trimmed_seqs'

# Creating a list of absolute paths, one entry per sample folder
trimmedList = getFilePath(path = trimmedPath)
valueList = []
# Looping over files and extracting summary statistics
for statFile in trimmedList:
    # The file is read directly instead of running `grep ^Total <file> | cut -f 2-` through a shell:
    # the unquoted path in the shell command broke on folder names with spaces or shell metacharacters.
    with open(statFile, encoding='utf-8') as handle:
        totalLine = next((line.rstrip('\n') for line in handle if line.startswith('Total')), None)
    if totalLine is None:
        raise ValueError("No line starting with 'Total' found in {}".format(statFile))
    # Same as `cut -f 2-`: drop the first tab-separated field, keep lines without any tab unchanged
    trimmedSN = totalLine.split('\t', 1)[1] if '\t' in totalLine else totalLine
    # NOTE(review): fixed character window into the line. Presumably it targets a numeric field whose
    # position shifts with the width of the values before it -- confirm against the real file layout.
    valueList.append(trimmedSN[11:15])

In [80]:
# Turning every extracted string into a float (in place) and wrapping the result in a numpy array
for position, rawValue in enumerate(valueList):
    valueList[position] = float(rawValue)
myarray = np.array(valueList)

## Checking for left trimmed reads

In [82]:
def getFilePath(path = '../../analysis/trimmed_seqs', fileName = 'err-b.txt'):
    '''
    This function takes a path and returns a list containing, for every immediate subfolder of that
    path, the absolute path of the 'BBDuk' output file expected inside that subfolder.

    Arguments:
    path     -- folder whose immediate subfolders are the sample folders
    fileName -- name of the file appended to every sample folder path (default: 'err-b.txt')

    Returns:
    A list of absolute paths, one per immediate subfolder of 'path', sorted by subfolder name.
    The files themselves are not checked for existence. If 'path' does not exist or is not a
    folder, an empty list is returned.
    '''
    # The first step of os.walk() already lists the immediate subfolders of 'path'. Using it instead
    # of counting path separators keeps the result independent of how 'path' is spelled (relative
    # depth, trailing separator) and of the folder the notebook is executed from.
    try:
        root, subfolders, _ = next(os.walk(path))
    except StopIteration:
        # os.walk() yields nothing when 'path' cannot be listed
        return []
    root = os.path.abspath(root)
    # Sorting makes the order of the samples reproducible between runs and machines
    return [os.path.join(root, subfolder, fileName) for subfolder in sorted(subfolders)]

In [None]:
# Defining paths to stat files
trimmedPath = '../../analysis/trimmed_seqs'

# Creating a list of absolute paths, one entry per sample folder
trimmedList = getFilePath(path = trimmedPath)
valueList = []
# Looping over files and extracting summary statistics
for statFile in trimmedList:
    # The file is read directly instead of running `grep ^Total <file> | cut -f 2-` through a shell:
    # the unquoted path in the shell command broke on folder names with spaces or shell metacharacters.
    with open(statFile, encoding='utf-8') as handle:
        totalLine = next((line.rstrip('\n') for line in handle if line.startswith('Total')), None)
    if totalLine is None:
        raise ValueError("No line starting with 'Total' found in {}".format(statFile))
    # Same as `cut -f 2-`: drop the first tab-separated field, keep lines without any tab unchanged
    trimmedSN = totalLine.split('\t', 1)[1] if '\t' in totalLine else totalLine
    # NOTE(review): fixed character window into the line. Presumably it targets a numeric field whose
    # position shifts with the width of the values before it -- confirm against the real file layout.
    valueList.append(trimmedSN[9:13])

In [88]:
# Turning every extracted string into a float (in place) and wrapping the result in a numpy array
for position, rawValue in enumerate(valueList):
    valueList[position] = float(rawValue)
myarray = np.array(valueList)

## Checking for quality trimmed reads

In [28]:
def getFilePath(path = '../../analysis/trimmed_seqs', fileName = 'err-c.txt'):
    '''
    This function takes a path and returns a list containing, for every immediate subfolder of that
    path, the absolute path of the 'BBDuk' output file expected inside that subfolder.

    Arguments:
    path     -- folder whose immediate subfolders are the sample folders
    fileName -- name of the file appended to every sample folder path (default: 'err-c.txt')

    Returns:
    A list of absolute paths, one per immediate subfolder of 'path', sorted by subfolder name.
    The files themselves are not checked for existence. If 'path' does not exist or is not a
    folder, an empty list is returned.
    '''
    # The first step of os.walk() already lists the immediate subfolders of 'path'. Using it instead
    # of counting path separators keeps the result independent of how 'path' is spelled (relative
    # depth, trailing separator) and of the folder the notebook is executed from.
    try:
        root, subfolders, _ = next(os.walk(path))
    except StopIteration:
        # os.walk() yields nothing when 'path' cannot be listed
        return []
    root = os.path.abspath(root)
    # Sorting makes the order of the samples reproducible between runs and machines
    return [os.path.join(root, subfolder, fileName) for subfolder in sorted(subfolders)]

In [None]:
# Defining paths to stat files
trimmedPath = '../../analysis/trimmed_seqs'

# Creating a list of absolute paths, one entry per sample folder
trimmedList = getFilePath(path = trimmedPath)
valueList = []
# Looping over files and extracting summary statistics
for statFile in trimmedList:
    # The file is read directly instead of running `grep ^Total <file> | cut -f 2-` through a shell:
    # the unquoted path in the shell command broke on folder names with spaces or shell metacharacters.
    with open(statFile, encoding='utf-8') as handle:
        totalLine = next((line.rstrip('\n') for line in handle if line.startswith('Total')), None)
    if totalLine is None:
        raise ValueError("No line starting with 'Total' found in {}".format(statFile))
    # Same as `cut -f 2-`: drop the first tab-separated field, keep lines without any tab unchanged
    trimmedSN = totalLine.split('\t', 1)[1] if '\t' in totalLine else totalLine
    # Showing the extracted line so the character window below can be checked by eye
    pprint(trimmedSN)
    # NOTE(review): fixed character window into the line. Presumably it targets a numeric field whose
    # position shifts with the width of the values before it -- confirm against the real file layout.
    valueList.append(trimmedSN[14:18])

In [68]:
# Turning every extracted string into a float (in place) and wrapping the result in a numpy array
for position, rawValue in enumerate(valueList):
    valueList[position] = float(rawValue)
myarray = np.array(valueList)

In [104]:
def BBDukDic(file_path):
    '''
    This function takes the .txt file created by the 'stats' argument of BBDuk and creates an ordered dictionary
    containing the following items in the here displayed order. The values are lists of strings:
    'File':[input file(s)]
    'Total':[Combined number of reads in all input files]
    'Matched':[Total number of matched reads by BBDuk filter, Percentage of total matched reads]
    'Adapter_name':[Total number of matched reads by BBDuk filter, Percentage of total matched reads]
    
    The entries for adapters appear only if the filtered sequence was trimmed using the BBDuk arguments
    'ktrim' in combination with a reference fasta file referred to by the argument 'ref'.
    
    Every line is stripped of all '#' and '%' characters and split at whitespace; the first word becomes
    the key and the remaining words the value list. Blank lines are skipped and the column header row
    ('Name') is dropped if present.
    
    This function is only tentative and will most certainly not catch all output of BBDuk, when executed with
    here unused arguments, in a proper way.
    '''
    dic = OrderedDict()
    # The context manager closes the file even if parsing fails
    with open(file_path) as handle:
        for line in handle:
            # Removing the leading '#' and the tailing '%'. Only these characters are removed, so rows
            # that do not start with '#' keep their first character.
            line = re.sub('[%#]', '', line)
            raw = line.split() # Splitting line at all whitespaces into a list of words
            if not raw:
                continue # Nothing to store for blank lines
            dic[raw[0]] = raw[1:] # Every keyword in one line followed by a list of values it represents
    # Dropping the column header row; tolerated to be absent
    dic.pop('Name', None)
    return dic