In [2]:
import re
import os
import pysam
import subprocess
import numpy as np
import pandas as pd
from pprint import pprint

## Defining the function 'getFilePath' to get the paths of all desired files

In [16]:
def getFilePath(path = '../../analysis/alignment_data_markeddup'):
    '''
    Collect the expected 'samtools stats' output file of every sample directory.

    Every immediate subdirectory of `path` is treated as one sample directory, and
    the stats file is expected to be called '<sample>_stats_marked.out' inside it.
    The sample directories are located relative to `path` itself, so the result does
    not depend on the current working directory or on how `path` is spelled
    (relative, absolute, trailing separator). The stats files are NOT checked for
    existence.

    Parameters
    ----------
    path : str
        Directory whose immediate subdirectories are the sample directories.

    Returns
    -------
    list of str
        Absolute paths, one per sample directory, ordered by sample name.
        Empty if `path` does not exist or contains no subdirectories.
    '''
    base = os.path.abspath(path)
    pathList = []
    for root, subfolders, _ in os.walk(base):
        if root == base:
            # Top level: only fix the visiting order of the sample directories so
            # that the returned list is reproducible between runs.
            subfolders.sort()
            continue
        # Because the walk is pruned below, every other `root` is a sample directory.
        sample = os.path.basename(root)
        pathList.append(os.path.join(root, sample + '_stats_marked.out'))
        # Prune: nothing below a sample directory is of interest.
        del subfolders[:]
    return pathList

## Create an empty data frame: loop over all stats files to collect the sample names (row index) and the summary statistic names (columns)

In [17]:
# Absolute paths of the 'samtools stats' output files, one per sample directory
fileList = getFilePath(path = '../../analysis/alignment_data_markeddup')
# Sample names; they become the row index of the data frame
sampleList = []
# Column names; taken from the SN (summary numbers) labels of the first stats file
# that contains any
nameArray = []

for file in fileList:
    # The sample name is the name of the directory that holds the stats file
    sampleList.append(os.path.basename(os.path.dirname(file)))

    # The column names are only needed once
    if nameArray:
        continue

    # Read the SN section directly in Python (no shell involved, so paths with
    # spaces or special characters are safe). Each row is the tab-separated SN line
    # without its leading 'SN' tag, i.e. [label, value, optional comment].
    # A missing stats file raises FileNotFoundError here.
    with open(file, encoding='utf-8') as handle:
        SN = [line.rstrip('\n').split('\t')[1:] for line in handle if line.startswith('SN\t')]

    for fields in SN:
        # Drop punctuation (e.g. the trailing ':' and parentheses), trim the label
        # and replace runs of whitespace with underscores
        label = re.sub(r"[^\w\s]", '', fields[0])
        label = label.strip()
        nameArray.append(re.sub(r"\s+", '_', label))

# Creation of the empty data frame. This has to happen once, after the loop, so that
# the index holds all samples and the frame also exists when no file was found.
statFrame = pd.DataFrame(index=sampleList, columns=nameArray)

## Fill the data frame with a loop

In [20]:
# Defining path to the directory holding the per-sample stat files
bwaPath = '../../analysis/alignment_data_markeddup'

# Absolute paths of the 'samtools stats' output files, one per sample directory
bwaList = getFilePath(path = bwaPath)

# Looping over files and copying the summary statistics into the data frame
for bwaFile in bwaList:
    # The sample name (directory holding the stats file) identifies the row to fill
    bwaSample = os.path.basename(os.path.dirname(bwaFile))

    # Read the SN (summary numbers) section directly in Python (no shell involved,
    # so paths with spaces or special characters are safe). Each row is the
    # tab-separated SN line without its leading 'SN' tag, i.e.
    # [label, value, optional comment].
    # A missing stats file raises FileNotFoundError here.
    with open(bwaFile, encoding='utf-8') as handle:
        bwaSN = [line.rstrip('\n').split('\t')[1:] for line in handle if line.startswith('SN\t')]

    # Keep only the value of every SN line; values stay text exactly as written in
    # the stats file
    bwaValArray = np.array([fields[1] for fields in bwaSN])

    # Guard against stats files with a different set of SN lines: a mismatching row
    # must not be written (or silently broadcast) into the frame
    if len(bwaValArray) != len(statFrame.columns):
        raise ValueError('{} provides {} summary values, expected {}'.format(
            bwaFile, len(bwaValArray), len(statFrame.columns)))

    # Filling in data frame
    statFrame.loc[bwaSample] = bwaValArray

# Save data frame to csv file for plotting in R

In [57]:
# Absolute location of the summary table, resolved against the current working directory
outPath = os.path.abspath('../../analysis/alignment_data_markeddup/summary_table.csv')
# Write the frame with the sample names as first column and the statistic names as header
statFrame.to_csv(path_or_buf=outPath)