# Extracting Data from the stats files of the alignment of all 192 samples

## Import all necessary modules

In [1]:
import re
import os
import subprocess
import numpy as np
import pandas as pd
from pprint import pprint
import numpy as np
import math

## Defining the function 'getFilePath' to get the paths of all desired files

In [2]:
def getFilePath(path = '../../analysis/alignment_data_GWAS', fileIndex = 2, depth = 4):
    '''
    Collect one file per sample directory found below ``path``.

    The tree below ``path`` is walked with ``os.walk``. Every directory whose path string, as
    yielded by ``os.walk``, contains exactly ``depth`` path separators is treated as a sample
    directory, and the file at position ``fileIndex`` of its file listing is selected.

    Parameters
    ----------
    path : str
        Root directory to walk. The separator count is taken on the path as written, so
        ``depth`` has to match the way ``path`` is spelled (the default path contains three
        separators, hence its direct sub-directories contain four).
    fileIndex : int
        Position of the wanted file inside the file listing of each sample directory.
    depth : int
        Number of path separators a directory path must contain to be considered.

    Returns
    -------
    list of str
        Absolute paths of the selected files, one per matching directory.

    Raises
    ------
    IndexError
        If ``fileIndex`` is out of range for the file listing of a matching directory.

    NOTE: ``os.walk`` does not sort the file listing; the position of a file depends on the
    order in which the operating system returns the directory entries.
    '''
    pathList = []
    for root, _subfolders, fileNames in os.walk(path):
        # Skip the root itself and anything that is not at the expected depth
        if root.count(os.sep) != depth:
            continue
        try:
            selected = fileNames[fileIndex]
        except IndexError:
            # Same exception type as plain list indexing, but naming the offending directory
            raise IndexError(
                'fileIndex {} is out of range for the {} file(s) in {}'.format(
                    fileIndex, len(fileNames), os.path.abspath(root))) from None
        # Concatenating the absolute sample path and the selected file name
        pathList.append(os.path.join(os.path.abspath(root), selected))
    return pathList

## Creation and filling of data frame for original BAM files by looping over all stat files and extracting indices and sample names

In [3]:
##############################
#                            #
# Collecting the stats files #
#                            #
##############################

# Defining path to stat files
bamPath = '../../analysis/alignment_data_GWAS'

# List of absolute paths, one 'samtools stats' output file per sample directory
bamList = getFilePath(path = bamPath)
# The same list is kept under its second original name as well
fileList = bamList

# Fail early: without this check an empty list would leave 'statFrame' undefined or,
# in a notebook session, silently save the frame left over from another cell
if not bamList:
    raise FileNotFoundError('No stats files found below {}'.format(os.path.abspath(bamPath)))


def _readSummaryNumbers(statFile):
    '''
    Return the summary number (SN) records of one stats file as a list of field lists.

    Every line starting with "SN" is split on tabs and the leading "SN" field is dropped.
    For tab separated lines this yields the same fields as `grep ^SN <file> | cut -f 2-`,
    but no shell is involved, so paths with spaces or shell metacharacters are safe.
    '''
    with open(statFile, encoding='utf-8') as handle:
        return [line.rstrip('\n').split('\t')[1:] for line in handle if line.startswith('SN')]


def _cleanColumnName(rawName):
    '''
    Remove punctuation, trim surrounding whitespace and replace whitespace runs by underscores.
    '''
    cleaned = re.sub(r"[^\w\s]", '', rawName).strip()
    return re.sub(r"\s+", '_', cleaned)


##############################
#                            #
# Parsing every file once    #
#                            #
##############################

# Sample index list, column name list and one list of values per sample
sampleList = []
nameArray = []
valueRows = []

for statFile in bamList:
    # The sample name is the name of the directory holding the stats file
    sampleList.append(os.path.basename(os.path.dirname(statFile)))
    records = _readSummaryNumbers(statFile)

    # Column names are taken from the first file that provides any
    if not nameArray:
        nameArray = [_cleanColumnName(record[0]) for record in records]

    # Second field of every record is the value
    valueRows.append([record[1] for record in records])


##########################
#                        #
# Filling the data frame #
#                        #
##########################

# The frame is created once, after all sample names are known
statFrame = pd.DataFrame(index=sampleList, columns=nameArray)
for sample, values in zip(sampleList, valueRows):
    # Values stay strings, exactly as they appear in the stats file
    statFrame.loc[sample] = np.array(values)


#########################
#                       #
# Saving the data frame #
#                       #
#########################

# Save data frame to csv file for plotting in R
outPath = os.path.abspath('../../analysis/alignment_data_GWAS/summary_table.csv')
statFrame.to_csv(outPath)

## Creation and filling of data frame for duplicate marked BAM files by looping over all stat files and extracting indices and sample names

In [13]:
##############################################################################
#                                                                            #
# Redefining the stats path function for catching the right file in the path #
#                                                                            #
##############################################################################

def getFilePath(path = '../../analysis/alignment_data_GWAS', fileIndex = 5, depth = 4):
    '''
    Collect one file per sample directory found below ``path``.

    The tree below ``path`` is walked with ``os.walk``. Every directory whose path string, as
    yielded by ``os.walk``, contains exactly ``depth`` path separators is treated as a sample
    directory, and the file at position ``fileIndex`` of its file listing is selected.

    Parameters
    ----------
    path : str
        Root directory to walk. The separator count is taken on the path as written, so
        ``depth`` has to match the way ``path`` is spelled (the default path contains three
        separators, hence its direct sub-directories contain four).
    fileIndex : int
        Position of the wanted file inside the file listing of each sample directory.
    depth : int
        Number of path separators a directory path must contain to be considered.

    Returns
    -------
    list of str
        Absolute paths of the selected files, one per matching directory.

    Raises
    ------
    IndexError
        If ``fileIndex`` is out of range for the file listing of a matching directory.

    NOTE: ``os.walk`` does not sort the file listing; the position of a file depends on the
    order in which the operating system returns the directory entries.
    '''
    pathList = []
    for root, _subfolders, fileNames in os.walk(path):
        # Skip the root itself and anything that is not at the expected depth
        if root.count(os.sep) != depth:
            continue
        try:
            selected = fileNames[fileIndex]
        except IndexError:
            # Same exception type as plain list indexing, but naming the offending directory
            raise IndexError(
                'fileIndex {} is out of range for the {} file(s) in {}'.format(
                    fileIndex, len(fileNames), os.path.abspath(root))) from None
        # Concatenating the absolute sample path and the selected file name
        pathList.append(os.path.join(os.path.abspath(root), selected))
    return pathList


##############################
#                            #
# Collecting the stats files #
#                            #
##############################

# Defining path to stat files
bamPath = '../../analysis/alignment_data_markeddup'

# List of absolute paths, one 'samtools stats' output file per sample directory
bamList = getFilePath(path = bamPath)
# The same list is kept under its second original name as well
fileList = bamList

# Fail early: without this check an empty list would leave 'statFrame' undefined or,
# in a notebook session, silently save the frame left over from another cell
if not bamList:
    raise FileNotFoundError('No stats files found below {}'.format(os.path.abspath(bamPath)))


def _readSummaryNumbers(statFile):
    '''
    Return the summary number (SN) records of one stats file as a list of field lists.

    Every line starting with "SN" is split on tabs and the leading "SN" field is dropped.
    For tab separated lines this yields the same fields as `grep ^SN <file> | cut -f 2-`,
    but no shell is involved, so paths with spaces or shell metacharacters are safe.
    '''
    with open(statFile, encoding='utf-8') as handle:
        return [line.rstrip('\n').split('\t')[1:] for line in handle if line.startswith('SN')]


def _cleanColumnName(rawName):
    '''
    Remove punctuation, trim surrounding whitespace and replace whitespace runs by underscores.
    '''
    cleaned = re.sub(r"[^\w\s]", '', rawName).strip()
    return re.sub(r"\s+", '_', cleaned)


##############################
#                            #
# Parsing every file once    #
#                            #
##############################

# Sample index list, column name list and one list of values per sample
sampleList = []
nameArray = []
valueRows = []

for statFile in bamList:
    # The sample name is the name of the directory holding the stats file
    sampleList.append(os.path.basename(os.path.dirname(statFile)))
    records = _readSummaryNumbers(statFile)

    # Column names are taken from the first file that provides any
    if not nameArray:
        nameArray = [_cleanColumnName(record[0]) for record in records]

    # Second field of every record is the value
    valueRows.append([record[1] for record in records])


##########################
#                        #
# Filling the data frame #
#                        #
##########################

# The frame is created once, after all sample names are known
statFrame = pd.DataFrame(index=sampleList, columns=nameArray)
for sample, values in zip(sampleList, valueRows):
    # Values stay strings, exactly as they appear in the stats file
    statFrame.loc[sample] = np.array(values)


#########################
#                       #
# Saving the data frame #
#                       #
#########################

# Save data frame to csv file for plotting in R
outPath = os.path.abspath('../../analysis/alignment_data_markeddup/summary_table_markeddup.csv')
statFrame.to_csv(outPath)

## Creation and filling of data frame for duplicate removed BAM files by looping over all stat files and extracting indices and sample names

In [14]:
##############################################################################
#                                                                            #
# Redefining the stats path function for catching the right file in the path #
#                                                                            #
##############################################################################

def getFilePath(path = '../../analysis/alignment_data_GWAS', fileIndex = 6, depth = 4):
    '''
    Collect one file per sample directory found below ``path``.

    The tree below ``path`` is walked with ``os.walk``. Every directory whose path string, as
    yielded by ``os.walk``, contains exactly ``depth`` path separators is treated as a sample
    directory, and the file at position ``fileIndex`` of its file listing is selected.

    Parameters
    ----------
    path : str
        Root directory to walk. The separator count is taken on the path as written, so
        ``depth`` has to match the way ``path`` is spelled (the default path contains three
        separators, hence its direct sub-directories contain four).
    fileIndex : int
        Position of the wanted file inside the file listing of each sample directory.
    depth : int
        Number of path separators a directory path must contain to be considered.

    Returns
    -------
    list of str
        Absolute paths of the selected files, one per matching directory.

    Raises
    ------
    IndexError
        If ``fileIndex`` is out of range for the file listing of a matching directory.

    NOTE: ``os.walk`` does not sort the file listing; the position of a file depends on the
    order in which the operating system returns the directory entries.
    '''
    pathList = []
    for root, _subfolders, fileNames in os.walk(path):
        # Skip the root itself and anything that is not at the expected depth
        if root.count(os.sep) != depth:
            continue
        try:
            selected = fileNames[fileIndex]
        except IndexError:
            # Same exception type as plain list indexing, but naming the offending directory
            raise IndexError(
                'fileIndex {} is out of range for the {} file(s) in {}'.format(
                    fileIndex, len(fileNames), os.path.abspath(root))) from None
        # Concatenating the absolute sample path and the selected file name
        pathList.append(os.path.join(os.path.abspath(root), selected))
    return pathList


##############################
#                            #
# Collecting the stats files #
#                            #
##############################

# Defining path to stat files (the stats files are looked up in the markeddup directory)
bamPath = '../../analysis/alignment_data_markeddup'

# List of absolute paths, one 'samtools stats' output file per sample directory
bamList = getFilePath(path = bamPath)
# The same list is kept under its second original name as well
fileList = bamList

# Fail early: without this check an empty list would leave 'statFrame' undefined or,
# in a notebook session, silently save the frame left over from another cell
if not bamList:
    raise FileNotFoundError('No stats files found below {}'.format(os.path.abspath(bamPath)))


def _readSummaryNumbers(statFile):
    '''
    Return the summary number (SN) records of one stats file as a list of field lists.

    Every line starting with "SN" is split on tabs and the leading "SN" field is dropped.
    For tab separated lines this yields the same fields as `grep ^SN <file> | cut -f 2-`,
    but no shell is involved, so paths with spaces or shell metacharacters are safe.
    '''
    with open(statFile, encoding='utf-8') as handle:
        return [line.rstrip('\n').split('\t')[1:] for line in handle if line.startswith('SN')]


def _cleanColumnName(rawName):
    '''
    Remove punctuation, trim surrounding whitespace and replace whitespace runs by underscores.
    '''
    cleaned = re.sub(r"[^\w\s]", '', rawName).strip()
    return re.sub(r"\s+", '_', cleaned)


##############################
#                            #
# Parsing every file once    #
#                            #
##############################

# Sample index list, column name list and one list of values per sample
sampleList = []
nameArray = []
valueRows = []

for statFile in bamList:
    # The sample name is the name of the directory holding the stats file
    sampleList.append(os.path.basename(os.path.dirname(statFile)))
    records = _readSummaryNumbers(statFile)

    # Column names are taken from the first file that provides any
    if not nameArray:
        nameArray = [_cleanColumnName(record[0]) for record in records]

    # Second field of every record is the value
    valueRows.append([record[1] for record in records])


##########################
#                        #
# Filling the data frame #
#                        #
##########################

# The frame is created once, after all sample names are known
statFrame = pd.DataFrame(index=sampleList, columns=nameArray)
for sample, values in zip(sampleList, valueRows):
    # Values stay strings, exactly as they appear in the stats file
    statFrame.loc[sample] = np.array(values)


#########################
#                       #
# Saving the data frame #
#                       #
#########################

# Save data frame to csv file for plotting in R
outPath = os.path.abspath('../../analysis/alignment_data_markeddup/summary_table_removeddup.csv')
statFrame.to_csv(outPath)

## Defining the function 'samStatsDataFrame' to create a pandas Data Frame

In [97]:
# Placeholder for a function that creates a pandas data frame from the SN data of 'samtools stats'
def samStatsDataFrame(path = '../../analysis/alignment_data/bwa-mem'):
    '''
    Placeholder for creating a pandas data frame.

    NOTE: the body consists of this docstring only, so calling the function does nothing
    and returns None; the ``path`` argument is not used yet.
    '''
    

## Extracting coverage data produced with MultiQC

In [44]:
# Location of the MultiQC general statistics table, relative to the working directory
path = '../../analysis/raw_seqs/multiqc_data/multiqc_general_stats.txt'
# Resolve it to an absolute path and print the result as a quick sanity check
absPath = os.path.abspath(path)
print(absPath)

/media/rna/NEOPHOCA/GWAS/analysis/raw_seqs/multiqc_data/multiqc_general_stats.txt


In [46]:
# Load the tab separated general statistics table into a data frame
covFrame = pd.read_csv(absPath, sep='\t')
#covFrame = covFrame.iloc[::2]

In [None]:
# Show the column names of the table as a plain list
covFrame.columns.tolist()

In [67]:
# Sort ascending by the fifth column first and by the fourth column second
covFrameSorted = covFrame.sort_values(by=list(covFrame.columns[[4, 3]]))

In [68]:
# Remove the average sequence length column from the sorted table
covFrameSorted = covFrameSorted.drop('FastQC_mqc-generalstats-fastqc-avg_sequence_length', axis=1)

In [None]:
# Bare expression: shows the sorted table as the cell output
covFrameSorted

In [None]:
# Mean of the total sequence counts over all rows of the sorted table
covFrameSorted['FastQC_mqc-generalstats-fastqc-total_sequences'].mean()