In [None]:
# Imports
import pysam
import subprocess
import numpy as np
import pandas as pd

In [1]:
def index_bam(bam_file):
    """Build the index of a BAM file by running ``samtools index``.

    Parameters
    ----------
    bam_file : str or path-like
        Path to the BAM file to index.

    Raises
    ------
    subprocess.CalledProcessError
        If samtools exits with a non-zero status (``check=True``).
    FileNotFoundError
        If the ``samtools`` executable cannot be found.
    """
    # Pass the arguments as a list and do not go through a shell, so that a
    # path containing spaces or shell metacharacters is handed to samtools
    # literally instead of being re-parsed by the shell.
    cmd = ["samtools", "index", str(bam_file)]
    # Running the command using subprocess
    subprocess.run(cmd, check=True)

In [3]:
def compute_average_read_quality(bam_file, trim=10):
    """Compute the mean base quality over all reads of a BAM file.

    The first and last ``trim`` bases of every read are excluded before
    averaging.

    Parameters
    ----------
    bam_file : str
        Path to the BAM file.
    trim : int, optional
        Non-negative number of bases dropped at each end of every read
        (default 10, the value that was previously hard-coded).

    Returns
    -------
    numpy.float64
        Mean quality of the retained bases, or NaN when no base is retained
        (empty file, reads without qualities, or only reads of at most
        ``2 * trim`` bases).
    """
    # Keep a running sum and count instead of storing every base quality in a
    # Python list, which grows with the total number of sequenced bases.
    quality_sum = 0
    base_count = 0
    # The context manager closes the BAM file even if iteration raises
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over each read in the BAM file
        for read in bam:
            qualities = read.query_qualities
            # A read may carry no quality values (None); slicing None would
            # raise a TypeError, so such reads are skipped.
            if qualities is None:
                continue
            read_length = len(qualities)
            # Reads too short to keep anything after trimming contribute nothing
            if read_length <= 2 * trim:
                continue
            # Explicit end index so that trim=0 keeps the whole read
            # (a plain [trim:-trim] slice would be empty for trim=0).
            trimmed = qualities[trim:read_length - trim]
            quality_sum += sum(trimmed)
            base_count += len(trimmed)
    # No retained base: return NaN explicitly rather than dividing by zero
    if base_count == 0:
        return np.float64("nan")
    # Return the computed average read quality
    return np.float64(quality_sum) / base_count

In [4]:
def compute_coverage_at_least_50(bam_file, min_coverage=50):
    """Percentage of pileup positions covered by at least ``min_coverage`` reads.

    Parameters
    ----------
    bam_file : str
        Path to the BAM file (``pileup()`` is called on it without a region).
    min_coverage : int, optional
        Minimum number of reads a position needs to count as covered
        (default 50, the value that was previously hard-coded).

    Returns
    -------
    float
        ``100 * covered / total`` over the columns yielded by ``pileup()``,
        or 0.0 when the pileup yields no column at all.

    Notes
    -----
    NOTE(review): the denominator is the number of columns yielded by
    ``bam.pileup()``. This is understood to include only positions with at
    least one aligned read, not every position of the reference - confirm
    against the pysam documentation before interpreting the value as a
    fraction of the whole genome.
    """
    # Counters for the positions seen in the pileup and those meeting the threshold
    total_positions = 0
    covered_positions = 0
    # The context manager closes the BAM file even if the pileup raises
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over each pileup column
        for pileupcolumn in bam.pileup():
            total_positions += 1
            # Count the position when enough reads cover it (inclusive bound)
            if pileupcolumn.n >= min_coverage:
                covered_positions += 1
    # An empty pileup would otherwise raise ZeroDivisionError; nothing reaches
    # the threshold in that case, so report 0 %.
    if total_positions == 0:
        return 0.0
    # Percentage of positions that meet the minimum coverage requirement
    return (covered_positions / total_positions) * 100

In [5]:
def compute_average_mapping_quality(bam_file):
    """Compute the mean mapping quality over every read of a BAM file.

    Parameters
    ----------
    bam_file : str
        Path to the BAM file.

    Returns
    -------
    numpy.float64
        Mean of ``read.mapping_quality`` over all reads yielded when iterating
        the file, or NaN when the file yields no read.
    """
    # The context manager closes the BAM file even if iteration raises
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Stream the mapping qualities straight into a compact integer array
        # instead of building an intermediate Python list.
        mapping_qualities = np.fromiter(
            (read.mapping_quality for read in bam), dtype=np.int64
        )
    # An empty file has no mean; return NaN explicitly rather than letting
    # numpy emit a "mean of empty slice" RuntimeWarning.
    if mapping_qualities.size == 0:
        return np.float64("nan")
    # Return the computed average mapping quality
    return mapping_qualities.mean()

In [6]:
def compute_average_depth_coverage(bam_file):
    """Compute the mean depth reported by ``samtools depth`` for a BAM file.

    Parameters
    ----------
    bam_file : str or path-like
        Path to the BAM file.

    Returns
    -------
    float
        Mean of the third (depth) column over all lines printed by
        ``samtools depth``, or NaN when samtools prints nothing.

    Raises
    ------
    subprocess.CalledProcessError
        If samtools exits with a non-zero status (``check=True``).

    Notes
    -----
    NOTE(review): ``samtools depth`` is run without extra options; it is
    understood to skip positions with zero depth unless ``-a`` is given, so
    the mean would be over reported positions only - confirm against the
    samtools manual.
    """
    # Argument list without a shell: paths containing spaces or shell
    # metacharacters are passed to samtools literally.
    cmd = ["samtools", "depth", str(bam_file)]
    # Use samtools to compute the depth of the BAM file and capture the output
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # Each output line is "<ref>\t<pos>\t<depth>"; keep only the depth field.
    # Blank lines are skipped so that empty output yields an empty list
    # instead of a malformed one-column row.
    depths = [
        int(line.split('\t')[2])
        for line in result.stdout.splitlines()
        if line.strip()
    ]
    # The mean of an empty Series is NaN, which covers the empty-output case
    return pd.Series(depths, dtype="int64").mean()

In [7]:
# Define the list of samples
samples = ["04.B1.W14.01", "04.M1.W09.02",
           "05.B1.W14.04", "05.M1.W08.03",
           "27.B1.W13.06", "27.M1.W10.07",
           "30.B1.W11.08", "30.M1.W04.09",
           "38.B1.W10.11", "38.M1.W03.10",
           "39.B1.W11.12", "39.M1.W03.13", "39.M1.W05.14",
           "53.B1.W14.17", "53.M1.W07.16",
           "56.B1.W09.22", "56.M1.W03.21",
           "63.B1.W09.29", "63.M1.W02.30",
           "66.B1.W09.25", "66.M1.W02.24"]

# Column order of the results table
result_columns = ['Sample', 'Average read quality', 'Coverage percentage (>50)',
                  'Average mapping quality', 'Average depth coverage']

# Collect one record per sample and build the DataFrame once at the end:
# concatenating onto a growing DataFrame inside the loop re-copies all
# previous rows on every iteration.
records = []

# Iterate on samples to extract read information
for sample in samples:
    # Each sample is expected at data/<sample>/<sample>.bam
    bam_file = "data/" + sample + "/" + sample + ".bam"
    # The index is (re)built before the metrics are computed
    index_bam(bam_file)
    average_read_quality = compute_average_read_quality(bam_file)
    coverage_at_least_50 = compute_coverage_at_least_50(bam_file)
    average_mapping_quality = compute_average_mapping_quality(bam_file)
    average_depth_coverage = compute_average_depth_coverage(bam_file)

    # Store the metrics of the current sample, rounded to two decimals
    records.append({
        'Sample': sample,
        'Average read quality': round(average_read_quality, 2),
        'Coverage percentage (>50)': round(coverage_at_least_50, 2),
        'Average mapping quality': round(average_mapping_quality, 2),
        'Average depth coverage': round(average_depth_coverage, 2)})

# Build the results DataFrame in a single step, with a fixed column order
results_df = pd.DataFrame(records, columns=result_columns)

In [8]:
# Display the per-sample sequencing quality metrics collected in results_df
results_df

Unnamed: 0,Sample,Average read quality,Coverage percentage (>50),Average mapping quality,Average depth coverage
0,04.B1.W14.01,22.02,99.04,57.46,14303.89
1,04.M1.W09.02,22.12,99.18,56.8,32548.05
2,05.B1.W14.04,21.94,99.32,58.7,11911.52
3,05.M1.W08.03,21.87,99.11,58.54,19751.16
4,27.B1.W13.06,22.34,98.64,55.58,26189.03
5,27.M1.W10.07,22.16,99.11,56.09,14538.82
6,30.B1.W11.08,21.97,99.11,57.81,12580.72
7,30.M1.W04.09,22.04,99.25,58.02,11404.77
8,38.B1.W10.11,22.25,99.25,56.34,21268.29
9,38.M1.W03.10,22.26,99.32,56.93,13532.5
