In [7]:
import pandas as pd
import tabula
import re
import os
from glob import glob
from datetime import datetime

In [8]:
# Timestamp (minute resolution) captured once when this cell runs; it is used
# as the prefix of the output CSV file name.
current_datetime = f"{datetime.now():%Y-%m-%d-%H-%M}"

In [9]:
# Function to extract run number from run folder
def run_number(folder_name):
    """Return the run number embedded in a run-folder name.

    The first occurrence of ``HRD_Run`` followed by one or more digits is
    located anywhere in ``folder_name`` and the digits are returned as an int.
    If the name contains no such pattern, -1 is returned so that such names
    sort ahead of every numbered run.
    """
    found = re.search(r'HRD_Run(\d+)', folder_name)
    return -1 if found is None else int(found.group(1))

In [10]:
# Function to extract GI QA table from GI-report.pdf
def extract_QA_table_from_GI_report(file_path, pages=(5,), area=(13, 0, 57, 100)):
    """Read the GI QA table out of a GI-Report PDF with tabula.

    Parameters
    ----------
    file_path : str
        Path of the GI report PDF to read.
    pages : int, str or sequence of int, optional
        Page selection handed to ``tabula.read_pdf``. Defaults to page 5,
        the page this notebook has always read.
    area : sequence of number, optional
        Region to read, as ``[top, left, bottom, right]``. It is passed
        together with ``relative_area=True``, so tabula treats the values as
        relative to the page size rather than as absolute points.
        Defaults to ``[13, 0, 57, 100]``.

    Returns
    -------
    pandas.DataFrame or None
        The first table returned by tabula (read with ``header=None``, i.e.
        no row is promoted to column names), or None when tabula returns
        nothing for the requested page/area.
    """
    print(f"Extracting GI QA table from {file_path}")

    # The defaults are tuples only to avoid mutable default arguments; tabula
    # is given lists, exactly as before. Other values (an int, "all", a list)
    # are passed through untouched.
    if isinstance(pages, tuple):
        pages = list(pages)
    if isinstance(area, tuple):
        area = list(area)

    tables = tabula.read_pdf(
        file_path,
        pages=pages,
        multiple_tables=False,
        area=area,
        relative_area=True,
        stream=True,
        guess=False,
        pandas_options={'header': None},
    )

    # An empty result means nothing was parsed from the selected region.
    if not tables:
        print('Could not find')
        return None

    return tables[0]

In [11]:
# Function to traverse run folders getting GI QA tables from GI-Reports and combine
def traverse_folders(root_folder):
    """Combine the GI QA tables of every run folder into one CSV file.

    Sub-folders of ``root_folder`` whose names start with ``HRD_Run`` are
    visited in ascending run-number order. For every ``*-GI-Report.pdf`` file
    found, the QA table is extracted, its leading rows are dropped, the fixed
    column headers are applied, and the rows are appended to the combined
    table. The result is written once, after all folders have been processed,
    to ``<output_folder>/<current_datetime>_combined_GI_QA_table.csv``.

    This function depends on module-level names defined in other notebook
    cells: ``run_number``, ``extract_QA_table_from_GI_report``,
    ``current_datetime`` and ``output_folder``. All of them must exist before
    the function is called.

    Parameters
    ----------
    root_folder : str
        Folder that contains the ``HRD_Run*`` run folders.

    Returns
    -------
    pandas.DataFrame or None
        The combined table that was written to disk, or None when no table
        could be extracted (in which case no file is written).
    """
    gi_qa_headers = ['Sample','Total(M)', 'WGS(M)', "%WGS", "%CovOut", "PPR", 'RN', 'SNR', 'QAStatus']
    # Number of leading rows dropped from each extracted table before the
    # headers above are applied -- presumably the table's own header lines in
    # the PDF; TODO confirm against a report.
    rows_to_skip = 3

    folder_names = sorted(
        (name for name in os.listdir(root_folder) if name.startswith("HRD_Run")),
        key=run_number,
    )

    extracted_tables = []
    total_rows = 0

    for foldername in folder_names:
        folder_path = os.path.join(root_folder, foldername)
        # A plain file whose name happens to start with "HRD_Run" cannot be listed.
        if not os.path.isdir(folder_path):
            continue
        print(f"Checking folder: {folder_path}")

        # Sorted so that the row order of the output does not depend on the
        # file system's listing order.
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            if not (filename.endswith("-GI-Report.pdf") and os.path.isfile(file_path)):
                continue

            extracted_gi_qa = extract_QA_table_from_GI_report(file_path)
            # Check for failure BEFORE touching the frame; slicing None would
            # abort the whole traversal with an AttributeError.
            if extracted_gi_qa is None:
                print(f"GI QA table extraction unsuccessful for {folder_path}")
                continue

            # Copy so the headers are set on an independent frame, not on a
            # slice of the extracted one.
            extracted_gi_qa = extracted_gi_qa.iloc[rows_to_skip:].copy()
            extracted_gi_qa.columns = gi_qa_headers
            extracted_tables.append(extracted_gi_qa)

            # Progress report: running shape of the combined table.
            total_rows += len(extracted_gi_qa)
            print((total_rows, len(gi_qa_headers)))

    if not extracted_tables:
        print(f"No GI QA tables were extracted under {root_folder}; nothing written")
        return None

    # Concatenate once instead of growing a frame inside the loop, and write
    # the CSV once instead of rewriting it after every report.
    combined_gi_qa = pd.concat(extracted_tables, axis=0)
    output_path = os.path.join(output_folder, current_datetime + '_combined_GI_QA_table.csv')
    combined_gi_qa.to_csv(output_path, index=False)
    return combined_gi_qa

In [12]:
# Entry point: set the input and output locations, then build the combined
# GI QA CSV from every run folder.
if __name__ == "__main__":
    # Folder that holds the HRD_Run* run folders scanned by traverse_folders.
    root_folder = "./Resources/SOPHiA HRD"
    # Not passed as an argument: traverse_folders reads `output_folder` as a
    # module-level global, so it must be assigned before the call below.
    output_folder = "./Resources/"
    traverse_folders(root_folder)

Checking folder: ./Resources/SOPHiA HRD/HRD_Run1
Extracting GI QA table from ./Resources/SOPHiA HRD/HRD_Run1/HRD_202307070141-21752-0086-GI-Report.pdf
(23, 9)
Checking folder: ./Resources/SOPHiA HRD/HRD_Run2
Extracting GI QA table from ./Resources/SOPHiA HRD/HRD_Run2/HRD_202307141330-21752-0087-GI-Report.pdf
(47, 9)
Checking folder: ./Resources/SOPHiA HRD/HRD_Run3
Extracting GI QA table from ./Resources/SOPHiA HRD/HRD_Run3/HRD_202307211811-21752-0090-GI-Report.pdf
(71, 9)
Checking folder: ./Resources/SOPHiA HRD/HRD_Run4
Extracting GI QA table from ./Resources/SOPHiA HRD/HRD_Run4/HRD_202307211832-21752-0091-GI-Report.pdf
(95, 9)
Checking folder: ./Resources/SOPHiA HRD/HRD_Run5
Extracting GI QA table from ./Resources/SOPHiA HRD/HRD_Run5/HRD_202307262240-21752-0092-GI-Report.pdf
(118, 9)
Checking folder: ./Resources/SOPHiA HRD/HRD_Run6
Extracting GI QA table from ./Resources/SOPHiA HRD/HRD_Run6/HRD_202308022047-21752-0093-GI-Report.pdf
(142, 9)
