# 1. Parameters

In [1]:
import io
import subprocess
from pathlib import Path

import pandas as pd

# Dataset name -> path of the TSV file that lists that dataset's input files.
dataset_input_reads = {
    name: Path(tsv_path)
    for name, tsv_path in (
        ('0810PADBR-1', 'cases/full/0810PADBR-1_input-files-case.tsv'),
        ('1405WAEXK-1', 'cases/full/1405WAEXK-1_input-files-case.tsv'),
        ('1408MLGX6-3WGS', 'cases/full/1408MLGX6-3WGS_input-files-case.tsv'),
        ('1203NYJAP-1 - Tuna Scrape Outbreak', 'cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv'),
    )
}
dataset_input_reads

{'0810PADBR-1': PosixPath('cases/full/0810PADBR-1_input-files-case.tsv'),
 '1405WAEXK-1': PosixPath('cases/full/1405WAEXK-1_input-files-case.tsv'),
 '1408MLGX6-3WGS': PosixPath('cases/full/1408MLGX6-3WGS_input-files-case.tsv'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv')}

# 2. Read dataset input files

In [2]:
# Load every dataset's tab-separated input-file table, indexed by its 'Sample' column.
dataset_input_dfs = {
    name: pd.read_csv(tsv_path, sep='\t').set_index('Sample')
    for name, tsv_path in dataset_input_reads.items()
}

# Show one of the loaded tables as a sanity check.
dataset_input_dfs['1405WAEXK-1']

Unnamed: 0_level_0,Assemblies,Reads1,Reads2
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014C-3598,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3599,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3600,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3656,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3655,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3840,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3857,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3907,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...
2014C-3850,,/home/CSCScience.ca/apetkau/workspace/genomics...,/home/CSCScience.ca/apetkau/workspace/genomics...


# 3. Get fastq stats

In [3]:
def read_dataset_stats(dataset: str, threads: int = 32) -> pd.DataFrame:
    """Summarise the read files of one dataset with ``seqkit stats``.

    Looks up the ``Reads1``/``Reads2`` file paths for ``dataset`` in the
    module-level ``dataset_input_dfs`` table, runs ``seqkit stats`` on all of
    them, and sums the per-file ``num_seqs`` and ``sum_len`` columns per sample.

    Args:
        dataset: Key into ``dataset_input_dfs`` naming the dataset to process.
        threads: Value passed to ``seqkit stats --threads`` (defaults to the
            previously hard-coded 32).

    Returns:
        A DataFrame indexed by ``Sample`` with the columns ``Dataset``,
        ``Number reads`` and ``Number nucleotides``.

    Raises:
        KeyError: If ``dataset`` is not present in ``dataset_input_dfs``.
        ValueError: If the dataset lists no read files at all.
        subprocess.CalledProcessError: If ``seqkit`` exits with a non-zero status.
    """
    input_df = dataset_input_dfs[dataset]

    # Skip missing entries (NaN) so a sample without a second read file does
    # not break command construction.
    files = [
        str(path)
        for column in ('Reads1', 'Reads2')
        for path in input_df[column].dropna()
    ]
    if not files:
        # Without file arguments seqkit would wait on stdin instead of failing.
        raise ValueError(f'No read files listed for dataset {dataset!r}')

    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters are handed to seqkit unchanged, and capture stdout
    # separately so messages on stderr cannot end up in the parsed table.
    command = [
        'seqkit', 'stats',
        '--threads', str(threads),
        '--basename',
        '--tabular',
        *files,
    ]
    completed = subprocess.run(command, check=True, capture_output=True, text=True)

    dataset_df = pd.read_csv(io.StringIO(completed.stdout), sep='\t')
    dataset_df['Dataset'] = dataset
    # Dots are escaped so only a literal '_1.fastq.gz' / '_2.fastq.gz' suffix is
    # stripped; an unescaped '.' would match any character.
    dataset_df['Sample'] = dataset_df['file'].str.replace(r'_[12]\.fastq\.gz$', '', regex=True)

    # Combine the forward and reverse files of each sample into one row.
    grouped_df = (
        dataset_df
        .groupby('Sample')
        .agg({'Dataset': 'first', 'num_seqs': 'sum', 'sum_len': 'sum'})
        .rename({'num_seqs': 'Number reads', 'sum_len': 'Number nucleotides'}, axis='columns')
    )
    return grouped_df

# Collect the per-sample read statistics of every dataset, then stack them into one table.
dataset_dfs = [read_dataset_stats(name) for name in dataset_input_dfs]

read_stats_df = pd.concat(dataset_dfs)
read_stats_df

Unnamed: 0_level_0,Dataset,Number reads,Number nucleotides
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014D-0067,0810PADBR-1,4401384,440138400
2014D-0068,0810PADBR-1,4535468,453546800
2014D-0070,0810PADBR-1,4508506,450850600
2014D-0189,0810PADBR-1,3627810,362781000
D5663,0810PADBR-1,8841554,884155400
...,...,...,...
CFSAN000970,1203NYJAP-1 - Tuna Scrape Outbreak,2149054,324507154
CFSAN001112,1203NYJAP-1 - Tuna Scrape Outbreak,1510070,223192700
CFSAN001115,1203NYJAP-1 - Tuna Scrape Outbreak,1589670,234114048
CFSAN001118,1203NYJAP-1 - Tuna Scrape Outbreak,1055848,156000507


# 4. Save fastq stats

In [4]:
# Write the combined per-sample read statistics as a tab-separated file.
read_stats_df.to_csv(path_or_buf='cases/full/reads-stats.tsv', sep='\t')