In [20]:
# JSON file describing the runs, the sample groups and the samples.
INFO_FILE = "info.json"
# Root data directory; each run's "path" entry from the info file is appended
# to it. NOTE: DATA_PATH, the run "path" and PATH_TO_FASTQ are concatenated
# as-is (no separator is inserted), so include the separators where needed.
DATA_PATH = "DATA_DIR_TO_SPECIFY"
# Location of the fastq folder, common to every run folder.
PATH_TO_FASTQ = "PATH_TO_FASTQ_FOLDER"
# Directory receiving the exported CSV tables.
EXPORT_PATH = "DIR_TO_SAVE_OUTPUT"

In [2]:
import json, glob, zipfile,os
from sequana import fastqc
from IPython.display import HTML
import pandas as pd

In [3]:
# Load the experiment description (it provides the 'runs', 'groups' and
# 'samples' sections used below). The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open.
with open(INFO_FILE) as info_handle:
    data = json.load(info_handle)

# Flat list of sample names: groups are visited in the fixed order A..E and
# the names are sorted within each group.
sample_names = [
    sample_name
    for group in ['A', 'B', 'C', 'D', 'E']
    for sample_name in sorted(data['groups'][group])
]


# Read all data from all FastQC reports

In [21]:
# Build one FastQC container per run, filled with the R1 report of every
# entry found in the run's fastq folder.
all_results = {}
for run_name, run_info in data['runs'].items():
    print("reading {}".format(run_name))
    # NOTE: the three parts are concatenated as-is; no separator is inserted.
    path = DATA_PATH + run_info["path"] + PATH_TO_FASTQ
    run_qc = fastqc.FastQC()
    for sample_dir in glob.glob(path+"/*"):
        # The entry name is used both as sample name and as archive prefix.
        sample = os.path.basename(sample_dir)
        run_qc.read_sample(sample_dir + "/{}_R1_001_fastqc.zip".format(sample), sample)
    all_results[run_name] = run_qc

reading arbo_double_is
reading arbo_double_ms
reading arbo_double_10plex_ms
reading arbo_single_is
reading arbo_single_ms
reading illumina_resv_is
reading illumina_resv_ms
reading illumina_resv_ns
reading meta_is
reading meta_ns
reading twist_cov_is
reading twist_cov_ms
reading twist_cov_ns
reading twist_panviral_is
reading twist_panviral_ns
reading twist_cov_10plex_ms
reading twist_cov_LB_ms


# Extract the relevant data and store it in a single dataframe

In [23]:
# Flatten the per-run FastQC statistics into eight parallel lists, one entry
# per retained sample.
X, Y, MQ, GC, TS, DUP = [], [], [], [], [], []
groups, sequencers = [], []

# Fields read from each sample's 'basic_statistics', paired with their target list.
stat_targets = (
    ('mean_quality', MQ),
    ('%GC', GC),
    ('Total Sequences', TS),
    ('total_deduplicated_percentage', DUP),
)

for name, run_qc in all_results.items():
    # The sequencer code is the last "_"-separated token of the run name.
    sequencer = name.split("_")[-1]
    for sample in sorted(run_qc.fastqc_data):
        # Skip entries named Controle*/Cneg* (presumably control samples).
        if sample.startswith(("Controle", "Cneg")):
            continue
        X.append(name)

        stats = run_qc.fastqc_data[sample]['basic_statistics']
        # Read all four values first, then append, so the lists stay aligned.
        values = [stats[field] for field, _target in stat_targets]
        for (_field, target), stat_value in zip(stat_targets, values):
            target.append(stat_value)

        # Short name: text before the first "_", then before the first "-".
        shortname = sample.split("_")[0].split("-")[0]
        Y.append(shortname)
        groups.append(data["samples"][shortname])
        sequencers.append(sequencer)


In [24]:
# One row per retained sample; the mapping order is the column order.
qc_columns = {
    "run": X,
    "sample": Y,
    "mean_quality": MQ,
    "gc": GC,
    "duplication": DUP,
    "nreads": TS,
    "group": groups,
    "sequencer": sequencers,
}
df = pd.DataFrame(qc_columns)

# Data extraction, manipulation

In [26]:
# One run x sample table per metric. Passing `values=` restricts the
# aggregation to that single column: the non-numeric "group" and "sequencer"
# columns are never averaged (recent pandas releases refuse to do so), and the
# full pivot is no longer rebuilt four times just to keep one slice of it.
# Duplicate (run, sample) pairs are averaged (pivot_table's default aggfunc).
dup = df.pivot_table(index="run", columns="sample", values="duplication")
gc = df.pivot_table(index="run", columns="sample", values="gc")
# nreads is deliberately not cast to int: run/sample pairs without data stay NaN.
nreads = df.pivot_table(index="run", columns="sample", values="nreads")
qual = df.pivot_table(index="run", columns="sample", values="mean_quality")


In [27]:
# Mean and standard deviation of every metric for each sample (column),
# computed across the runs (rows); missing values are skipped.
mu_qual, std_qual = qual.mean(skipna=True), qual.std(skipna=True)
mu_gc, std_gc = gc.mean(skipna=True), gc.std(skipna=True)
mu_nreads, std_nreads = nreads.mean(skipna=True), nreads.std(skipna=True)
mu_dup, std_dup = dup.mean(skipna=True), dup.std(skipna=True)


In [28]:
# Total number of reads per run, expressed in millions, followed by the
# descriptive statistics of these per-run totals for each sequencer.
reads_per_run = df.groupby(["run", "sequencer"])['nreads'].sum()
reads_per_run_millions = reads_per_run / 1000000
reads_per_run_millions.groupby('sequencer').describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sequencer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
is,6.0,4.618666,1.610151,1.343326,5.092921,5.26069,5.350345,5.430061
ms,7.0,19.884834,1.68456,17.955639,18.767226,20.001158,20.308606,23.085373
ns,4.0,357.885203,197.043626,68.620604,331.728133,425.938559,452.095629,511.04309


In [30]:
# Put the per-sample statistics side by side; `keys` names the resulting columns.
summary = pd.concat(
    [mu_qual, std_qual, mu_gc, std_gc, mu_nreads, std_nreads, mu_dup, std_dup],
    axis=1,
    keys=[
        "quality_mean", "quality_std",
        "gc_mean", "gc_std",
        "nreads_mean", "nreads_std",
        "duplicated_mean", "duplicated_std",
    ],
)
# Keep the samples listed in sample_names, in that order (labels are strings).
ordered_samples = list(map(str, sample_names))
summary = summary.loc[ordered_samples]
summary

Unnamed: 0_level_0,quality_mean,quality_std,gc_mean,gc_std,nreads_mean,nreads_std,duplicated_mean,duplicated_std
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4520,35.515694,1.460262,43.466667,4.983783,6748163.0,12768250.0,43.118753,26.791542
4660,35.703654,1.585493,44.4,7.623273,5040167.0,9568181.0,33.200421,26.278941
4716,35.761965,1.579649,45.066667,5.049281,4032054.0,7902824.0,49.879232,28.025383
4885,35.724317,1.530337,44.8,5.212622,4518900.0,9335046.0,54.246544,28.366015
4653,35.869663,1.551246,44.470588,7.78715,7066043.0,13905340.0,26.711989,23.163111
4676,35.935952,1.523026,44.647059,5.883676,4354055.0,7399865.0,40.390964,32.012927
4697,35.851697,1.501272,49.529412,3.104788,3355449.0,7514463.0,51.226814,21.708977
4707,35.871162,1.514475,45.0,4.677072,4709817.0,10169580.0,52.696021,29.715973
4673,35.7259,1.487867,50.0,4.046604,2978015.0,6334108.0,44.168179,22.246067
4688,35.864936,1.465228,44.352941,3.999081,8180273.0,17053920.0,49.439638,28.293325


In [None]:
# Write every table as CSV inside EXPORT_PATH. os.path.join supplies the
# separator, so EXPORT_PATH works with or without a trailing slash; plain
# string concatenation produced names such as
# "DIR_TO_SAVE_OUTPUTTable_QC_dup.csv" beside the intended folder.
tables_to_export = {
    "Table_QC_dup.csv": dup,
    "Table_QC_gc.csv": gc,
    "Table_QC_mean_qual.csv": qual,
    "Table_QC_nreads.csv": nreads,
    "Table_summary.csv": summary,
}
for csv_name, table in tables_to_export.items():
    table.to_csv(os.path.join(EXPORT_PATH, csv_name))