In [0]:
import os, sys
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import pandas as pd
from ipyparallel import Client
import numpy as np
import scipy as sp

In [0]:
cd /home/cfriedline/eckertlab/projects/ethan/HiSeq_140603/FASTQ

In [0]:
raw_fastq = !ls *.gz
raw_fastq = [os.path.abspath(x) for x in raw_fastq]

In [0]:
raw_fastq

In [0]:
# Connect to the ipyparallel cluster that was started under the "sge" profile.
rc = Client(profile="sge")

In [0]:
# Load-balanced view; the @lview.remote() functions in this notebook are
# dispatched through it.
lview = rc.load_balanced_view()

In [0]:
@lview.remote()
def get_num_seqs(f):
    """Count the records in a FASTQ file by counting its lines.

    The function is dispatched through ``lview``, so the module it needs is
    imported inside the body rather than relying on the notebook namespace.

    Parameters
    ----------
    f : str
        Path to the file. A name ending in ``gz`` is streamed through
        ``zcat``; anything else is streamed through ``cat``.

    Returns
    -------
    tuple
        ``(f, num)`` where ``num`` is the line count floor-divided by four
        (four lines per record is assumed), always an ``int``.

    Raises
    ------
    subprocess.CalledProcessError
        If the reader command or ``wc`` exits with a non-zero status.
    """
    import subprocess

    cmd = 'zcat' if f.endswith('gz') else 'cat'

    # Equivalent of `cmd f | wc -l`, built from argument lists so the path is
    # never parsed by a shell (spaces or metacharacters in `f` are harmless).
    reader = subprocess.Popen([cmd, f], stdout=subprocess.PIPE)
    counter = subprocess.Popen(['wc', '-l'], stdin=reader.stdout,
                               stdout=subprocess.PIPE)
    # Drop our copy of the pipe so the reader is signalled if wc exits early.
    reader.stdout.close()
    line_total = counter.communicate()[0]

    # Fail loudly instead of reporting a count taken from a broken stream.
    for proc, argv in ((reader, [cmd, f]), (counter, ['wc', '-l'])):
        status = proc.wait()
        if status != 0:
            raise subprocess.CalledProcessError(status, argv)

    # Floor division keeps the count an integer on Python 3 as well.
    num = int(line_total.split()[0]) // 4
    return f, num

In [0]:
raw_nums = [get_num_seqs(x) for x in raw_fastq]

In [0]:
[x.ready() for x in raw_nums]

In [0]:
[x.r for x in raw_nums]

In [0]:
cd ~/eckertlab/projects/ethan/analysis/samtools1.1

In [0]:
fastq = !ls | grep 'fastq$' | grep -v Foxtail
fastq = [os.path.abspath(x) for x in fastq]

In [0]:
fastq_nums = [get_num_seqs(x) for x in fastq]

In [0]:
sum([x.ready() for x in fastq_nums])

In [0]:
sample_data = []
for x in fastq_nums:
    sample_data.append({"ind": os.path.basename(x.r[0]), 
                       'num': int(x.r[1])})

In [0]:
sample_df = pd.DataFrame(sample_data)

In [0]:
sample_df['fam'] = sample_df.ind.apply(lambda x: x.split("_")[0])
sample_df = sample_df.drop(sample_df[sample_df.fam=='None'].index)

In [0]:
summary_data = []
for group, df in sample_df.groupby('fam'):
    summary_data.append({
          'group': group,
            'mean_num': np.mean(df.num),
            'std_num': np.std(df.num)
        })

In [0]:
summary_df = pd.DataFrame(summary_data)

In [0]:
summary_df.index = summary_df.group

In [0]:
summary_df.head()

In [0]:
@lview.remote()
def get_read_lens(f):
    """Collect the length and mean quality of every read in a FASTQ file.

    The function is dispatched through ``lview``, so its dependencies are
    imported inside the body.

    Parameters
    ----------
    f : str
        Path to an uncompressed FASTQ file (it is opened as plain text).

    Returns
    -------
    tuple
        ``(f, lens, quals)`` where ``lens`` is an array of sequence lengths
        and ``quals`` an array of per-read mean quality scores, both in file
        order.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    import numpy as np

    def convert_qual(q):
        # Quality character -> integer score, using an ASCII offset of 33.
        return ord(q) - 33

    lens = []
    quals = []
    # Plain "r": the "U" flag was removed in Python 3.11 (open() raises
    # ValueError), and text mode already translates newlines on Python 3.
    with open(f, "r") as h:
        for title, seq, qual in FastqGeneralIterator(h):
            avg_qual = np.mean([convert_qual(x) for x in qual])
            lens.append(len(seq))
            quals.append(avg_qual)
    return f, np.array(lens), np.array(quals)

In [0]:
read_summary = [get_read_lens(x) for x in fastq]

In [0]:
read_data = []
for x in read_summary:
    res = x.r
    read_data.append({
            'indv': os.path.basename(res[0]),
            'lens': res[1],
            'quals': res[2]
        })

In [0]:
read_df = pd.DataFrame(read_data)

In [0]:
read_df['fam'] = read_df.indv.apply(lambda x: x.split("_")[0])

In [0]:
read_df = read_df.drop(read_df[read_df.fam=='None'].index)

In [0]:
for group, df in read_df.groupby('fam'):
    lens = []
    quals = []
    for x in df.lens:
        lens.extend(x)
        
    for x in df.quals:
        quals.extend(x)
        
    summary_df.ix[group, 'mean_len'] = np.mean(lens)
    summary_df.ix[group, 'std_len'] = np.std(lens)
    summary_df.ix[group, 'mean_qual'] = np.mean(quals)
    summary_df.ix[group, 'std_qual'] = np.std(quals)
    print(group)

In [0]:
summary_df

In [0]:
java = "/home/cfriedline/g/src/jdk1.8.0_60/bin/java"
picard = "/home/cfriedline/g/src/picard-tools-2.0.1/picard.jar"

In [0]:
assembly = "/home/cfriedline/g/eassemblies/foxtail2/Green_26_ATCGCGCAA.fastq_31_data_31/contigs.fa_in_map.fa"
assert os.path.exists(assembly)

In [0]:
bams = !ls ../../analysis/*.bam
bams = [os.path.abspath(x) for x in bams]

In [0]:
bams[0]

In [0]:
@lview.remote()
def get_alignment_metrics(args):
    """Run Picard CollectAlignmentSummaryMetrics on one BAM file.

    The function is dispatched through ``lview``, so the module it needs is
    imported inside the body.

    Parameters
    ----------
    args : tuple
        ``(java, picard, assembly, bam)``: the java executable, the Picard
        jar, the file passed as ``R=`` and the BAM file passed as ``I=``.

    Returns
    -------
    str
        Path of the metrics file, ``<bam>.alignmentmetrics``.

    Raises
    ------
    subprocess.CalledProcessError
        If the Picard process exits with a non-zero status.
    """
    import subprocess

    java, picard, assembly, bam = args
    out = "%s.alignmentmetrics" % bam
    # An argument list keeps the paths out of shell parsing, and check_call
    # surfaces a failed run instead of returning a path to a missing or
    # partial metrics file.
    subprocess.check_call([
        java, "-jar", picard, "CollectAlignmentSummaryMetrics",
        "R=%s" % assembly,
        "I=%s" % bam,
        "O=%s" % out,
    ])
    return out

In [0]:
metrics = [get_alignment_metrics((java, picard, assembly, x)) for x in bams]

In [0]:
sum([x.ready() for x in metrics])

In [0]:
pd.read_csv(metrics[0].r, sep="\t", comment="#").columns

In [0]:
# Family label -> list of PCT_PF_READS_ALIGNED values, one per metrics file
# (taken from the first row of each table).
percent_aligned = {}
for task in metrics:
    metrics_path = task.r
    fam = os.path.basename(metrics_path).split("_")[0]
    fam_values = percent_aligned.setdefault(fam, [])
    metrics_table = pd.read_csv(metrics_path, sep="\t", comment="#")
    fam_values.append(metrics_table['PCT_PF_READS_ALIGNED'].values[0])

In [0]:
aligned_df = pd.DataFrame([(k, np.mean(v), np.std(v)) for k,v in percent_aligned.items()],
            columns = ["fam", "mean_aln", "std_aln"])

In [0]:
aligned_df.index = aligned_df.fam

In [0]:
aligned_df = aligned_df.drop("None")

In [0]:
df2 = summary_df.join(aligned_df)

In [0]:
df2