In [0]:
import Bio, os, sys, shutil
from Bio import SeqIO, SearchIO
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
%matplotlib inline
from IPython.parallel import Client
import numpy as np
import scipy as sp
import dill as pickle
from Bio.Blast import NCBIXML
import pandas as pd
from pprint import pprint
from lxml import etree
from __future__ import division
from IPython.display import FileLinks, FileLink
import rpy2.robjects as robjects
import pandas.rpy.common as com
from scipy.stats import gaussian_kde
from Bio import Entrez
import requests, cStringIO
import dill
import types
import sqlite3
import dill
import shutil

In [0]:
conn = sqlite3.connect("/home/cfriedline/gpfs/projects/black_spruce/black_spruce.sqlite")

In [0]:
r = robjects.r
Entrez.email = "cfriedline@vcu.edu"

In [0]:
home = "/gpfs_fs/home/cfriedline/projects/black_spruce"

In [0]:
cd $home

In [0]:
# Normalise chromatogram extensions to lower-case ".scf" and tally the files
# and the directories that contain them, walking the current directory.
scf_dirs = set()
scf_count = 0
for root, dirs, files in os.walk("."):
    for f in files:
        if f.endswith(".SCF"):
            # Rename only the trailing extension. Calling str.replace on the
            # joined path would also rewrite ".SCF" inside a directory name
            # or file stem and point the move at a path that does not exist.
            shutil.move(os.path.join(root, f),
                        os.path.join(root, f[:-len(".SCF")] + ".scf"))
        elif not f.endswith(".scf"):
            continue
        # os.path.dirname(os.path.join(root, f)) is root itself.
        scf_dirs.add(root)
        scf_count += 1
print("found %d scf files in %d dirs" % (scf_count, len(scf_dirs)))

    found 7232 scf files in 4 dirs

In [0]:
scf_dirs
seq_dirs = [os.path.abspath("%s_seq" % x) for x in scf_dirs]
seq_dirs

In [0]:
# Register one row per chromatogram directory in the `sample` table.
for d in scf_dirs:
    # The sample id is the last three characters of the directory path; its
    # final character is stored as the tissue code.
    sample_id = d[-3:]
    # IPython shell capture: `files` holds one entry per line printed by ls.
    files = !ls {d}/*.scf
    print sample_id, len(files)
    sql = 'insert into sample (sample_id, tissue, raw_reads) values (?,?,?)'
    conn.execute(sql, [sample_id, sample_id[-1], len(files)])
# NOTE(review): plain INSERT - re-running this cell submits the same samples
# again; whether that duplicates rows or fails depends on the table schema.
conn.commit()

In [0]:
rc = Client()
dview = rc[:]
lview = rc.load_balanced_view()

In [0]:
print len(dview)

In [0]:
@lview.remote()
def run_phred(d, phred_cutoff):
    """Run phred for one output directory through the load-balanced view.

    d is passed to phred as both the -sd and -qd output directory; the -id
    input directory is d with every "_seq" substring removed. phred_cutoff is
    passed to -trim_cutoff. The body returns whatever the IPython `!` shell
    capture produced.
    """
    # Imported inside the body rather than relying on the notebook namespace.
    import os
    # NOTE(review): this is a /Users/... path while the notebook's data paths
    # live under /gpfs_fs and /home - confirm it exists where this executes.
    os.environ['PHRED_PARAMETER_FILE'] = '/Users/chris/src/phred-dist-020425.c-acd/phredpar.dat'
    r = !~/src/phred-dist-020425.c-acd/phred -id {d.replace("_seq", "")} -sd {d} -qd {d} -trim_fasta -trim_alt "" -trim_cutoff {phred_cutoff}
    return r

In [0]:
# Submit one phred job per sequence directory, creating the directory first
# when it does not exist yet; the submitted results accumulate in phred_res.
phred_res = []
phred_cutoff = 0.01
for d in seq_dirs:
    already_there = os.path.exists(d)
    if not already_there:
        os.mkdir(d)
    job = run_phred(d, phred_cutoff)
    phred_res.append(job)

In [0]:
# Load the single-record FASTA file found for each ".seq" entry, grouped by
# sequence directory.
reads = {}
for seq_dir in seq_dirs:
    # Test for the literal ".seq" suffix. A shell `grep .seq` treats "." as a
    # wildcard and also accepts names that merely contain "seq" somewhere.
    seq_files = sorted(os.path.join(seq_dir, x)
                       for x in os.listdir(seq_dir) if x.endswith(".seq"))
    # SeqIO.read requires exactly one record per file.
    reads[seq_dir] = [SeqIO.read(seq_file, "fasta") for seq_file in seq_files]
print(reads.keys())

In [0]:
def get_sample_id(basename):
    """Return the three-character sample id taken from a path's final component.

    The final path component is split on "_". When that gives exactly three
    pieces the id is the tail of the second piece; otherwise it is the tail of
    the first piece.
    """
    pieces = os.path.basename(basename).split("_")
    token = pieces[1] if len(pieces) == 3 else pieces[0]
    return token[-3:]

# Keep reads of at least 100 bases per directory, remember how many reads each
# directory had in total, and store both counts on the matching sample row.
good_reads = {}
total_reads = {}
for k, v in reads.items():
    good_reads[k] = [read for read in v if len(read) >= 100]
    total_reads[k] = len(v)
print("good (total) reads in:")
for k, v in good_reads.items():
    sample_id = get_sample_id(k)
    n_good = len(v)
    print("%s: %d (%d)" % (os.path.basename(k), n_good, total_reads[k]))
    conn.execute('update sample set phred_reads=? where sample_id=?',
                 [total_reads[k], sample_id])
    sql = 'update sample set length_reads=? where sample_id=?'
    conn.execute(sql, [n_good, sample_id])
conn.commit()

    good (total) reads in:
    im_bscp32N_seq: 1479 (1628)
    im_bscp32C_seq: 1551 (1750)
    BSCP40C_seq: 1693 (1926)
    BSCP40N_seq: 1273 (1849)

In [0]:
def get_summary(lens):
    """Summarise a collection of lengths.

    Returns a (text, stats) pair where stats is
    (count, mean, standard deviation, minimum, maximum) and text is the same
    five values rendered into a one-line description.
    """
    stats = (len(lens), np.mean(lens), np.std(lens), np.min(lens), np.max(lens))
    template = "%d reads, mean(len) = %.2f, sd=%.2f, [%d, %d]"
    return template % stats, stats

In [0]:
def get_length_dict(read_dict):
    """Measure every item of every value in read_dict.

    Returns (lens, len_dict): lens is the flat list of all lengths in
    iteration order, len_dict maps each key to the list of lengths of its own
    items.
    """
    len_dict = {}
    lens = []
    for name, records in read_dict.items():
        sizes = [len(rec) for rec in records]
        len_dict[name] = sizes
        lens.extend(sizes)
    return lens, len_dict

In [0]:
def read_hist(read_dict):
    """Plot a histogram of all read lengths and print per-key summaries.

    The figure title and the first printed line are the summary over every
    read; one further line is printed per key, labelled with the key's final
    path component.
    """
    all_lens, per_key = get_length_dict(read_dict)
    plt.hist(all_lens)
    plt.xlabel("read length")
    plt.ylabel("count")
    title = get_summary(all_lens)[0]
    plt.title(title)
    plt.show()
    print(title)
    for path, sizes in per_key.items():
        print("%s %s" % (os.path.basename(path), get_summary(sizes)[0]))

In [0]:
def read_hist_ind(read_list):
    """Plot and summarise one list of reads by wrapping it under the key "dummy"."""
    read_hist({"dummy": read_list})

In [0]:
read_hist(reads)

In [0]:
# Print length statistics per sample for every read, then the overall count.
lens, len_dict = get_length_dict(reads)
get_summary(lens)  # result is not used or displayed here
s = 0
for k, v in len_dict.items():
    sample_id = get_sample_id(k)
    stats = get_summary(v)[1]
    total, mean, sd, mmin, mmax = stats
    print("%s %s %s %s %s %s" % ((sample_id,) + stats))
    s = s + total
print(s)
conn.commit()

    7153 reads, mean(len) = 470.17, sd=253.69, [0, 903]
    7153 reads, mean(len) = 470.17, sd=253.69, [0, 903]
    im_bscp32N_seq 1628 reads, mean(len) = 524.10, sd=229.25, [0, 861]
    im_bscp32C_seq 1750 reads, mean(len) = 493.11, sd=227.90, [0, 903]
    BSCP40C_seq 1926 reads, mean(len) = 549.79, sd=245.00, [0, 901]
    BSCP40N_seq 1849 reads, mean(len) = 318.04, sd=241.26, [0, 815]

In [0]:
read_hist(good_reads)

    5996 reads, mean(len) = 556.08, sd=175.99, [100, 903]
    im_bscp32N_seq 1479 reads, mean(len) = 573.59, sd=176.00, [104, 861]
    im_bscp32C_seq 1551 reads, mean(len) = 551.90, sd=167.61, [100, 903]
    BSCP40C_seq 1693 reads, mean(len) = 621.89, sd=158.71, [102, 901]
    BSCP40N_seq 1273 reads, mean(len) = 453.29, sd=159.60, [100, 815]

In [0]:
# Print length statistics per sample for the length-filtered reads and store
# them in sample_stats under the label "length_reads".
lens, len_dict = get_length_dict(good_reads)
get_summary(lens)  # result is not used or displayed here
sql = 'insert into sample_stats values (?,?,?,?,?,?)'
s = 0
for k, v in len_dict.items():
    sample_id = get_sample_id(k)
    stats = get_summary(v)[1]
    total, mean, sd, mmin, mmax = stats
    print("%s %s %s %s %s %s" % ((sample_id,) + stats))
    conn.execute(sql, [sample_id, "length_reads", mean, sd, mmin, mmax])
    s = s + total
print(s)
conn.commit()

```
40N 1273 453.285938727 159.598494457 100 815
40C 1693 621.893089191 158.71016883 102 901
32C 1551 551.904577692 167.60567435 100 903
32N 1479 573.585530764 176.002648672 104 861
5996
```

In [0]:
# Short sample labels keyed by sequence-directory name.
names = {'im_bscp32N_seq':'P32N',
         'im_bscp32C_seq':'P32C',
         'BSCP40N_seq':'P40N',
         'BSCP40C_seq':'P40C'}
# Write one FASTA file per sample. The loop variable is deliberately not
# called `reads`: that name holds the notebook-level dict of all reads, and
# rebinding it to a list here would break any cell using the dict afterwards.
for k, sample_reads in good_reads.items():
    key = os.path.basename(k)
    with open("%s.fa" % names[key], "w") as out:
        SeqIO.write(sample_reads, out, "fasta")

In [0]:
good_files = !grep -c ">" *.fa | grep -v primer
good_files = [os.path.abspath(x.split(':')[0]) for x in good_files]
print good_files

```python
['/gpfs_fs/home/cfriedline/projects/black_spruce/P32C.fa', 
'/gpfs_fs/home/cfriedline/projects/black_spruce/P32N.fa', 
'/gpfs_fs/home/cfriedline/projects/black_spruce/P40C.fa', 
'/gpfs_fs/home/cfriedline/projects/black_spruce/P40N.fa']
```

In [0]:
class Qual:
    """One quality record: a header line plus the value lines that follow it."""

    def __init__(self, name):
        # name is the header text without the leading ">"; the record name is
        # its first whitespace-delimited token.
        first_token = name.split()[0]
        self.name = first_token
        self.description = name
        self.vals = []

    def __str__(self):
        # Header line, then all stored value lines joined by single spaces.
        header = ">%s" % self.name
        return "\n".join([header, ' '.join(self.vals)])

    def add_vals(self, line):
        """Store one line of quality values."""
        self.vals.append(line)

In [0]:
qual_dirs = seq_dirs

In [0]:
qual_dirs

```python
['/gpfs_fs/home/cfriedline/projects/black_spruce/im_bscp32C_seq',
 '/gpfs_fs/home/cfriedline/projects/black_spruce/BSCP40C_seq',
 '/gpfs_fs/home/cfriedline/projects/black_spruce/BSCP40N_seq',
 '/gpfs_fs/home/cfriedline/projects/black_spruce/im_bscp32N_seq']
 ```

In [0]:
# Names of the length-filtered reads. A generator expression is used so no
# loop variable leaks into the notebook namespace; a plain `for r in ...`
# would rebind the global `r`, which holds the rpy2 R interface.
good_read_names = set(rec.name for recs in good_reads.values() for rec in recs)
# NOTE(review): good_read_names is not consulted below, so every read that has
# quality values is written - confirm whether the .qual output was meant to be
# restricted to the length-filtered reads.
for qual_dir in qual_dirs:
    print(qual_dir)
    # Literal ".qual" suffix test (a shell `grep .qual` treats "." as a wildcard).
    quals = sorted(os.path.join(qual_dir, x)
                   for x in os.listdir(qual_dir) if x.endswith(".qual"))
    q = None
    qual_list = []
    for qual in quals:
        with open(qual) as handle:
            for line in handle:
                line = line.strip()
                if line.startswith(">"):
                    q = Qual(line[1:])
                    qual_list.append(q)
                elif line:
                    # Blank lines are skipped so they do not add empty entries.
                    q.add_vals(line)
    key = os.path.basename(qual_dir)
    with open("%s.qual" % names[key], "w") as out:
        for q in qual_list:
            if len(q.vals) > 0:
                out.write("%s\n" % str(q))

In [0]:
good_files

##Must run seqclean on linux (cdbfasta does not run on Mac).

1. Shutdown running mac notebook
1. start linux vm
1. relaunch notebook from shared folder

For example:

    chris@vm:~/projects/black_spruce/seqclean$ ~/src/seqclean-x86_64/seqclean P40C.fa -v ~/src/UniVec/UniVec -s ~/projects/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna

In [0]:
cd ~/gpfs/projects/black_spruce/seqclean/

In [0]:
seq_clean_files = !ls *.clean | grep -v 'all'

In [0]:
seq_clean_files = [os.path.abspath(x) for x in seq_clean_files]

In [0]:
seq_clean_files

In [0]:
# Parse every seqclean output file into a list of records keyed by file path.
seq_clean_reads = {}
for f in seq_clean_files:
    records = seq_clean_reads[f] = []
    for read in SeqIO.parse(f, "fasta"):
        records.append(read)

In [0]:
read_hist(seq_clean_reads)


    5630 reads, mean(len) = 445.96, sd=144.39, [100, 842]
    P32N.fa.clean 1429 reads, mean(len) = 470.81, sd=159.23, [101, 842]
    P32C.fa.clean 1306 reads, mean(len) = 426.42, sd=135.40, [100, 795]
    P40C.fa.clean 1652 reads, mean(len) = 473.19, sd=133.18, [102, 825]
    P40N.fa.clean 1243 reads, mean(len) = 401.73, sd=135.62, [101, 709]

In [0]:
# Store per-sample length statistics of the seqclean output in sample_stats
# under the label "seqclean_reads", then print the overall read count.
lens, len_dict = get_length_dict(seq_clean_reads)
get_summary(lens)  # result is not used or displayed here
sql = 'insert into sample_stats values (?,?,?,?,?,?)'
s = 0
for k, v in len_dict.items():
    # Characters 1-3 of the file name, e.g. "P32N.fa.clean" -> "32N".
    sample_id = os.path.basename(k)[1:4]
    total, mean, sd, mmin, mmax = get_summary(v)[1]
    conn.execute(sql, [sample_id, "seqclean_reads", mean, sd, mmin, mmax])
    s = s + total
print(s)
conn.commit()

In [0]:
seqclean_count = !find . -type f | grep '.clean$' | grep -v 'all' | xargs grep -c ">"

In [0]:
# Each entry is "path:count" as printed by `grep -c`; store the count on the
# sample whose id is characters 1-3 of the file name.
for s in seqclean_count:
    data = s.split(":")
    sample_id = os.path.basename(data[0])[1:4]
    n_reads = int(data[1])
    sql = 'update sample set seqclean_reads=? where sample_id=?'
    conn.execute(sql, [n_reads, sample_id])
    print("%s %d" % (sample_id, n_reads))
conn.commit()

    ./seqclean/P32C.fa.clean:1306
    ./seqclean/P32N.fa.clean:1429
    ./seqclean/P40C.fa.clean:1652
    ./seqclean/P40N.fa.clean:1243

##Combine needle and cambium

In [0]:
cd seqclean/

In [0]:
cat P32C.fa.clean P40C.fa.clean > cambium.fa.clean

In [0]:
cat P32N.fa.clean P40N.fa.clean > needle.fa.clean

##Run iAssembler

Again, have to use linux b/c not supported on Mac.  `*.clean` files moved to `seqclean`
 directory for processing
 
 Also, edited `iAssembler.pl` and pipeline files in `bin` to use `#!/usr/bin/env perl` instead of `/usr/bin/perl -w`

For example:

    (conda)chris@vm:~/projects/black_spruce/seqclean$ ~/src/iAssembler-v1.3.2.x64/iAssembler.pl -i P32C.fa.clean 

##Rename unigene files according to source sample

In [0]:
cd ~/gpfs/projects/black_spruce/seqclean

In [0]:
pwd

In [0]:
output_dirs = !ls | grep _output | grep -v 'all'

In [0]:
output_dirs = [os.path.abspath(x) for x in output_dirs]
output_dirs

In [0]:
# For every assembler output directory, copy each file whose name does not yet
# contain the sample key to "<key>_<name>" and remember the copies made.
assembled_files = {}
for o in output_dirs:
    assembled_files[o] = []
    key = os.path.basename(o).split(".")[0]
    print(key)
    for f in os.listdir(o):
        print(f)
        if key in f:
            continue
        f_name = "%s_%s" % (key, f)
        print(f_name)
        target = os.path.join(o, f_name)
        shutil.copy(os.path.join(o, f), target)
        assembled_files[o].append(target)

In [0]:
# Collect FASTA files below each output directory and print their record counts.
assembled_fasta = [] 
for o in output_dirs:
    # Keeps paths ending in "fasta" and drops the unprefixed unigene_seq.fasta
    # directly under `o` as well as any path containing "decorated" or "orfs".
    # The "." in these grep patterns is a regex wildcard, not a literal dot.
    fasta_files = !find $o | grep '.fasta$' | grep -v '^{o}/unigene_seq.fasta' | grep -v 'decorated' | grep -v 'orfs'
    for f in fasta_files:
        # grep -c ">" counts header lines, i.e. the number of FASTA records.
        res = !grep -c ">" $f
        print os.path.basename(os.path.dirname(f)), os.path.basename(f), res[0]
        assembled_fasta.append(f)

    P32C.fa.clean_output P32C_unigene_seq.fasta 328
    P32N.fa.clean_output P32N_unigene_seq.fasta 730
    P40C.fa.clean_output P40C_unigene_seq.fasta 434
    P40N.fa.clean_output P40N_unigene_seq.fasta 223
    all_ests.fa.clean_output unigene_seq.fasta 1945

In [0]:
# Parse every assembled FASTA file into a list of records keyed by file path.
unigene_reads = {}
for f in assembled_fasta:
    records = unigene_reads[f] = []
    for read in SeqIO.parse(f, "fasta"):
        records.append(read)
unigene_reads.keys()

In [0]:
for uni in unigene_reads:
    read_hist_ind(unigene_reads[uni])
    print os.path.abspath(uni)

    223 reads, mean(len) = 480.71, sd=230.99, [104, 1918]
    P40N_unigene_seq.fasta
    
    730 reads, mean(len) = 516.13, sd=186.97, [104, 1377]
    P32N_unigene_seq.fasta
    
    328 reads, mean(len) = 535.98, sd=202.81, [107, 1343]
    P32C_unigene_seq.fasta
    
    1945 reads, mean(len) = 553.77, sd=191.78, [100, 2077]
    unigene_seq.fasta
    
    434 reads, mean(len) = 531.24, sd=187.82, [102, 1748]
    P40C_unigene_seq.fasta

##Decorate fasta files with sample (for blast2go)

In [0]:
unigene_files = ["/home/cfriedline/gpfs/projects/black_spruce/seqclean/P32N.fa.clean_output/P32N_unigene_seq.fasta",
"/home/cfriedline/gpfs/projects/black_spruce/seqclean/P32C.fa.clean_output/P32C_unigene_seq.fasta",
"/home/cfriedline/gpfs/projects/black_spruce/seqclean/P40C.fa.clean_output/P40C_unigene_seq.fasta",
"/home/cfriedline/gpfs/projects/black_spruce/seqclean/P40N.fa.clean_output/P40N_unigene_seq.fasta",
"/home/cfriedline/gpfs/projects/black_spruce/seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta"]
# Prefix every record id with the sample key (the file-name part before the
# first "_"), blank the description, and write "<input>_decorated.fasta".
for u in unigene_files:
    print(u)
    key = os.path.basename(u).split("_")[0]
    recs = []
    for rec in SeqIO.parse(u, "fasta"):
        rec.id = "%s_%s" % (key, rec.id)
        rec.description = ""
        recs.append(rec)
    out_file = "%s_decorated.fasta" % u
    print(out_file)
    # Context manager so the output is flushed and closed deterministically.
    with open(out_file, "w") as out:
        SeqIO.write(recs, out, "fasta")

##Blast hits (iPlant)

    cfriedline@vm64-60:~/projects/black_spruce$ ~/src/ncbi-blast-2.2.29+/bin/blastx -db ~/nr/nr -max_target_seqs 10 -outfmt 5 -num_threads 8 -evalue 1e-5 -query P32C.fa.clean_output/P32C_unigene_seq.fasta -out P32C_blast.xml

##Download blast files from iPlant atmosphere

In [0]:
cd ~/gpfs/projects/black_spruce/

In [0]:
!scp atmo:/home/cfriedline/projects/black_spruce/*blast.xml .

In [0]:
blast_files = !ls *_blast.xml | grep -v 'all'

In [0]:
blast_files

##Process blast

In [0]:
blast_files = [os.path.abspath(x) for x in blast_files]

In [0]:
blast_files

```python
['/gpfs_fs/home/cfriedline/projects/black_spruce/P32C_blast.xml',
 '/gpfs_fs/home/cfriedline/projects/black_spruce/P32N_blast.xml',
 '/gpfs_fs/home/cfriedline/projects/black_spruce/P40C_blast.xml',
 '/gpfs_fs/home/cfriedline/projects/black_spruce/P40N_blast.xml']
```

In [0]:
def plot_hist(data, title):
    """Draw a histogram of data with the given title and show it."""
    plt.hist(data)
    plt.title(title)
    plt.show()

##Get top blast hit

In [0]:
# For each blast file, histogram the query coverage and the identity fraction
# of the leading HSP(s) of the leading alignment(s) of every query.
aln_limit = 1   # alignments examined per query
hsp_limit = 1   # HSPs examined per alignment
for f in blast_files:
    query_percs = []
    ident_percs = []
    # Context manager so the XML file is closed once parsing finishes.
    with open(f) as handle:
        for record in NCBIXML.parse(handle):
            # Slicing applies both limits directly, so aln_limit governs how
            # many alignments are used rather than an unconditional break.
            for aln in record.alignments[:aln_limit]:
                for hsp in aln.hsps[:hsp_limit]:
                    query_length = (hsp.query_end - hsp.query_start) + 1.0
                    query_percs.append(query_length / record.query_length)
                    ident_percs.append(float(hsp.identities) / hsp.align_length)
    plot_hist(query_percs, "query percs %s" % os.path.basename(f))
    plot_hist(ident_percs, "ident percs %s" % os.path.basename(f))

## Filter Blast hits

In [0]:
def create_list_chunks(data, chunk_size=200):
    """Split a sequence into consecutive slices of at most chunk_size items.

    data: any sliceable sequence.
    chunk_size: maximum slice length (default 200); must be at least 1.
    Returns a list of slices; an empty input gives an empty list.
    Raises ValueError when chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1, got %r" % (chunk_size,))
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

def create_id_string(id_list):
    """Render ids as "id=<a>&id=<b>&..." for use in a query string."""
    pairs = ("id=%s" % one_id for one_id in id_list)
    return '&'.join(pairs)

def get_organism_for_gi(gi_list):
    """Map protein GIs to the organism named in their GenBank source feature.

    gi_list: list of GI strings. They are uploaded with epost and the records
    fetched back in GenBank format.
    Returns a dict keyed by each record's 'gi' annotation; the value is the
    organism of the record's last "source" feature, or None when the record
    has no such feature. An empty gi_list returns {} without any request.
    """
    data = {}
    if not gi_list:
        return data
    search_results = Entrez.read(Entrez.epost("protein", id=",".join(gi_list)))
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    handle = Entrez.efetch(db="protein", rettype="gb", webenv=webenv, query_key=query_key)
    try:
        for record in SeqIO.parse(handle, 'genbank'):
            organism = None
            for feature in record.features:
                if feature.type == "source":
                    organism = feature.qualifiers['organism'][0]
            data[record.annotations['gi']] = organism
    finally:
        # Release the network handle even if parsing fails part-way.
        handle.close()
    return data

def add_division_to_tax_id(data):
    """Append the taxonomy division to each entry of a GI -> [tax_id] mapping.

    data: dict whose values are lists with the taxonomy id (or the string
    "NOT_FOUND") as first element. Each list is extended in place with the
    division when the taxonomy lookup returns that id.
    Returns the same dict. When there is no resolvable taxonomy id the dict
    is returned untouched and no request is made.
    """
    tax_ids = set(v[0] for v in data.values() if v[0] != "NOT_FOUND")
    if not tax_ids:
        # Nothing to look up; posting an empty id list would only fail.
        return data
    search_results = Entrez.read(Entrez.epost("taxonomy", id=",".join(tax_ids)))
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    elems = Entrez.read(Entrez.efetch(db="taxonomy", webenv=webenv, query_key=query_key))
    tax_div = {}
    for elem in elems:
        tax_div[elem['TaxId']] = elem['Division']
    for gi, tax_data in data.items():
        if tax_data[0] in tax_div:
            tax_data.append(tax_div[tax_data[0]])
    return data
    
def process_elink_xml(xml_list):
    """Turn parsed elink responses into a GI -> [tax_id, division] mapping.

    xml_list: list of parsed responses, each an iterable of link sets.
    For every link set the first id of 'IdList' is the GI and the first linked
    id is its taxonomy id; "NOT_FOUND" is stored when the link set carries no
    link. The mapping is then passed through add_division_to_tax_id.
    """
    res = {}
    for chunk in xml_list:
        for elem in chunk:
            gi = elem['IdList'][0]
            try:
                tax_id = elem['LinkSetDb'][0]['Link'][0]["Id"]
            except (KeyError, IndexError, TypeError):
                # Only a missing or empty link structure means "no taxonomy
                # link"; any other error is allowed to propagate.
                tax_id = "NOT_FOUND"
            res[gi] = [tax_id]
    return add_division_to_tax_id(res)

def get_tax_divs_for_gis(id_list):
    """Resolve protein GIs to taxonomy ids and divisions through NCBI elink.

    id_list: list of GI strings. The ids are sent in chunks (see
    create_list_chunks), one elink request per chunk, each id as its own
    "id=" parameter.
    Returns the mapping built by process_elink_xml.
    Raises requests.HTTPError when a request returns an error status.
    """
    dbfrom = "protein"
    db = "taxonomy"
    id_chunks = create_list_chunks(id_list)
    res = []
    for i, chunk in enumerate(id_chunks):
        # One-based counter so the final line reads "n/n".
        print("at chunk %d/%d" % (i + 1, len(id_chunks)))
        args = "dbfrom=%s&db=%s&%s" % (dbfrom,
                                       db,
                                       create_id_string(chunk))
        # The E-utilities endpoint is served over HTTPS.
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?%s" % args
        r = requests.post(url)
        # Fail loudly on an HTTP error instead of parsing an error page as XML.
        r.raise_for_status()
        # Raw bytes: cStringIO cannot hold unicode text with non-ASCII characters.
        xml = Entrez.read(cStringIO.StringIO(r.content))
        res.append(xml)
    return process_elink_xml(res)
        
def get_divisions_for_hits(blastfile):
    """Look up taxonomy data for every distinct hit GI in a blast XML file.

    blastfile: path to a "blast-xml" file whose hit ids are "|"-separated with
    the GI as second field.
    Returns the mapping produced by get_tax_divs_for_gis for those GIs.
    """
    gi_set = set()
    # Context manager so the XML file is closed once all hits are collected.
    with open(blastfile) as handle:
        for qresult in SearchIO.parse(handle, "blast-xml"):
            for hit in qresult:
                gi_set.add(hit.id.split("|")[1])
    return get_tax_divs_for_gis(list(gi_set))

# Resolve hit taxonomy for every blast file, keyed by file path.
division_data = dict()
for f in blast_files:
    divisions = get_divisions_for_hits(f)
    division_data.update({f: divisions})

In [0]:
# Flatten the per-file lookups into one mapping; the first occurrence of a
# key wins.
divisions = {}
for f, data in division_data.items():
    print(f)
    for key, div in data.items():
        divisions.setdefault(key, div)

In [0]:
# Pickle streams are binary data: write in "wb" mode and close the file
# deterministically.
with open(os.path.join(home, "divisions.pkl"), "wb") as divisions_out:
    dill.dump(divisions, divisions_out)

In [0]:
for k, v in divisions.items():
    print k, v
    break

In [0]:
def blast_hit_filter(hit):
    """Decide whether a blast hit is kept; always returns True or False.

    The hit must carry these attributes in addition to id and hsps:
    query_len, min_query_perc, min_ident_perc, divisions (GI -> list whose
    second element is the taxonomy division) and division (the wanted one).

    HSPs are examined in order. The first HSP whose query coverage or identity
    fraction is below its threshold rejects the hit. An HSP that passes both
    thresholds accepts the hit when the hit's GI maps to a two-element entry
    whose division equals hit.division. A GI that is missing from
    hit.divisions is treated as not matching.
    """
    gi = hit.id.split("|")[1]
    tax_info = hit.divisions.get(gi, [])
    for hsp in hit.hsps:
        # float() keeps these true ratios even without the __future__ import.
        query_perc = float(hsp.query_span) / hit.query_len
        ident_perc = float(hsp.ident_num) / hsp.aln_span
        if query_perc < hit.min_query_perc or ident_perc < hit.min_ident_perc:
            return False
        if len(tax_info) == 2 and tax_info[1] == hit.division:
            return True
    return False

def filter_blast_hits(blastfile, min_query_perc, min_ident_perc, show_plot):
    """Filter the hits of a blast XML file with blast_hit_filter.

    blastfile: path to a "blast-xml" file; the part of its file name before
        the first "_" is prefixed to every query id (as in the decorated
        unigene FASTA used for blast2go).
    min_query_perc, min_ident_perc: thresholds attached to every hit for
        blast_hit_filter, together with the notebook-level `divisions`
        mapping and the wanted division "Plants".
    show_plot: accepted for compatibility; currently unused.

    The filtered query results are written next to the input as
    "<blastfile>_filtered_<q>_query_<i>_ident.xml"; a failure to write is
    reported but does not abort the run.

    Returns (filtered_records, num_filtered, num_total), where num_total is
    the number of hits read and num_filtered the number of hits kept.
    """
    from Bio import SearchIO
    key = os.path.basename(blastfile).split("_")[0]
    print("%s %s %s" % (key, min_query_perc, min_ident_perc))
    division = "Plants"
    filtered_records = []
    num_filtered = 0
    num_total = 0
    # Context manager so the XML file is closed once parsing finishes.
    with open(blastfile) as handle:
        for qresult in SearchIO.parse(handle, "blast-xml"):
            qresult.id = "%s_%s" % (key, qresult.id)
            # hit_filter calls the predicate with the hit only, so everything
            # the predicate needs travels on the hit itself.
            for hit in qresult.hits:
                hit.query_len = qresult.seq_len
                hit.min_query_perc = min_query_perc
                hit.min_ident_perc = min_ident_perc
                hit.divisions = divisions
                hit.division = division
            num_total += len(qresult)
            filtered = qresult.hit_filter(blast_hit_filter)
            num_filtered += len(filtered)
            filtered_records.append(filtered)
    out_file = "%s_filtered_%.2f_query_%.2f_ident.xml" % (blastfile, min_query_perc, min_ident_perc)
    try:
        SearchIO.write(filtered_records, out_file, "blast-xml")
    except Exception as err:
        # Writing stays best-effort, but the failure is no longer silent.
        print("WARNING: could not write %s: %s" % (out_file, err))
    return filtered_records, num_filtered, num_total

In [0]:
blast_files

In [0]:
# filters are (min_query_perc, min_ident_perc)
blast_filters = [(0,0),
           (0.5,0.5),
           (0.8,0.8),
           (0.5,0),
           (0.8,0),
           (0,0.5),
           (0,0.8),
           (0,0.3),
           (0.3,0),
           (0.3,0.3)]
show_plot = False
# Rows are collected in a list and the frame is built once at the end, which
# avoids re-copying the whole frame on every append.
filter_rows = []
for filt in blast_filters:
    for f in blast_files[0:1]:  # only the first entry of blast_files is processed
        res = filter_blast_hits(f, filt[0], filt[1], show_plot)
        filter_rows.append({"sample": os.path.basename(f),
                            "min_query": filt[0],
                            "min_ident": filt[1],
                            "filtered": res[1],
                            "total": res[2]})
filtered_df = pd.DataFrame(filter_rows,
                           columns=["sample", "min_query", "min_ident", "filtered", "total"])

##Blast2GO

version 2.7.2, jre 1.7.0_65

pro server USA1-b2g_may14

1. import sequences (decorated unigenes, e.g., P32C_UN001 vs UN001)
1. import blast results
1. unselect hits without blast
1. run mapping step
1. run annotation step
1. run interpro
1. merge interpro
1. run annex
1. run go-enzymecode
1. load kegg maps

###Merge
1. create new project
1. add dats from cambium (or needle)
1. redownload kegg maps

###Export
1. file -> export -> generic export -> sequence names for each cambium and needle dat 

###Exact test
1. merge cambium and needle dat
1. input test as needle and ref as cambium
1. 0.05/FDR exact test
1. input test as cambium and ref as needle
1. exact test
1. export results < 0.05 for all and most specific

##Collapse unigenes by tissue

In [0]:
decorated = !find . | grep decorated | grep -v 'all'

In [0]:
decorated = [os.path.abspath(x) for x in decorated]
decorated

```python
['/Users/chris/projects/black_spruce/seqclean/P32C.fa.clean_output/P32C_unigene_seq.fasta_decorated.fasta',
'/Users/chris/projects/black_spruce/seqclean/P32N.fa.clean_output/P32N_unigene_seq.fasta_decorated.fasta',
'/Users/chris/projects/black_spruce/seqclean/P40C.fa.clean_output/P40C_unigene_seq.fasta_decorated.fasta',
'/Users/chris/projects/black_spruce/seqclean/P40N.fa.clean_output/P40N_unigene_seq.fasta_decorated.fasta']
```

In [0]:
seq_dict = {}
for d in decorated:
    tissue = os.path.basename(d).split("_")[0][-1]
    if not tissue in seq_dict:
        seq_dict[tissue] = []
    for rec in SeqIO.parse(d, "fasta"):
        seq_dict[tissue].append(rec)

In [0]:
cd $home

In [0]:
# Write one combined FASTA per tissue and remember the file names.
combined_outfiles = []
for tissue, seq_list in seq_dict.items():
    outfile = "%s_unigenes_combined.fasta" % tissue
    combined_outfiles += [outfile]
    SeqIO.write(seq_list, outfile, "fasta")

In [0]:
# NOTE(review): ">>" appends, so re-running this cell adds the same records to
# all_unigenes_combined.fasta a second time.
!cat C_unigenes_combined.fasta >> all_unigenes_combined.fasta

In [0]:
!cat N_unigenes_combined.fasta >> all_unigenes_combined.fasta

##Collapse all ESTs into a single file
(decorated by the source tissue)

In [0]:
cd ~/gpfs/projects/black_spruce

In [0]:
est_files = !ls seqclean/*.fa | grep -v 'all'

In [0]:
est_files

In [0]:
sql = "insert into sample (sample_id, tissue) values (?,?)"
conn.execute(sql, ["all", "all"])
conn.commit()

In [0]:
# Gather every EST from the per-sample FASTA files. Each record id gets the
# source file name as prefix, and the record length is kept under that new id.
est_seqs = []
seq_lens = {}
for e in est_files:
    base = os.path.basename(e)
    for rec in SeqIO.parse(e, "fasta"):
        original_id = rec.id
        rec.description = rec.description.replace(original_id, "")
        rec.id = "%s_%s" % (base, original_id)
        seq_lens[rec.id] = len(rec)
        est_seqs.append(rec)

In [0]:
seq_lens

In [0]:
SeqIO.write(est_seqs, "seqclean/all_ests.fa", "fasta")

In [0]:
cd seqclean/

In [0]:
all_length_reads = !grep -c ">" all_ests.fa #this number should equal sum(length_reads) in the db
sql = "update sample set length_reads=? where sample_id=?"
conn.execute(sql, [int(all_length_reads[0]), "all"])
conn.commit()

In [0]:
all_length_reads

```python
['5996']
```

##Run seqclean on godel for all ESTs

runs with blast-2.2.26, after using formatdb -pF on the E. coli genome below

    ~/data7/src/seqclean-x86_64/seqclean all_ests.fa -v ~/data7/src/UniVec -s ~/data7/projects/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna
    
    Collecting cleaning reports

    **************************************************
    Sequences analyzed:      5996
    -----------------------------------
                       valid:      5938  (2842 trimmed)
                     trashed:        58
    **************************************************
    ----= Trashing summary =------
           by 'NC_010473.fna':       34
                    by 'dust':        1
                  by 'shortq':       23
    ------------------------------
    Output file containing only valid and trimmed sequences: all_ests.fa.clean
    For trimming and trashing details see cleaning report  : all_ests.fa.cln
    --------------------------------------------------
    seqclean (all_ests.fa) finished on machine godel97

In [0]:
all_seqclean = !grep -c ">" all_ests.fa.clean
sql = "update sample set seqclean_reads=? where sample_id=?"
conn.execute(sql, [int(all_seqclean[0]), "all"])
conn.commit()
all_seqclean

```python
['5938']
```

##Run seqclean with vecscreen params

    -q -5 -G 3 -E 3 -F "m D" -e 700 -Y 1.75e12

In [0]:
~/data7/src/seqclean-x86_64/seqclean all_ests.fa -v ~/data7/src/UniVec -s ~/data7/projects/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna

##Run iAssembler on all ESTs

    ~/data7/src/iAssembler-v1.3.2.x64/iAssembler.pl -i all_ests.fa.clean 

In [0]:
all_est_dir = "~/gpfs/projects/black_spruce/seqclean/all_ests.fa.clean_output"

In [0]:
cd $all_est_dir

In [0]:
!head contig_member

In [0]:
def create_est_count_file(contig_member_file, keys=("P32C", "P40C", "P32N", "P40N")):
    """Tabulate, per unigene, how many member ESTs come from each sample.

    contig_member_file: whitespace-separated text file with one unigene per
        line: the unigene id followed by its member EST ids. The sample of an
        EST is the part of its id before the first ".". Blank lines are
        skipped.
    keys: sample labels that become the columns of the output table, in order.

    Writes "<contig_member_file>.counts", a tab-separated table with a header
    row and one row per unigene, and prints the EST and unigene totals.

    Returns (counts_per_unigene, tissue_dict): the row sums of the table, and
    a dict mapping every sample prefix seen (including ones not in keys) to
    the list of its EST ids.
    """
    keys = list(keys)
    counts_per_unigene = []
    tissue_dict = {}
    est_counts = 0
    unigene_counts = 0
    # Both files are managed by the with-statement so they are always closed.
    with open(contig_member_file) as members, \
            open("%s.counts" % contig_member_file, "w") as o:
        o.write("unigene\t%s\n" % '\t'.join(keys))
        for line in members:
            fields = line.split()
            if not fields:
                continue
            unigene_counts += 1
            counts = dict.fromkeys(keys, 0)
            for elem in fields[1:]:
                tissue = elem.split(".")[0]
                if tissue in counts:
                    counts[tissue] += 1
                tissue_dict.setdefault(tissue, []).append(elem)
                # Every member counts toward the total, whatever its prefix.
                est_counts += 1
            vals = [counts[k] for k in keys]
            counts_per_unigene.append(sum(vals))
            o.write("%s\t%s\n" % (fields[0], '\t'.join([str(x) for x in vals])))
    print("%d ESTs in %d Unigenes" % (est_counts, unigene_counts))
    return counts_per_unigene, tissue_dict

counts_per_unigene, tissue_dict = create_est_count_file("contig_member")

# Distribution of member counts per unigene, then read-length summaries of
# the member ESTs grouped by sample prefix.
unigene_stats = (np.mean(counts_per_unigene),
                 np.std(counts_per_unigene),
                 np.min(counts_per_unigene),
                 np.max(counts_per_unigene))
print("counts per unigene: mean=%.2f, sd=%.2f, min=%d, max=%d" % unigene_stats)
for k, v in tissue_dict.items():
    member_lens = [seq_lens[x] for x in v]
    print("%s %s" % (k, get_summary(member_lens)))

In [0]:
# Store id, sequence text and length of every assembled unigene.
sql = 'insert into unigene (unigene_id, seq, length) values (?,?,?)'
unigene_rows = ([seq.id, str(seq.seq), len(seq)]
                for seq in Bio.SeqIO.parse("all_unigene_seq.fasta", "fasta"))
conn.executemany(sql, unigene_rows)
conn.commit()

    5938 ESTs in 1945 Unigenes
    counts per unigene: mean=3.05, sd=10.21, min=1, max=274
    P32N 1475 reads, mean(len) = 574.55, sd=175.23, [104, 861]
    P40C 1677 reads, mean(len) = 626.44, sd=152.31, [102, 901]
    P40N 1260 reads, mean(len) = 455.70, sd=158.43, [100, 815]
    P32C 1526 reads, mean(len) = 557.14, sd=163.35, [100, 903]


## Counts in assembly by tissue

In [0]:
cd ~/gpfs/projects/black_spruce/

In [0]:
uni_counts = pd.read_csv("seqclean/all_ests.fa.clean_output/contig_member.counts", sep="\t", index_col=0)

In [0]:
# One row per (unigene, sample) pair holding the number of assembled reads.
sql = 'insert into unigene_sample (unigene_id, sample_id, assembled_reads) values (?,?,?)'
for row in uni_counts.index:
    unigene_id = row
    for col in uni_counts.columns:
        # Column labels carry a leading "P" (e.g. "P32C"); sample ids do not.
        sample_id = col[1:]
        # .loc is the label-based indexer (.ix is deprecated); int() hands
        # sqlite3 a plain Python integer instead of a numpy scalar.
        conn.execute(sql, [unigene_id, sample_id, int(uni_counts.loc[row, col])])
conn.commit()

In [0]:
uni_counts.apply(np.sum)

    P32C    1526
    P40C    1677
    P32N    1475
    P40N    1260
    dtype: int64
    
    also: SELECT sample_id, sum(assembled_reads) FROM unigene_sample group by sample_id

## Blast unigenes against nr