In [0]:
import os

In [0]:
from matplotlib_venn import venn2, venn2_circles
import matplotlib.pyplot as plt
%matplotlib inline
from Bio import SearchIO

##Download from plantgenie.org FTP

In [0]:
# Directory that holds the NCBI BLAST+ 2.2.30 executables.
_blast_bin = "/home/cfriedline/g/src/ncbi-blast-2.2.30+/bin"

# Full paths to the two search programs; shell cells interpolate them as
# $blastx and $tblastx.
blastx = "/".join((_blast_bin, "blastx"))
tblastx = "/".join((_blast_bin, "tblastx"))

In [0]:
# Sanity check: one boolean per executable, in the order (blastx, tblastx).
[os.path.exists(binary) for binary in (blastx, tblastx)]

##Blast against all peptides

In [0]:
# Count the lines containing ">" in the cleaned unigene FASTA, i.e. the number
# of sequence records that the BLAST cells below use as queries.
!grep -c ">" seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta

In [0]:
# blastx the cleaned unigenes against the "all peptides" P. abies database.
# The result goes to abies_all.xml, which a later cell reads with SearchIO's
# "blast-xml" parser.
# NOTE(review): every path here is relative, so the command depends on the
# kernel's working directory; the only `cd` in this notebook appears in a
# later cell -- confirm the intended run order.
!$blastx -query seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta \
-db Z4006_Gene_Prediction/Pabies1.0-all-pep/Pabies1.0-all-pep.faa \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 5 \
-out abies_all.xml

##Blast against high quality peptides

In [0]:
# blastx the cleaned unigenes against the high-confidence P. abies peptide
# database. The result goes to abies_hc.xml, which a later cell reads with
# SearchIO's "blast-xml" parser.
# NOTE(review): paths are relative to the kernel's working directory, which is
# only set by a `cd` in a later cell -- confirm the intended run order.
!$blastx -query seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta \
-db Z4006_Gene_Prediction/Pabies1.0-high-confidence-pep/Pabies1.0-HC-pep.faa \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 5 \
-out abies_hc.xml

##Blast against NS Transcriptome from Chen et al 2012

In [0]:
cd ~/g/projects/black_spruce

In [0]:
# Build a nucleotide BLAST database (-dbtype nucl) from pa_tgicl95_151.fa so
# that the tblastx cell below can pass the same file name as -db.
# NOTE(review): the makeblastdb path is hard-coded here instead of being
# derived from the same directory as the blastx/tblastx variables.
!/home/cfriedline/g/src/ncbi-blast-2.2.30+/bin/makeblastdb -in pa_tgicl95_151.fa -dbtype nucl

In [0]:
# tblastx the cleaned unigenes against the pa_tgicl95_151.fa nucleotide
# database. The result goes to abies_chen.xml, which the "Process results"
# cells read with SearchIO's "blast-xml" parser.
!$tblastx -query seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta \
-db pa_tgicl95_151.fa \
-num_threads 12 \
-max_target_seqs 1 \
-outfmt 5 \
-out abies_chen.xml

##Process results

In [0]:
def percent_id(hsp):
    """Return the percentage of identical positions in an HSP's alignment.

    `hsp` must expose `ident_num` (identical positions) and `aln_span`
    (alignment length). The float literal forces true division, so the
    result is a float even under Python 2.
    """
    identical = hsp.ident_num
    aligned = hsp.aln_span
    return 100. * identical / aligned

In [0]:
def query_perc(query_len, query_span):
    """Return `query_span` as a percentage of `query_len`.

    The float literal forces true division, so the result is a float even
    under Python 2.
    """
    return 100. * query_span / query_len

In [0]:
def good_hit(res, hsp, min_query_perc=70, min_percent_id=40):
    """Decide whether an HSP is good enough to keep.

    An HSP passes only when both conditions hold:

    * the share of the query it covers, `query_perc(res.seq_len,
      hsp.query_span)`, is strictly greater than `min_query_perc`;
    * its percent identity, `percent_id(hsp)`, is strictly greater than
      `min_percent_id`.

    Parameters
    ----------
    res : query result exposing `seq_len`.
    hsp : HSP exposing `query_span`, `ident_num` and `aln_span`.
    min_query_perc : exclusive lower bound on query coverage, in percent.
        Defaults to 70, the value that used to be hard-coded.
    min_percent_id : exclusive lower bound on percent identity.
        Defaults to 40, the value that used to be hard-coded.

    Returns
    -------
    bool
    """
    # Coverage is checked first; percent identity is only computed when the
    # coverage test passes.
    if query_perc(res.seq_len, hsp.query_span) <= min_query_perc:
        return False
    if percent_id(hsp) <= min_percent_id:
        return False
    return True

In [0]:
# Parse the tblastx results and classify every query by its first HSP.
qresults = SearchIO.parse("abies_chen.xml", "blast-xml")

chen_hsps = []            # (query result, HSP) pairs accepted by good_hit
chen_discard = []         # (query result, HSP) pairs rejected by good_hit
percent_ids = []          # percent identity of every examined HSP
query_percs = []          # query coverage (%) of every examined HSP
discard_percent_ids = []  # percent identity of the rejected HSPs only
discard_query_percs = []  # query coverage (%) of the rejected HSPs only
putative = []             # rejected pairs that still pass the relaxed cut-offs
no_hits = []              # query results whose hit list is empty

# Column names of chen_blast.txt, in the order the row template fills them.
header_fields = ["query_id",
                 "hit_id",
                 "query_start",
                 "query_end",
                 "hit_start",
                 "hit_end",
                 "query_length",
                 "hit_frame",
                 "e_value",
                 "percent_id",
                 "query_length_percent"]
row_template = "%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%g\t%.2f\t%.2f\n"

with open("chen_blast.txt", "w") as table:
    table.write("%s\n" % "\t".join(header_fields))

    for res in qresults:
        if not res.hits:
            no_hits.append(res)

        # Only the first HSP of each query is examined; queries without any
        # HSP contribute nothing beyond the no_hits bookkeeping above.
        hsp = next(iter(res.hsps), None)
        if hsp is None:
            continue

        coverage = query_perc(res.seq_len, hsp.query_span)
        identity = percent_id(hsp)
        query_percs.append(coverage)
        percent_ids.append(identity)

        if good_hit(res, hsp):
            chen_hsps.append((res, hsp))
        else:
            chen_discard.append((res, hsp))
            discard_percent_ids.append(identity)
            discard_query_percs.append(coverage)

            # Relaxed cut-offs: rejected, but long enough and similar enough
            # to be kept as a putative match.
            if coverage > 50 and identity > 30 and res.seq_len > 600:
                putative.append((res, hsp))

        # Accepted and rejected HSPs alike get one row in the table.
        table.write(row_template % (hsp.query_id,
                                    hsp.hit_id,
                                    hsp.query_start,
                                    hsp.query_end,
                                    hsp.hit_start,
                                    hsp.hit_end,
                                    res.seq_len,
                                    hsp.hit_frame,
                                    hsp.evalue,
                                    identity,
                                    coverage))

In [0]:
pwd

In [0]:
# Display the query results that had an empty hit list in the tblastx search.
no_hits

In [0]:
# Bucket sizes, in order: accepted, putative, discarded, no hits.
tuple(len(bucket) for bucket in (chen_hsps, putative, chen_discard, no_hits))

In [0]:
# Sequence lengths of the queries that produced no hit.
[missed.seq_len for missed in no_hits]

In [0]:
# Print every accepted (query result, HSP) pair. The list built by the
# processing cell is called chen_hsps; `chen_hits` is never defined in this
# notebook and raised a NameError on a fresh kernel.
for pair in chen_hsps:
    print(pair)

In [0]:
# One histogram per metric: first over every examined HSP, then over the
# discarded ones only.
histograms = [
    (query_percs, "all query percentage"),
    (percent_ids, "all percent id"),
    (discard_query_percs, "discard query percentage"),
    (discard_percent_ids, "discard percent id"),
]
for values, title in histograms:
    plt.hist(values)
    plt.title(title)
    plt.show()

# Percent identity against query coverage for the discarded HSPs.
plt.scatter(discard_percent_ids, discard_query_percs)
plt.xlabel("percent id")
plt.ylabel("query percentage")
plt.title("discarded")
plt.show()

In [0]:
# Query length against the span covered by the HSP, for every discarded pair.
x = [res.seq_len for res, hsp in chen_discard]
y = [hsp.query_span for res, hsp in chen_discard]

plt.scatter(x, y)
plt.xlabel("seq len")
plt.ylabel("query span")
# The y-axis upper limit is taken from the largest x value (sequence length).
plt.ylim((-10, max(x)))
plt.show()

In [0]:
for res,hsp in chen_discard:
    print res,hsp

In [0]:
# Peek at the first accepted pair only: the query result, its HSP and the
# HSP's query span. Nothing is printed when there are no accepted pairs.
if chen_hsps:
    res, hsp = chen_hsps[0]
    print(res)
    print(hsp)
    print(hsp.query_span)

In [0]:
# Collect hit ids from the blastx search against the high-confidence peptides.
# For each query, the first HSP that passes good_hit (in res.hsps order)
# contributes the first whitespace-separated token of its hit_id; queries with
# no passing HSP contribute nothing.
qresults = SearchIO.parse("abies_hc.xml", "blast-xml")
good_hc_hits = []
for res in qresults:
    passing = next((cand for cand in res.hsps if good_hit(res, cand)), None)
    if passing is not None:
        good_hc_hits.append(passing.hit_id.split()[0])

In [0]:
# Collect hit ids from the blastx search against all peptides.
# For each query, the first HSP that passes good_hit (in res.hsps order)
# contributes the first whitespace-separated token of its hit_id; queries with
# no passing HSP contribute nothing.
qresults = SearchIO.parse("abies_all.xml", "blast-xml")
good_all_hits = []
for res in qresults:
    passing = next((cand for cand in res.hsps if good_hit(res, cand)), None)
    if passing is not None:
        good_all_hits.append(passing.hit_id.split()[0])

In [0]:
# Number of queries with a good hit: all peptides, then high-confidence.
print("%d %d" % (len(good_all_hits), len(good_hc_hits)))

In [0]:
# Number of distinct hit ids: all peptides, then high-confidence.
print("%d %d" % (len(set(good_all_hits)), len(set(good_hc_hits))))

In [0]:
# Distinct hit ids found by either search.
len(set(good_all_hits) | set(good_hc_hits))

In [0]:
# Overlap between the hit ids from the two blastx searches. The same pair of
# sets feeds the filled diagram and its outlines.
good_hit_sets = [set(good_hc_hits), set(good_all_hits)]

plt.figure(figsize=(5,5))
v = venn2(good_hit_sets, set_labels=["High-quality", "All genes"])
c = venn2_circles(good_hit_sets, linestyle="solid", linewidth=0.7)
plt.title("Blastx of 1945 unigenes against all and high quality gene models from P. abies")
plt.show()

In [0]:
# Hit ids found in the all-peptides search but not in the high-confidence one.
len(set(good_all_hits).difference(good_hc_hits))

In [0]:
# Dump every entry of the all-peptides BLAST database and capture the output
# as a list of lines (IPython `name = !command`).
# Later cells split each line on whitespace, using field [1] as the title and
# the last field as the sequence.
# NOTE(review): '%o %t %s' is presumably ordinal id, title, sequence --
# confirm against the blastdbcmd documentation.
all_ids = !/home/cfriedline/g/src/ncbi-blast-2.2.30+/bin/blastdbcmd \
-db Z4006_Gene_Prediction/Pabies1.0-all-pep/Pabies1.0-all-pep.faa \
-entry 'all' \
-outfmt '%o %t %s'

In [0]:
# Dump every entry of the high-confidence BLAST database and capture the
# output as a list of lines (IPython `name = !command`).
# Later cells split each line on whitespace, using field [1] as the title and
# the last field as the sequence.
# NOTE(review): '%o %t %s' is presumably ordinal id, title, sequence --
# confirm against the blastdbcmd documentation.
hc_ids = !/home/cfriedline/g/src/ncbi-blast-2.2.30+/bin/blastdbcmd \
-db Z4006_Gene_Prediction/Pabies1.0-high-confidence-pep/Pabies1.0-HC-pep.faa \
-entry 'all' \
-outfmt '%o %t %s'

In [0]:
# Second whitespace-separated field of every blastdbcmd output line.
hc_titles = [record.split()[1] for record in hc_ids]
all_titles = [record.split()[1] for record in all_ids]

In [0]:
# First five high-confidence titles.
hc_titles[:5]

In [0]:
# First five titles from the all-peptides database.
all_titles[:5]

In [0]:
# Overlap between the titles in the two BLAST databases.
# The filled diagram and its outlines must be built from the same sets;
# previously the outlines used the [2:] slices of the title lists, so they
# could be sized from different sets than the diagram they outline.
title_sets = [set(hc_titles), set(all_titles)]

plt.figure(figsize=(5,5))
v = venn2(title_sets, set_labels=["High-quality", "All genes"])
c = venn2_circles(title_sets,
                  linestyle="solid",
                  linewidth=0.7)
plt.title("blastdbcmd -entry 'all'")
plt.show()

In [0]:
# Distinct titles in each database.
hc_title_set, all_title_set = (set(titles) for titles in (hc_titles, all_titles))

In [0]:
def format_to_fasta(line):
    """Turn one whitespace-separated record line into a two-line FASTA entry.

    The last field becomes the sequence line. All preceding fields are joined
    with "|" to form the header, which is prefixed with ">". No trailing
    newline is added. A line without any field raises IndexError.
    """
    fields = line.split()
    header = "|".join(fields[:-1])
    return ">%s\n%s" % (header, fields[-1])
# Write one FASTA file per database, holding only the records whose title
# (second whitespace-separated field) is among the good hits for that database.
# The hit lists are turned into sets once, so the membership test no longer
# rescans a list for every database record.
fasta_jobs = [
    ("abies_hc.fasta", hc_ids, set(good_hc_hits)),
    ("abies_all.fasta", all_ids, set(good_all_hits)),
]
for fasta_path, records, wanted_titles in fasta_jobs:
    with open(fasta_path, "w") as o:
        for elem in records:
            if elem.split()[1] in wanted_titles:
                o.write("%s\n" % format_to_fasta(elem))

In [0]:
!grep -c ">" abies_hc.fasta

In [0]:
!grep -c ">" abies_all.fasta