In []:
from matplotlib_venn import venn2, venn2_circles
import matplotlib.pyplot as plt
%matplotlib inline
from Bio import SearchIO

## Download from plantgenie.org FTP

In []:
# Absolute path to the blastx executable; the shell cells below interpolate it
# as $blastx.
# NOTE(review): this is a machine-specific path - adjust it for the local install.
blastx = "/Users/chris/bnfo/ncbi-blast-2.2.30+/bin/blastx"

## Blast against all peptides

In []:
# Count the lines containing ">" (the FASTA header lines) in the unigene file
# that is used as the blastx query below.
!grep -c ">" seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta

In []:
# blastx the unigene sequences against the "all peptides" database with 8 threads,
# keeping at most one target sequence per query (-max_target_seqs 1).
# The result is written with -outfmt 5 to abies_all.xml, which a later cell
# reads with SearchIO.parse(..., "blast-xml").
!$blastx -query seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta \
-db Z4006_Gene_Prediction/Pabies1.0-all-pep/Pabies1.0-all-pep.faa \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 5 \
-out abies_all.xml

## Blast against high quality peptides

In []:
# Same blastx run as for the "all peptides" database, but against the
# high-confidence peptide database; the output goes to abies_hc.xml, which a
# later cell reads with SearchIO.parse(..., "blast-xml").
!$blastx -query seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta_decorated.fasta \
-db Z4006_Gene_Prediction/Pabies1.0-high-confidence-pep/Pabies1.0-HC-pep.faa \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 5 \
-out abies_hc.xml

## Process results

In []:
def percent_id(hsp):
    """Return the percentage of identical positions in an HSP.

    Computed as ``hsp.ident_num`` relative to ``hsp.aln_span``, scaled to
    the 0-100 range. The result is always a float.
    """
    identical = hsp.ident_num
    aligned = hsp.aln_span
    return 100. * identical / aligned

In []:
def query_perc(query_len, query_span):
    """Return ``query_span`` as a percentage (float) of ``query_len``."""
    scaled_span = 100. * query_span
    return scaled_span / query_len

In []:
def good_hit(res, hsp, min_query_perc=50, min_percent_id=30):
    """Decide whether an HSP is good enough to keep.

    An HSP passes when it covers strictly more than ``min_query_perc``
    percent of the query (``hsp.query_span`` relative to ``res.seq_len``)
    AND its percent identity is strictly greater than ``min_percent_id``.

    Parameters:
        res: query result providing ``seq_len``.
        hsp: HSP providing ``query_span``, ``ident_num`` and ``aln_span``.
        min_query_perc: exclusive lower bound on query coverage, in percent.
        min_percent_id: exclusive lower bound on percent identity.

    Returns:
        True if both thresholds are exceeded, False otherwise.
    """
    # Coverage is checked first so identity is only computed for HSPs that
    # already span enough of the query.
    if query_perc(res.seq_len, hsp.query_span) <= min_query_perc:
        return False
    if percent_id(hsp) <= min_percent_id:
        return False
    return True

In []:
# For every query in the high-confidence blastx output, record the hit ID of
# the first HSP (in the order SearchIO yields them) that passes good_hit().
# Queries without any passing HSP contribute nothing to the list.
qresults = SearchIO.parse("abies_hc.xml", "blast-xml")
good_hc_hits = []
for res in qresults:
    for hsp in res.hsps:
        if good_hit(res, hsp):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(hsp)
            # Keep only the first whitespace-separated token of the hit ID.
            good_hc_hits.append(hsp.hit_id.split()[0])
            break  # at most one accepted HSP per query

In []:
# For every query in the all-peptides blastx output, record the hit ID of the
# first HSP (in the order SearchIO yields them) that passes good_hit().
# Queries without any passing HSP contribute nothing to the list.
qresults = SearchIO.parse("abies_all.xml", "blast-xml")
good_all_hits = []
for res in qresults:
    for hsp in res.hsps:
        if good_hit(res, hsp):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(hsp)
            # Keep only the first whitespace-separated token of the hit ID.
            good_all_hits.append(hsp.hit_id.split()[0])
            break  # at most one accepted HSP per query

In []:
# Number of accepted hits (duplicates included) for the all-peptides and the
# high-confidence searches. A single pre-formatted string prints identically
# on Python 2 and Python 3.
print("%d %d" % (len(good_all_hits), len(good_hc_hits)))

In []:
# Number of distinct hit IDs for the all-peptides and the high-confidence
# searches. A single pre-formatted string prints identically on Python 2 and
# Python 3.
print("%d %d" % (len(set(good_all_hits)), len(set(good_hc_hits))))

In []:
# Number of distinct hit IDs found by either search.
len(set(good_all_hits) | set(good_hc_hits))

In []:
# Venn diagram of the distinct hit IDs accepted in the high-confidence search
# versus the all-peptides search.
# The outline call used to reference the undefined name `good_all_blasthits`
# (NameError); both calls now share the same pair of sets.
hc_hit_set = set(good_hc_hits)
all_hit_set = set(good_all_hits)
plt.figure(figsize=(5,5))
v = venn2([hc_hit_set, all_hit_set], set_labels=["High-quality", "All genes"])
c = venn2_circles([hc_hit_set, all_hit_set],
                  linestyle="solid",
                  linewidth=0.7)
# NOTE(review): the unigene count in the title is hard-coded; confirm it still
# matches the `grep -c` output for the query FASTA.
plt.title("Blastx of 1945 unigenes against all and high quality gene models from P. abies")
plt.show()

In []:
# Number of distinct hit IDs accepted in the all-peptides search but not in
# the high-confidence search.
len(set(good_all_hits).difference(good_hc_hits))

In []:
# Dump every entry ('all') of the all-peptides BLAST database and capture the
# output lines in all_ids. Later cells split each line on whitespace, using
# field 1 as the title and the last field as the sequence.
# NOTE(review): presumably %o / %t / %s are blastdbcmd's ordinal-id / title /
# sequence specifiers - confirm with `blastdbcmd -help`.
all_ids = !/Users/chris/bnfo/ncbi-blast-2.2.30+/bin/blastdbcmd \
-db Z4006_Gene_Prediction/Pabies1.0-all-pep/Pabies1.0-all-pep.faa \
-entry 'all' \
-outfmt '%o %t %s'

In []:
# Dump every entry ('all') of the high-confidence BLAST database and capture
# the output lines in hc_ids. Later cells split each line on whitespace, using
# field 1 as the title and the last field as the sequence.
# NOTE(review): presumably %o / %t / %s are blastdbcmd's ordinal-id / title /
# sequence specifiers - confirm with `blastdbcmd -help`.
hc_ids = !/Users/chris/bnfo/ncbi-blast-2.2.30+/bin/blastdbcmd \
-db Z4006_Gene_Prediction/Pabies1.0-high-confidence-pep/Pabies1.0-HC-pep.faa \
-entry 'all' \
-outfmt '%o %t %s'

In []:
# The second whitespace-separated field of each blastdbcmd line is treated as
# the entry's title.
hc_titles = [record.split()[1] for record in hc_ids]
all_titles = [record.split()[1] for record in all_ids]

In []:
# Venn diagram comparing the complete title sets of the two databases.
_hc_set, _all_set = set(hc_titles), set(all_titles)
plt.figure(figsize=(5, 5))
v = venn2(
    [_hc_set, _all_set],
    set_labels=["High-quality", "All genes"],
)
c = venn2_circles(
    [_hc_set, _all_set],
    linestyle="solid",
    linewidth=0.7,
)
plt.title("blastdbcmd -entry 'all'")
plt.show()

In []:
# De-duplicated title collections for both databases.
hc_title_set, all_title_set = set(hc_titles), set(all_titles)

In []:
def format_to_fasta(line):
    """Turn one whitespace-separated record line into a FASTA entry.

    The last field becomes the sequence line; all preceding fields are
    joined with "|" to form the header. The returned string has no
    trailing newline. An empty or blank line raises IndexError.
    """
    fields = line.split()
    sequence = fields.pop()
    header = "|".join(fields)
    return ">%s\n%s" % (header, sequence)
# Write the database entries whose title (field 1 of the blastdbcmd line) is
# among the accepted blastx hits to one FASTA file per database.
# The hit lists are converted to sets so each membership test is O(1) rather
# than a scan of the whole list for every database entry.
for fasta_path, records, wanted_ids in (
        ("abies_hc.fasta", hc_ids, set(good_hc_hits)),
        ("abies_all.fasta", all_ids, set(good_all_hits))):
    with open(fasta_path, "w") as o:
        for elem in records:
            if elem.split()[1] in wanted_ids:
                o.write("%s\n" % format_to_fasta(elem))

In []:
# Count the lines containing ">" (the FASTA header lines) written to abies_hc.fasta.
!grep -c ">" abies_hc.fasta

In []:
# Count the lines containing ">" (the FASTA header lines) written to abies_all.fasta.
!grep -c ">" abies_all.fasta