In [0]:
from Bio import SwissProt, SeqIO
import gzip

In [0]:
cd ~/gpfs/projects/black_spruce/

In [0]:
def get_accessions(dat_file):
    """Collect every accession listed in a SwissProt-format flat file.

    Parameters
    ----------
    dat_file : str
        Path of the flat file passed to ``SwissProt.parse``.

    Returns
    -------
    set
        Union of ``rec.accessions`` over all parsed records.
    """
    accessions = set()
    # Close the handle as soon as parsing is done instead of leaving it open
    # until the garbage collector gets around to it.
    with open(dat_file) as handle:
        for rec in SwissProt.parse(handle):
            accessions.update(rec.accessions)
    return accessions

In [0]:
# Accession set gathered from "uniprot_sprot_plants.dat" (judging by the file
# name, the plant subset of Swiss-Prot -- confirm against the download source).
sprot_accessions = get_accessions("uniprot_sprot_plants.dat")

In [0]:
# Sanity check: number of distinct accessions collected (get_accessions returns a set).
len(sprot_accessions)

In [0]:
def get_recs(accessions, fastagz):
    """Return the FASTA records whose accession is in ``accessions``.

    The accession is taken as the second ``|``-separated field of
    ``rec.name``. All matching records are kept in memory; use
    ``stream_recs_to_file`` when the result is too large for that.

    Parameters
    ----------
    accessions : container of str
        Accessions to keep; membership is tested with ``in``.
    fastagz : str
        Path of a gzip-compressed FASTA file.

    Returns
    -------
    list
        Matching records, in file order.

    Raises
    ------
    IndexError
        If a record name contains no ``|`` separator.
    """
    recs = []
    # gzip.open defaults to binary mode; the FASTA parser needs text on
    # Python 3, hence "rt". The context manager closes the handle afterwards.
    with gzip.open(fastagz, "rt") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            acc = rec.name.split("|")[1]
            if acc in accessions:
                recs.append(rec)
    return recs

def stream_recs_to_file(accessions, fastagz, out_file):
    """Copy the FASTA records whose accession is in ``accessions`` to a file.

    Records are written one at a time as they are read, so only a single
    record is held in memory. The accession is taken as the second
    ``|``-separated field of ``rec.name``.

    Parameters
    ----------
    accessions : container of str
        Accessions to keep; membership is tested with ``in``.
    fastagz : str
        Path of a gzip-compressed FASTA file.
    out_file : str
        Path of the FASTA file to create (overwritten if it exists).

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a record name contains no ``|`` separator.
    """
    # Open the input first so a missing archive does not leave an empty
    # output file behind. "rt" is required because gzip.open defaults to
    # binary mode while the FASTA parser expects text on Python 3.
    with gzip.open(fastagz, "rt") as handle, open(out_file, "w") as o:
        for rec in SeqIO.parse(handle, "fasta"):
            acc = rec.name.split("|")[1]
            if acc in accessions:
                SeqIO.write(rec, o, "fasta")

In [0]:
# Pull the records of "uniprot_sprot.fasta.gz" whose accession is in
# sprot_accessions; get_recs returns them all as one in-memory list.
sprot_recs = get_recs(sprot_accessions, "uniprot_sprot.fasta.gz")

In [0]:
# Pass the output path instead of an anonymous open() handle: Biopython then
# opens and closes the file itself, so the FASTA is fully flushed to disk
# before makeblastdb reads it. The call still evaluates to the number of
# records written, which the notebook shows as the cell output.
SeqIO.write(sprot_recs, "uniprot_sprot_plants.fasta", "fasta")

```bash
/home/cfriedline/gpfs/src/ncbi-blast-2.2.30+/bin/makeblastdb \
-in uniprot_sprot_plants.fasta \
-dbtype prot \
-parse_seqids 
```

```bash
/home/cfriedline/gpfs/src/ncbi-blast-2.2.30+/bin/blastx \
-db uniprot_sprot_plants.fasta \
-query seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta \
-out all_uniprot_sprot.xml \
-outfmt 5 \
-num_alignments 10 \
-evalue 1e-5 \
-num_threads 20
```


In [0]:
# Accession set gathered from "uniprot_trembl_plants.dat" (judging by the file
# name, the plant subset of TrEMBL -- confirm against the download source).
trembl_accessions = get_accessions("uniprot_trembl_plants.dat")

In [0]:
# Sanity check: number of distinct accessions collected (get_accessions returns a set).
len(trembl_accessions)

In [0]:
# Write the matching records straight to "uniprot_trembl_plants.fasta" rather
# than collecting them in a list first.
# NOTE(review): stream_recs_to_file has no return statement, so trembl_recs is
# bound to None here; the useful result is the file on disk, not this name.
trembl_recs = stream_recs_to_file(trembl_accessions, 
                                  "uniprot_trembl.fasta.gz", 
                                  "uniprot_trembl_plants.fasta")