## Análise de homologias por BLAST

In [2]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

In [9]:
# BLAST jobs to run, one tuple per query sequence:
# (gene name, sequence type, BLAST program, NCBI database).
# The first two fields also select the input file "<gene>_<type>.fasta".
files = [
    ("rrnB", "dna", "blastn", "nt"),
    ("ampC", "dna", "blastn", "nt"),
    ("ampC", "prot", "blastp", "nr"),
]

In [12]:
# Only HSPs whose e-value is strictly below this threshold are reported and
# saved. The value does not depend on the query, so it is set once up front.
e_value_threshold = 0.0001

# For every (gene, sequence type, BLAST program, database) entry in `files`:
#   1. read the query from "<gene>_<type>.fasta" and submit it to NCBI BLAST;
#   2. save the raw XML answer to "blast_<gene>_<type>.xml";
#   3. print every HSP below the e-value threshold and write its subject
#      sequence to "homol_<gene>_<type>.fasta".
for file in files:
    gene, seq_type, program, database = file[:4]
    xml_path = f"blast_{gene}_{seq_type}.xml"
    fasta_path = f"homol_{gene}_{seq_type}.fasta"

    print(f"----> E. coli {gene} ({seq_type}) BLAST <----\n")

    # --> read the FASTA file and run the BLAST search <--
    record = SeqIO.read(f"{gene}_{seq_type}.fasta", "fasta")
    handle = NCBIWWW.qblast(program, database, record.seq,
                            entrez_query="all [filter] NOT(txid561[ORGN] OR txid29278[ORGN] "
                            "OR txid32630[ORGN] OR txid77133[ORGN] OR txid2608721[ORGN])")

    # --> store the collected results in an XML file <--
    # try/finally guarantees the result handle is released even if the
    # write to disk fails.
    try:
        with open(xml_path, "w") as out_handle:
            out_handle.write(handle.read())
    finally:
        handle.close()

    # --> look for alignments with e-value < 0.0001 and save their sequences <--
    # The output is opened in "w" mode (not "a") so that re-running this cell
    # replaces the previous results instead of appending duplicate records.
    # Both files are managed by `with`, so they are closed on any error.
    with open(xml_path) as handle, open(fasta_path, "w") as output:
        counter = 1
        for blast_record in NCBIXML.parse(handle):
            if blast_record:
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e_value_threshold:
                            print(f"**** Alignment {counter} ****")
                            counter += 1
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e-value:", hsp.expect)
                            # Show only the first 60 columns of the alignment.
                            print(hsp.query[0:60] + "...")
                            print(hsp.match[0:60] + "...")
                            print(hsp.sbjct[0:60] + "...")
                            print("")
                            # NOTE(review): one record is written per HSP, so a
                            # hit with several HSPs yields repeated FASTA ids,
                            # and hsp.sbjct is the aligned subject text —
                            # confirm downstream tools accept this.
                            output.write(f">{alignment.hit_id}\n{hsp.sbjct}\n\n")
    print("\n")

----> E. coli rrnB (dna) BLAST <----

**** Alignment 1 ****
sequence: gi|1657675573|gb|CP040347.1| Synthetic Escherichia coli Syn61 chromosome, complete genome
length: 3981202
e-value: 0.0
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...

**** Alignment 2 ****
sequence: gi|1657675573|gb|CP040347.1| Synthetic Escherichia coli Syn61 chromosome, complete genome
length: 3981202
e-value: 0.0
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...

**** Alignment 3 ****
sequence: gi|1657675573|gb|CP040347.1| Synthetic Escherichia coli Syn61 chromosome, complete genome
length: 3981202
e-value: 0.0
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||