## Comparação de variantes dos genes

In [1]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

In [2]:
# One entry per gene to analyse:
# (gene name, sequence type, BLAST program, BLAST database).
# The first two fields build the input/output file names; the last two are
# passed to NCBIWWW.qblast.
files = [
    ("rrnB", "dna", "blastn", "nt"),
    ("ampC", "dna", "blastn", "nt"),
]

In [5]:
# For every configured gene: run a remote BLAST restricted by the Entrez query
# below, cache the raw XML on disk, then print and save each distinct hit whose
# aligned subject sequence differs from the query.
e_value_threshold = 0.0001  # only HSPs with an e-value below this are kept
for file in files:
    gene, seq_type, program, database = file
    xml_path = f"blast_var_{gene}_{seq_type}.xml"
    fasta_path = f"homol_var_{gene}_{seq_type}.fasta"
    print(f"----> E. coli {gene} ({seq_type}) BLAST <----\n")

    # --> read the FASTA file and run BLAST <--
    record = SeqIO.read(f"{gene}_{seq_type}.fasta", "fasta")
    handle = NCBIWWW.qblast(program, database, record.seq, hitlist_size=100,
                            entrez_query="all [filter] txid562[ORGN]")
    # --> save the retrieved data to an XML file <--
    # try/finally guarantees the BLAST result handle is released even if the
    # write to disk fails.
    try:
        with open(xml_path, "w") as out_handle:
            out_handle.write(handle.read())
    finally:
        handle.close()

    # --> look for alignments with e-value < threshold and save their sequences <--
    # The FASTA output is opened in "w" mode so that re-running the cell
    # regenerates the file instead of appending duplicate records to it.
    with open(xml_path) as handle, open(fasta_path, "w") as output:
        blast_records = NCBIXML.parse(handle)
        output.write(f">{record.id}\n{record.seq}\n\n")  # save the query (E. coli) sequence
        counter = 1
        seen_titles = set()  # alignment titles already reported, to avoid duplicate names
        for blast_record in blast_records:
            if not blast_record:
                continue
            for alignment in blast_record.alignments:
                if alignment.title in seen_titles:
                    continue  # this hit was already written
                for hsp in alignment.hsps:
                    # Keep only significant HSPs whose aligned subject text
                    # differs from the aligned query text.
                    if hsp.expect < e_value_threshold and hsp.query != hsp.sbjct:
                        seen_titles.add(alignment.title)
                        print (f"**** Alignment {counter} ****")
                        counter += 1
                        print ("sequence:", alignment.title)
                        print ("length:", alignment.length)
                        print ("e-value:", hsp.expect)
                        print(hsp.query[0:60] + "...")
                        print(hsp.match[0:60] + "...")
                        print(hsp.sbjct[0:60] + "...")
                        print("")
                        # NOTE(review): hsp.sbjct is the aligned subject string and
                        # may contain gap characters; confirm downstream tools accept them.
                        output.write(f">{alignment.title}\n{hsp.sbjct}\n\n")
                        # Only the first qualifying HSP per alignment is recorded.
                        break
    print("\n")

----> E. coli rrnB (dna) BLAST <----

**** Alignment 1 ****
sequence: gi|2177886535|gb|CP061264.1| Escherichia coli strain STEC1024 chromosome, complete genome
length: 5018257
e-value: 0.0
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...

**** Alignment 2 ****
sequence: gi|2177850600|gb|CP060109.1| Escherichia coli strain STEC173 chromosome, complete genome
length: 4886698
e-value: 0.0
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...

**** Alignment 3 ****
sequence: gi|2177844645|gb|CP060107.1| Escherichia coli strain STEC174 chromosome, complete genome
length: 4825942
e-value: 0.0
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||.