Este é um notebook Jupyter para desenvolver pequenas funções que depois serão passadas para o script em Python.

In [48]:
#Dar import a packages
import os
from Bio import SeqIO, SeqFeature, Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIXML, NCBIWWW 

### Análise da sequência e das features presentes no NCBI

Function to fetch a gene's GenBank file

In [51]:
def get_seq(gene_id, email="...@gmail.com", filename="sequence.gb", output_dir="output"):
    """
    Fetch a nucleotide record from NCBI and save it as a GenBank file.

    The record is requested from the Entrez "nucleotide" database in GenBank
    text format and written to ``output_dir/filename``. Progress and failures
    are reported with ``print``; exceptions are caught here, not re-raised.

    Args:
        gene_id (str): Identifier passed to ``Entrez.efetch`` (for example an
            accession such as "U10926.1").
        email (str): Address assigned to ``Entrez.email``. The default looks
            like a placeholder, so callers should pass their own address.
        filename (str): Name of the file to write inside ``output_dir``.
        output_dir (str): Destination directory; created if it does not exist.

    Returns:
        None
    """
    Entrez.email = email
    os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist
    output_path = os.path.join(output_dir, filename)  # Full path to the output file

    try:
        print(f"Fetching data for gene ID: {gene_id}...")
        handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Close the handle even when parsing fails, so it is never leaked.
            handle.close()
        print("Saving the sequence to file...")
        with open(output_path, "w") as output_file:
            SeqIO.write(record, output_file, "genbank")
        print(f"Sequence saved successfully to {output_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [55]:
# Download GenBank record U10926.1 and store it as genes_gb/comS.gb.
get_seq(gene_id="U10926.1", filename="comS.gb", output_dir="genes_gb")

Fetching data for gene ID: U10926.1...
Saving the sequence to file...
Sequence saved successfully to genes_gb\comS.gb.


Função de análise de features

In [62]:
def analyse_features(filename, input_dir="genes_gb"):
    """
    Print the type, location and qualifiers of every feature in a GenBank file.

    Unlike the other readers in this notebook, this function does not catch
    exceptions: anything raised while reading the file propagates to the caller.

    Args:
        filename (str): Name of the GenBank file inside ``input_dir``.
        input_dir (str): Directory that holds the file. Defaults to
            "genes_gb", the location that used to be hard-coded.
    """
    record = SeqIO.read(os.path.join(input_dir, filename), "genbank")

    # One three-line report per feature, in file order.
    for feature in record.features:
        print(f"Type: {feature.type}")
        print(f"Location: {feature.location}")
        print(f"Qualifiers: {feature.qualifiers}")

In [63]:
# Print every feature of the comS record saved under genes_gb/.
analyse_features(filename="comS.gb")

Type: source
Location: [0:570](+)
Qualifiers: {'organism': ['Bacillus subtilis subsp. subtilis str. 168'], 'mol_type': ['genomic DNA'], 'strain': ['168'], 'sub_species': ['subtilis'], 'type_material': ['type strain of Bacillus subtilis'], 'db_xref': ['taxon:224308'], 'map': ['30 degrees'], 'note': ['sequence similar to srfA2 gene, GenBank Accession Number X70356, and srfAB gene, GenBank Accession Number D13262']}
Type: regulatory
Location: [188:189](+)
Qualifiers: {'regulatory_class': ['ribosome_binding_site']}
Type: gene
Location: [210:351](+)
Qualifiers: {'gene': ['comS']}
Type: CDS
Location: [210:351](+)
Qualifiers: {'gene': ['comS'], 'function': ['regulation of genetic competence'], 'experiment': ['experimental evidence, no additional details recorded'], 'codon_start': ['1'], 'transl_table': ['11'], 'organism': ['Bacillus subtilis'], 'product': ['ComS'], 'protein_id': ['AAA61567.1'], 'translation': ['MNRSGKHLISSIILYPRPSGECISSISLDKQTQATTSPLYFCWREK']}


Function to get the annotations from a GenBank file

In [65]:
def extract_annotations(filename, input_dir="genes_gb"):
    """
    Print the description, organism and full annotation dict of a GenBank record.

    Errors (for example a missing file) are caught and reported with ``print``
    instead of being raised.

    Args:
        filename (str): Name of the GenBank file inside ``input_dir`` (not a
            full path).
        input_dir (str): Directory that holds the file. Defaults to
            "genes_gb", the location that used to be hard-coded.
    """
    try:
        record = SeqIO.read(os.path.join(input_dir, filename), "genbank")
        print(f"Gene Description: {record.description}")
        # Fall back to 'Unknown' when the record carries no organism annotation.
        print(f"Organism: {record.annotations.get('organism', 'Unknown')}")
        print(f"Annotations: {record.annotations}")
    except Exception as e:
        print(f"An error occurred: {e}")


In [66]:
# Show the record-level annotations of the comS GenBank file.
extract_annotations(filename="comS.gb")

Gene Description: Bacillus subtilis 168 genetic competence regulation (comS) gene, complete cds
Organism: Bacillus subtilis subsp. subtilis str. 168
Annotations: {'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'BCT', 'date': '26-JAN-1995', 'accessions': ['U10926'], 'sequence_version': 1, 'keywords': [''], 'source': 'Bacillus subtilis subsp. subtilis str. 168', 'organism': 'Bacillus subtilis subsp. subtilis str. 168', 'taxonomy': ['Bacteria', 'Bacillati', 'Bacillota', 'Bacilli', 'Bacillales', 'Bacillaceae', 'Bacillus'], 'references': [Reference(title='Identification of comS, a gene of the srfA operon that regulates the establishment of genetic competence in Bacillus subtilis', ...), Reference(title="Nucleotide sequence of 5' portion of srfA that contains the region required for competence establishment in Bacillus subtilus", ...), Reference(title='Direct Submission', ...)]}


Function to analyse features and qualifiers

In [69]:
def analyze_features(filename, input_dir="genes_gb"):
    """
    Print the feature count of a GenBank file, then each feature's details.

    Each feature is reported as type, location and qualifiers, followed by a
    50-dash separator line. This is the same per-feature report as
    ``analyse_features``, with the count and separators added and with errors
    caught and printed instead of raised.

    Args:
        filename (str): Name of the GenBank file inside ``input_dir`` (not a
            full path).
        input_dir (str): Directory that holds the file. Defaults to
            "genes_gb", the location that used to be hard-coded.
    """
    try:
        record = SeqIO.read(os.path.join(input_dir, filename), "genbank")
        print(f"Number of Features: {len(record.features)}")
        for feature in record.features:
            print(f"Type: {feature.type}")
            print(f"Location: {feature.location}")
            print(f"Qualifiers: {feature.qualifiers}")
            print("-" * 50)
    except Exception as e:
        print(f"An error occurred: {e}")


In [70]:
# Feature count plus a separated per-feature report for the comS record.
analyze_features(filename="comS.gb")

Number of Features: 4
Type: source
Location: [0:570](+)
Qualifiers: {'organism': ['Bacillus subtilis subsp. subtilis str. 168'], 'mol_type': ['genomic DNA'], 'strain': ['168'], 'sub_species': ['subtilis'], 'type_material': ['type strain of Bacillus subtilis'], 'db_xref': ['taxon:224308'], 'map': ['30 degrees'], 'note': ['sequence similar to srfA2 gene, GenBank Accession Number X70356, and srfAB gene, GenBank Accession Number D13262']}
--------------------------------------------------
Type: regulatory
Location: [188:189](+)
Qualifiers: {'regulatory_class': ['ribosome_binding_site']}
--------------------------------------------------
Type: gene
Location: [210:351](+)
Qualifiers: {'gene': ['comS']}
--------------------------------------------------
Type: CDS
Location: [210:351](+)
Qualifiers: {'gene': ['comS'], 'function': ['regulation of genetic competence'], 'experiment': ['experimental evidence, no additional details recorded'], 'codon_start': ['1'], 'transl_table': ['11'], 'organism'

Function to extract external references

In [71]:
def extract_external_references(filename, input_dir="genes_gb"):
    """
    Print the distinct ``db_xref`` values found in a GenBank file's features.

    Values are collected from every feature that has a ``db_xref`` qualifier,
    de-duplicated, and printed one per line in sorted order so the output is
    the same on every run. Errors are caught and reported with ``print``.

    Args:
        filename (str): Name of the GenBank file inside ``input_dir`` (not a
            full path).
        input_dir (str): Directory that holds the file. Defaults to
            "genes_gb", the location that used to be hard-coded.
    """
    try:
        record = SeqIO.read(os.path.join(input_dir, filename), "genbank")
        external_refs = {
            ref
            for feature in record.features
            if "db_xref" in feature.qualifiers
            for ref in feature.qualifiers["db_xref"]
        }
        print("External References:")
        # Sorting avoids the run-to-run ordering differences of a bare set.
        for ref in sorted(external_refs):
            print(ref)
    except Exception as e:
        print(f"An error occurred: {e}")



In [72]:
# List the cross-database references (db_xref) annotated on the comS record.
extract_external_references(filename="comS.gb")

External References:
taxon:224308


### Análise de homologias por BLAST

Função para fazer um BLAST no NCBI a partir de um ficheiro (pode-se escolher o tipo de BLAST, a base de dados e outros parâmetros)

In [198]:
def blast(file_name, file_format = "fasta", program = "blastn", database = "nt", e_value = 0.05, hitlist_size = 100, input_dir = "genes_gb"):
    """
    Run a remote NCBI BLAST for the single sequence stored in a file.

    The sequence is read from ``input_dir/file_name``, converted to FASTA text
    and submitted through ``NCBIWWW.qblast``. Progress and errors are reported
    with ``print``; errors are caught, not re-raised.

    Parameters:
        file_name (str): Name of the sequence file inside ``input_dir``.
        file_format (str): Format of the file (default: 'fasta').
        program (str): BLAST program to run (default: 'blastn').
        database (str): Database to search (default: 'nt').
        e_value (float): E-value cut-off passed as ``expect`` (default: 0.05).
        hitlist_size (int): Number of hits to request (default: 100).
        input_dir (str): Directory that holds the file. Defaults to
            "genes_gb", the location that used to be hard-coded.

    Returns:
        The handle returned by ``NCBIWWW.qblast``, or None if an error occurred.
    """

    try:
        # Read the query inside a context manager so the file is always closed.
        with open(os.path.join(input_dir, file_name)) as sequence_file:
            record = SeqIO.read(sequence_file, format = file_format)

        # Start the remote BLAST search.
        print("BLASTing...")
        result_handle = NCBIWWW.qblast(program, database, record.format("fasta"), expect = e_value, hitlist_size = hitlist_size)

        print("BLAST concluído com sucesso.")
        return result_handle

    except Exception as e:
        print(f"Erro ao executar o BLAST: {e}")
        return None

Função que utiliza a função blast, mas guarda o output num ficheiro.

In [203]:
def get_blast(
    file_name,
    output_name="blast_result",
    file_format="gb",
    program="blastn",
    database="nt",
    e_value=0.01,
    hitlist_size=100,
):
    """
    Run `blast` on a sequence file and save the raw result as ``<output_name>.xml``.

    Parameters:
        file_name (str): Name of the file containing the sequence.
        output_name (str): Output file name without the ".xml" extension.
        file_format, program, database, e_value, hitlist_size: Forwarded
            positionally to `blast`.

    Nothing is written when `blast` returns None. Write errors are printed,
    not raised, and the result handle is closed in every case.
    """
    blast_output = blast(file_name, file_format, program, database, e_value, hitlist_size)

    # No handle means there is nothing to save.
    if blast_output is None:
        return

    try:
        # Dump the BLAST response into the XML file.
        with open(f"{output_name}.xml", "w") as xml_file:
            xml_file.write(blast_output.read())
            print(f"Resultados salvos em {output_name}.xml.")
    except Exception as e:
        print(f"Erro ao salvar os resultados: {e}")
    finally:
        blast_output.close()
    

Teste da função *get_blast* e *blast*

In [204]:
# BLAST the comS GenBank record and keep the raw XML in comS.xml.
get_blast(file_name="comS.gb", output_name="comS", hitlist_size=10000)

BLASTing...
BLAST concluído com sucesso.
Resultados salvos em comS.xml.


Function to parse a blast result

In [211]:
def parse_blast_results(file_name, exclude=None, e_value_thresh=0.05, identity_thresh=90):
    """
    Parse a BLAST XML result and collect the significant HSPs.

    An alignment is skipped entirely when its title contains any of the
    ``exclude`` terms (case-insensitive substring match). For the remaining
    alignments, every HSP whose E-value is below ``e_value_thresh`` and whose
    percentage identity is above ``identity_thresh`` yields one entry.

    Args:
        file_name (str): Path to the BLAST result file in XML format.
        exclude (str, iterable of str, or None): Term(s) that exclude an
            alignment when found in its title (e.g. "Bacillus subtilis").
            A single string is treated as one term, not as a sequence of
            characters.
        e_value_thresh (float): Exclusive upper bound for the HSP E-value.
        identity_thresh (float): Exclusive lower bound for the percentage
            identity (identities / alignment length * 100).

    Returns:
        list: One dict per significant HSP with the keys "title", "length",
        "e_value", "identity" and "alignment_length".
    """
    # Normalise `exclude` to a list of lower-cased terms.
    if not exclude:
        terms = []
    elif isinstance(exclude, str):
        terms = [exclude.lower()]
    else:
        terms = [term.lower() for term in exclude]

    # NCBIXML comes from the notebook's top-level import cell.
    with open(file_name) as result_handle:
        blast_record = NCBIXML.read(result_handle)

    significant_hits = []
    for alignment in blast_record.alignments:
        title = alignment.title.lower()
        if any(term in title for term in terms):
            continue

        for hsp in alignment.hsps:
            # A zero-length alignment has no defined identity; skip it
            # instead of dividing by zero.
            if not hsp.align_length:
                continue
            identity = (hsp.identities / hsp.align_length) * 100
            if hsp.expect < e_value_thresh and identity > identity_thresh:
                significant_hits.append({
                    "title": alignment.title,
                    "length": alignment.length,
                    "e_value": hsp.expect,
                    "identity": identity,
                    "alignment_length": hsp.align_length
                })

    return significant_hits

In [216]:
# Keep hits with E-value < 0.05 and identity > 50%, dropping any alignment
# whose title mentions one of the excluded terms.
parse_blast_results(
    "comS.xml",
    exclude=["subtilis", "chromosome", "complete", "genome"],
    e_value_thresh=0.05,
    identity_thresh=50,
)

[{'title': 'gi|1257575185|dbj|LC171348.1| Bacillus sp. FW1 genes for pyrene metabolism, contig_7',
  'length': 455392,
  'e_value': 0.0,
  'identity': 95.08771929824562,
  'alignment_length': 570},
 {'title': 'gi|1757450143|gb|MK570508.1| Bacillus amyloliquefaciens strain TSBSO3.8 surfactin gene region',
  'length': 65411,
  'e_value': 4.41789e-127,
  'identity': 78.34507042253522,
  'alignment_length': 568},
 {'title': 'gi|1757450197|gb|MK570509.1| Bacillus amyloliquefaciens strain H2O-1 surfactin gene region',
  'length': 65415,
  'e_value': 4.41789e-127,
  'identity': 78.34507042253522,
  'alignment_length': 568},
 {'title': 'gi|42820782|emb|AJ575642.1| Bacillus amyloliquefaciens yciC gene, yx01 gene, yckc gene, yckD gene, yckE gene, nin gene, nuc gene, hxlB gene, hxlA gene, hxlR gene, xy02 gene, srfAA gene, srfAB gene, comS gene, srfAC gene, srfAD gene, aat gene, ycxc gene, ycxD gene, sfp gene, yczE gene, yckI gene and yckJ gene',
  'length': 41884,
  'e_value': 4.41789e-127,
  'id