In [23]:
# IMPORT DEPENDENCIES
import os

import pandas as pd
from Bio import Entrez, SeqIO

# NCBI asks every E-utilities client to identify itself with a contact e-mail.
# Read it from the ENTREZ_EMAIL environment variable so a real address can be
# supplied without writing it into the notebook; when the variable is unset this
# falls back to the empty string that was hard-coded here before.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "")
# Shared accumulator of (sequence record, organism label) tuples; the fetch
# cells append to it and the feature-extraction cell reads from it.
sequences = []

In [24]:
# FETCHING E COLI SEQUENCES
# Ask the NCBI nucleotide database for at most 200 IDs matching the search term.
handle = Entrez.esearch(
    db="nucleotide",
    term="Escherichia coli[Organism] AND gene",
    retmax=200
)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when the search response cannot be parsed.
    handle.close()
ecoli_ids = record["IdList"]

# Remove entries left behind by an earlier run of this cell, so that re-running
# it does not append duplicate records to the shared `sequences` list.
sequences[:] = [pair for pair in sequences if pair[1] != "Escherichia coli"]

# retrieving ecoli sequences
for seq_id in ecoli_ids:
    try:
        # The download is inside the try block so that a failed request for one
        # record is reported and skipped, just like a record that fails to parse.
        handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
        try:
            # SeqIO.read expects exactly one FASTA record in the response.
            seq_record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
        sequences.append((seq_record, "Escherichia coli"))
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop.
        print(f"Error fetching record {seq_id}")

Error fetching record 1741143868


In [25]:
# FETCHING STREP SEQUENCES
# Ask the NCBI nucleotide database for at most 200 IDs matching the search term.
handle = Entrez.esearch(
    db="nucleotide",
    term="Streptomyces coelicolor[Organism] AND gene",
    retmax=200
)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when the search response cannot be parsed.
    handle.close()
strep_ids = record["IdList"]

# Remove entries left behind by an earlier run of this cell, so that re-running
# it does not append duplicate records to the shared `sequences` list.
sequences[:] = [pair for pair in sequences if pair[1] != "Streptomyces coelicolor"]

# retrieving strep sequences
for seq_id in strep_ids:
    try:
        # The download is inside the try block so that a failed request for one
        # record is reported and skipped, just like a record that fails to parse.
        handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
        try:
            # SeqIO.read expects exactly one FASTA record in the response.
            seq_record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
        sequences.append((seq_record, "Streptomyces coelicolor"))
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop.
        print(f"Error fetching record {seq_id}")

Error fetching record 1852346650
Error fetching record 1778705913
Error fetching record 1778704861


In [26]:
# FETCHING Bacillus subtilis SEQUENCES
# Ask the NCBI nucleotide database for at most 200 IDs matching the search term.
handle = Entrez.esearch(db="nucleotide", term="Bacillus subtilis[Organism] AND gene", retmax=200)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when the search response cannot be parsed.
    handle.close()
bacillus_ids = record["IdList"]

# Remove entries left behind by an earlier run of this cell, so that re-running
# it does not append duplicate records to the shared `sequences` list.
sequences[:] = [pair for pair in sequences if pair[1] != "Bacillus subtilis"]

for seq_id in bacillus_ids:
    try:
        # The download is inside the try block so that a failed request for one
        # record is reported and skipped, just like a record that fails to parse.
        handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
        try:
            # SeqIO.read expects exactly one FASTA record in the response.
            seq_record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
        sequences.append((seq_record, "Bacillus subtilis"))
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop.
        print(f"Error fetching record {seq_id}")

Error fetching record 2810276184
Error fetching record 2810266711
Error fetching record 2810266392
Error fetching record 2810266233
Error fetching record 2810261109
Error fetching record 2810260547


In [27]:
# FETCHING Mycobacterium tuberculosis SEQUENCES
# Ask the NCBI nucleotide database for at most 200 IDs matching the search term.
handle = Entrez.esearch(db="nucleotide", term="Mycobacterium tuberculosis[Organism] AND gene", retmax=200)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when the search response cannot be parsed.
    handle.close()
tuberculosis_ids = record["IdList"]

# Remove entries left behind by an earlier run of this cell, so that re-running
# it does not append duplicate records to the shared `sequences` list.
sequences[:] = [pair for pair in sequences if pair[1] != "Mycobacterium tuberculosis"]

for seq_id in tuberculosis_ids:
    try:
        # The download is inside the try block so that a failed request for one
        # record is reported and skipped, just like a record that fails to parse.
        handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
        try:
            # SeqIO.read expects exactly one FASTA record in the response.
            seq_record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
        sequences.append((seq_record, "Mycobacterium tuberculosis"))
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop.
        print(f"Error fetching record {seq_id}")

Error fetching record 1268721759
Error fetching record 1000884109
Error fetching record 950547449


In [28]:
# Derive nucleotide-composition features for every fetched sequence and
# write the resulting table to sequences.csv.
data = []

for seq_record, organism in sequences:
    bases = str(seq_record.seq).upper()
    n_bases = len(bases)
    if n_bases == 0:
        # An empty sequence has nothing to measure and contributes no row.
        continue

    # Share of the full sequence length taken by each of A, T, C and G.
    freq = {}
    for base in "ATCG":
        freq[base] = bases.count(base) / n_bases

    gc_share = freq["G"] + freq["C"]
    at_share = freq["A"] + freq["T"]
    # A sequence with no G or C gets a ratio of 0 instead of dividing by zero.
    if gc_share > 0:
        ratio = at_share / gc_share
    else:
        ratio = 0

    row = {
        "id": seq_record.id,
        "gc_content": gc_share,
        "a_freq": freq["A"],
        "c_freq": freq["C"],
        "g_freq": freq["G"],
        "t_freq": freq["T"],
        "at_gc_ratio": ratio,
        "seq_len": n_bases,
        "label": organism,
    }
    data.append(row)

df = pd.DataFrame(data)
df.to_csv("sequences.csv", index=False)