In [None]:
#Install these packages in Google Colab because they're not already installed
#!pip install requests biopython


In [None]:
#Import packages here
import requests
from tqdm import tqdm
import pandas as pd
import concurrent.futures
from Bio import Entrez
import re

In [None]:
# Read the dataset (E. coli is used here as an example).
# Later cells read the 'Contig', 'Start' and 'Stop' columns of this table.
df = pd.read_csv('/content/microbigge_ecoli.csv')

In [None]:
# Set the contact email used by Biopython's Entrez module for NCBI requests.
# Replace the placeholder string below with a real address before running.
Entrez.email = "write your email address here"

In [None]:
# Fetch DNA sequence from NCBI using Entrez
def fetch_dna_sequence(entrez_id, start, stop):
    """Download a nucleotide record from NCBI and return its full sequence.

    The record is requested from the ``nuccore`` database as GenBank text and
    the bases are taken from its ORIGIN section, i.e. the lines between the
    line that starts with ``ORIGIN`` and the line that starts with ``//``.

    Args:
        entrez_id: Identifier passed to ``Entrez.efetch`` as ``id``.
        start: Unused. Kept so existing callers keep working; the whole
            sequence of the record is returned, not a Start..Stop slice.
        stop: Unused; see ``start``.

    Returns:
        A string made only of the lower-case letters ``a``, ``t``, ``c`` and
        ``g``. An empty string is returned when the record has no ORIGIN
        line, when the section is not terminated by ``//``, or when fetching
        or parsing raises (the error is printed, not re-raised).
    """
    try:
        handle = Entrez.efetch(db="nuccore", id=entrez_id, rettype="gb", retmode="text")
        try:
            record = handle.read()
        finally:
            # Close the handle even when reading fails part-way.
            handle.close()

        # Anchor on a line that *starts* with ORIGIN: the bare word can also
        # occur inside comment/reference text earlier in the record, and a
        # plain substring search would start parsing from there.
        origin_line = re.search(r"^ORIGIN\b.*$", record, flags=re.MULTILINE)
        if origin_line is None:
            return ""

        # Skip the remainder of the ORIGIN line so that any free text on it
        # cannot leak letters into the sequence.
        after_origin = record[origin_line.end():]

        # The section ends at a line starting with "//" (not at any "//",
        # which could be part of a URL).
        terminator = re.search(r"^//", after_origin, flags=re.MULTILINE)
        if terminator is None:
            return ""

        # Strip position numbers and whitespace by keeping only lower-case
        # a/t/c/g.
        # NOTE(review): every other character is dropped too, including
        # ambiguity codes such as 'n'; if a record contains them, positions
        # in the returned string no longer line up with record coordinates.
        return re.sub(r"[^atcg]", "", after_origin[:terminator.start()])
    except Exception as e:
        print(f"Error fetching or parsing sequence for ID={entrez_id}: {e}")
        return ""

In [None]:
# Process each row to get DNA sequence
def get_dna_sequence(row):
    """Fetch the sequence for one dataset row and log the outcome.

    Args:
        row: Mapping with the keys 'Contig', 'Start' and 'Stop'; the values
            are forwarded to ``fetch_dna_sequence`` in that order.

    Returns:
        The fetched sequence, or "" when the fetch produced nothing.
    """
    contig, start, stop = row['Contig'], row['Start'], row['Stop']
    print(f"Processing row: Contig={contig}, Start={start}, Stop={stop}")

    fetched = fetch_dna_sequence(contig, start, stop)
    if not fetched:
        print(f"Error: Failed to fetch sequence for Contig={contig}, Start={start}, Stop={stop}")
        return ""

    # Only a 50-character preview is logged to keep the output readable.
    preview = fetched[:50]
    print(f"Fetched sequence: {preview}...")
    return fetched


In [None]:
#Iterate through each row and get the DNA sequence for each row
def process_row(row):
    """Worker entry point: return the DNA sequence for a single row.

    Delegates to ``get_dna_sequence`` and passes its result through unchanged.
    """
    sequence = get_dna_sequence(row)
    return sequence

In [None]:
# Fetch the sequence for every row using a pool of 10 worker threads.
# executor.map yields results in input order, so they line up with df's rows.
row_records = df.to_dict('records')
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    sequence_iter = executor.map(process_row, row_records)
    # tqdm wraps the result iterator to show a progress bar while it is consumed.
    results = list(tqdm(sequence_iter, total=len(df)))

# Attach the fetched sequences as a new column.
df['full_sequence'] = results

# Write the augmented table to a new CSV file (without the index column).
df.to_csv('fullseq_microbigge_ecoli.csv', index=False)

print("Processing complete.")