In [1]:
#Import packages here
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import concurrent.futures
import re

In [2]:
# Read 'microbigge_ecoli.csv' into the DataFrame that every later cell works on.
# NOTE(review): judging by the file name this is a MicroBIGG-E export for
# E. coli; provenance and download date are not recorded here, so confirm.
df = pd.read_csv('microbigge_ecoli.csv')

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded table.
df.shape

(23295, 18)

In [4]:
# Share of rows in each 'Type' category; normalize=True reports proportions
# (summing to 1) rather than raw counts.
df['Type'].value_counts(normalize=True)

Type
AMR          0.504958
VIRULENCE    0.426830
STRESS       0.068212
Name: proportion, dtype: float64

In [5]:
# Define a function to fetch the GenBank record for a sub-range from NCBI
def fetch_dna_sequence(contig, start, stop, retries=3, timeout=10):
    """Download the GenBank flat-file text for a sub-range of a nucleotide record.

    The request goes to the NCBI E-utilities EFetch endpoint rather than the
    interactive ``www.ncbi.nlm.nih.gov/nuccore/...`` page: that page is an
    HTML shell whose record is loaded by JavaScript, so its body does not
    contain the plain-text ``ORIGIN`` section needed to read the sequence.

    Args:
        contig: Nucleotide accession to fetch (passed as the EFetch ``id``).
        start: First base of the requested range (EFetch ``seq_start``).
        stop: Last base of the requested range (EFetch ``seq_stop``).
        retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout in seconds, forwarded to ``requests.get``.

    Returns:
        The response body as text on the first successful attempt, or ``None``
        if every attempt raised a ``requests.RequestException`` (which
        includes HTTP error statuses via ``raise_for_status``).
    """
    import time  # local import: only needed for the pause between retries

    url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        f"?db=nuccore&id={contig}&seq_start={start}&seq_stop={stop}"
        "&rettype=gb&retmode=text"
    )
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            # Back off (1s, 2s, 4s, ...) before the next attempt instead of
            # immediately re-sending the same request; skip the pause after
            # the final attempt since nothing follows it.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
    return None

In [6]:
# Pull the nucleotide letters out of the ORIGIN section of a GenBank-style record
def extract_origin_sequence(data):
    """Return the sequence letters found between ``ORIGIN`` and ``//``.

    Args:
        data: Record text, or a falsy value (``None`` / empty string).

    Returns:
        All alphabetic characters from the lines that follow a line starting
        with ``ORIGIN``, up to (not including) the first subsequent line
        starting with ``//``, concatenated in order. Position numbers and
        whitespace are dropped. Returns ``""`` when ``data`` is falsy or no
        ``ORIGIN`` line is present.
    """
    if not data:
        return ""

    chunks = []
    in_origin = False
    for text_line in data.splitlines():
        if text_line.startswith("ORIGIN"):
            # The marker line itself carries no sequence data.
            in_origin = True
            continue
        if not in_origin:
            continue
        if text_line.startswith("//"):
            # End-of-record terminator: stop collecting.
            break
        chunks.append("".join(ch for ch in text_line if ch.isalpha()))
    return "".join(chunks)

In [8]:
# Fetch and parse the DNA sequence for a single table row
def get_dna_sequence(row):
    """Fetch the record for one row and return its nucleotide sequence.

    Args:
        row: Mapping with ``'Contig'``, ``'Start'`` and ``'Stop'`` entries.

    Returns:
        The sequence extracted by ``extract_origin_sequence``, or ``""`` when
        the fetch returned nothing or no sequence could be extracted. Progress
        and failure messages are printed along the way.
    """
    print(f"Processing row: Contig={row['Contig']}, Start={row['Start']}, Stop={row['Stop']}")
    data = fetch_dna_sequence(row['Contig'], row['Start'], row['Stop'])

    # Nothing came back from the fetch step.
    if not data:
        print(f"Failed to fetch data for Contig={row['Contig']}, Start={row['Start']}, Stop={row['Stop']}")
        return ""

    sequence = extract_origin_sequence(data)
    if sequence:
        # Show the first 50 nucleotides as a quick sanity check.
        print(f"Fetched sequence: {sequence[:50]}...")
        return sequence

    # A response was received but no sequence could be read from it;
    # dump the whole response to help debugging.
    print(f"Failed to extract sequence for Contig={row['Contig']}, Start={row['Start']}, Stop={row['Stop']}")
    print(data)
    return ""

In [9]:
# Per-row worker handed to the thread pool: resolves the sequence for one row
def process_row(row):
    """Return the DNA sequence for a single row by delegating to ``get_dna_sequence``."""
    sequence = get_dna_sequence(row)
    return sequence

In [None]:
# Apply the function with parallel processing to save time.
# Each row is converted to a dict and handed to process_row on a pool of up to
# 10 worker threads. executor.map yields results in the same order as its
# input, so results[i] corresponds to row i of df. Wrapping the iterator in
# tqdm shows a progress bar, and list() consumes it fully while the pool is
# still open.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(tqdm(executor.map(process_row, df.to_dict('records')), total=len(df)))

# Attach the fetched sequences as a new column; rows whose fetch or extraction
# failed carry an empty string (see get_dna_sequence).
df['full_sequence'] = results

# Save as a new CSV file (index omitted). Nothing is written until every row
# has been processed.
df.to_csv('fullseq_microbigge_ecoli.csv', index=False)

print("Processing complete.")