In [1]:
#Import packages here
import requests
from tqdm import tqdm
import pandas as pd
import concurrent.futures
from bs4 import BeautifulSoup

In [2]:
df = pd.read_csv('microbigge_ecoli.csv')

In [3]:
df.shape

(23295, 18)

In [4]:
df['Type'].value_counts(normalize=True)

Type
AMR          0.504958
VIRULENCE    0.426830
STRESS       0.068212
Name: proportion, dtype: float64

In [5]:
# Define a function to fetch the DNA sequence from the NCBI website
def fetch_dna_sequence(contig, start, stop, retries=3, timeout=10):
    url = f"https://www.ncbi.nlm.nih.gov/nuccore/{contig}?from={start}&to={stop}"
    for _ in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
    return None

In [6]:
# Define a function to extract the nucleotide sequence from the ORIGIN section
def extract_origin_sequence(data):
    if data:
        soup = BeautifulSoup(data, 'html.parser')
        pre_tag = soup.find('pre', class_='genbank')
        if pre_tag:
            text = pre_tag.get_text()
            lines = text.splitlines()
            origin_started = False
            sequence = []
            for line in lines:
                if line.startswith("ORIGIN"):
                    origin_started = True
                elif origin_started:
                    if line.startswith("//"):
                        break
                    sequence.append(''.join(filter(str.isalpha, line)))
            return ''.join(sequence)
    return ""

In [7]:
# Process each row to get DNA sequence
def get_dna_sequence(row):
    print(f"Processing row: Contig={row['Contig']}, Start={row['Start']}, Stop={row['Stop']}")
    data = fetch_dna_sequence(row['Contig'], row['Start'], row['Stop'])
    if data:
        sequence = extract_origin_sequence(data)
        if sequence:
            print(f"Fetched sequence: {sequence[:50]}...")  # Print the first 50 characters of the sequence
            return sequence
        else:
            print(f"Failed to extract sequence for Contig={row['Contig']}, Start={row['Start']}, Stop={row['Stop']}")
            print(data)  # Print the full data for debugging
    else:
        print(f"Failed to fetch data for Contig={row['Contig']}, Start={row['Start']}, Stop={row['Stop']}")
    return ""

In [8]:
#Iterate through each row and get the DNA sequence for each row
def process_row(row):
    return get_dna_sequence(row)

In [9]:
# Apply the function with parallel processing to save time
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(tqdm(executor.map(process_row, df.to_dict('records')), total=len(df)))

df['full_sequence'] = results

# Save as a new CSV file
df.to_csv('fullseq_microbigge_ecoli.csv', index=False)

print("Processing complete.")

Processing row: Contig=DABDQO010000001.1, Start=24288, Stop=24620
Processing row: Contig=DABDQN010000004.1, Start=107535, Stop=108668
Processing row: Contig=DABDQR010000004.1, Start=132370, Stop=132702
Processing row: Contig=DABDQQ010000021.1, Start=60473, Stop=60907
Processing row: Contig=DABDQS010000004.1, Start=108137, Stop=108403
Processing row: Contig=DABDGS010000001.1, Start=198172, Stop=199530
Processing row: Contig=DABDGT010000001.1, Start=50148, Stop=51860
Processing row: Contig=DABDGY010000002.1, Start=355079, Stop=356437
Processing row: Contig=DABDGU010000001.1, Start=282807, Stop=284165
Processing row: Contig=DABDGW010000001.1, Start=210153, Stop=210419


  0%|          | 1/23295 [00:00<6:05:17,  1.06it/s]

Failed to extract sequence for Contig=DABDQQ010000021.1, Start=60473, Stop=60907
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE89416469E0E231_1436SI

  0%|          | 11/23295 [00:02<1:14:18,  5.22it/s]

Failed to extract sequence for Contig=DABDGX010000005.1, Start=176446, Stop=178248
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE8ACC2F69E0E251_1606

  0%|          | 12/23295 [00:02<1:24:17,  4.60it/s]

Failed to extract sequence for Contig=DABDHN010000002.1, Start=138633, Stop=140054
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE893A4D69E0E251_1477

  0%|          | 21/23295 [00:03<1:01:44,  6.28it/s]

Failed to extract sequence for Contig=DABDHL010000001.1, Start=219816, Stop=221174
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE8B43EA69E0E261_0016

  0%|          | 23/23295 [00:04<1:05:04,  5.96it/s]

Failed to extract sequence for Contig=DABDHC010000001.1, Start=269657, Stop=271015
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE8B45C569E0E261_0003

  0%|          | 24/23295 [00:04<1:14:09,  5.23it/s]

Failed to extract sequence for Contig=DABDHS010000001.1, Start=298447, Stop=298713
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE8800B269E0E271_1320

  0%|          | 31/23295 [00:05<1:00:52,  6.37it/s]

Failed to extract sequence for Contig=DABDHH010000001.1, Start=250939, Stop=252297
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE88F77369E0E281_1328

  0%|          | 34/23295 [00:06<59:58,  6.46it/s]  

Failed to extract sequence for Contig=DABDHP010000001.1, Start=238929, Stop=240287
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE8C149B69E0E281_1471

  0%|          | 38/23295 [00:06<43:49,  8.84it/s]

Failed to extract sequence for Contig=AATJRJ010000003.1, Start=185964, Stop=190217
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE89436769E0E281_1435

  0%|          | 40/23295 [00:07<1:10:54,  5.47it/s]

Failed to extract sequence for Contig=ABDCZV010000002.1, Start=51099, Stop=52232
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE8AD6DA69E0E281_1591SI

  0%|          | 41/23295 [00:08<1:51:44,  3.47it/s]

Failed to extract sequence for Contig=AATJRF010000002.1, Start=116206, Stop=117564
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- meta -->
    <meta name="robots" content="index,nofollow,noarchive" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="nuccore" /><meta name="ncbi_report" content="genbank" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="retrieve" /><meta name="ncbi_pdid" content="genbank" /><meta name="ncbi_sessionid" content="CE88071F69E0E2A1_1307