In [15]:
#Import packages here
import pandas as pd
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import concurrent.futures

In [10]:
# Load the input table from 'microbigge_ecoli.csv' (path is relative to the working directory)
df = pd.read_csv('microbigge_ecoli.csv')

In [11]:
# Size of the loaded table as (rows, columns)
df.shape

(23295, 18)

In [12]:
# Share of rows per 'Type' value; normalize=True yields proportions instead of raw counts
df['Type'].value_counts(normalize=True)

Type
AMR          0.504958
VIRULENCE    0.426830
STRESS       0.068212
Name: proportion, dtype: float64

In [16]:
# Fetch the NCBI nuccore page for a contig region, printing a message whenever a request fails
def fetch_dna_sequence(contig, start, stop, retries=3, timeout=10):
    """Download the NCBI nuccore record text for one region of a contig.

    Parameters
    ----------
    contig : value interpolated into the URL path as the record identifier.
    start, stop : values sent as the ``from`` / ``to`` query parameters.
    retries : maximum number of attempts before giving up.
    timeout : per-request timeout in seconds, passed to ``requests.get``.

    Returns
    -------
    The response body as ``str`` on success, or ``None`` when every attempt
    failed or the server answered with a non-retryable client error.

    No exception from ``requests`` propagates; each failure is printed instead.
    """
    # Local import: the notebook's import cell does not bring in `time`.
    import time

    # NOTE(review): this is the nuccore *web page* URL; it presumably returns HTML
    # rather than a GenBank flat file. Confirm the response really contains an
    # ORIGIN section; the E-utilities efetch endpoint may be the right source.
    url = f"https://www.ncbi.nlm.nih.gov/nuccore/{contig}?from={start}&to={stop}"
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            # Timeouts and connection errors carry no response; HTTP errors do.
            failed_response = getattr(e, "response", None)
            status = getattr(failed_response, "status_code", None)
            if status is not None and 400 <= status < 500 and status != 429:
                # A client error such as 404 will not change on retry, so stop now.
                # 429 (rate limited) is excluded because waiting can help.
                return None
            if attempt < retries - 1:
                # Exponential backoff (1s, 2s, 4s, ...) so retries do not hammer
                # a server that is already timing out.
                time.sleep(2 ** attempt)
    return None

In [6]:
# Pull the nucleotide letters out of the ORIGIN section of a record's text
def extract_origin_sequence(data):
    """Return the letters found between the ORIGIN line and the '//' terminator.

    Lines are scanned in order. Nothing is collected until a line starting with
    "ORIGIN" is seen; from then on every line contributes its alphabetic
    characters (digits and spaces are dropped) until a line starting with "//"
    ends the scan. Lines that themselves start with "ORIGIN" are never collected.

    Returns "" when `data` is empty/None or contains no ORIGIN section.
    """
    if not data:
        return ""

    in_origin = False
    pieces = []
    for text_line in data.splitlines():
        if text_line.startswith("ORIGIN"):
            in_origin = True
            continue
        if not in_origin:
            continue
        if text_line.startswith("//"):
            break
        pieces.append("".join(ch for ch in text_line if ch.isalpha()))
    return "".join(pieces)

In [7]:
# Fetch and parse the sequence for a single row (the caller adds the result to the DataFrame)
def get_dna_sequence(row):
    """Return the sequence string for one record.

    `row` must support lookup of 'Contig', 'Start' and 'Stop'; those values are
    passed to fetch_dna_sequence, and the fetched text is handed to
    extract_origin_sequence, whose result is returned unchanged.
    """
    page_text = fetch_dna_sequence(row['Contig'], row['Start'], row['Stop'])
    return extract_origin_sequence(page_text)

In [14]:
def process_row(row):
    """Per-row worker: delegate to get_dna_sequence and return its result."""
    sequence = get_dna_sequence(row)
    return sequence

In [17]:
# Fetch every row's sequence with a thread pool to expedite the run (one HTTP request per row)
# NOTE(review): the run log shows many read timeouts with 10 workers; this may be
# more concurrency than NCBI tolerates. Consider lowering MAX_WORKERS.
MAX_WORKERS = 10
OUTPUT_CSV = 'fullseq_microbigge_ecoli.csv'

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # executor.map returns results in input order, so results[i] belongs to row i of df
    results = list(tqdm(executor.map(process_row, df.to_dict('records')), total=len(df)))

# Add results of concurrent.futures work to the DataFrame
df['full_sequence'] = results

# A failed fetch (or a response without an ORIGIN section) comes back as an empty
# string; report how many rows are affected so the gaps are not silent.
n_missing = sum(1 for seq in results if not seq)
if n_missing:
    print(f"Warning: {n_missing} of {len(results)} rows have an empty full_sequence")

# Save the result as a new CSV file
df.to_csv(OUTPUT_CSV, index=False)



  8%|▊         | 1926/23295 [05:40<48:56,  7.28it/s]  

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVQG010000001.1?from=202121&to=203542: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)


  8%|▊         | 1926/23295 [09:04<48:56,  7.28it/s]

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVQJ010000003.1?from=177596&to=180742: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVQC010000001.1?from=105933&to=107354: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVQF010000002.1?from=245139&to=246158: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVQE010000001.1?from=144181&to=145602: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVPZ010000001.1?from=46840&to=47373: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAVVQA010000001

 13%|█▎        | 3033/23295 [12:32<46:45,  7.22it/s]    

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/NZ_JVFI01000002.1?from=41753&to=43645: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /nuccore/NZ_JVFI01000002.1?from=41753&to=43645 (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_DECODE_ERROR] tlsv1 alert decode error (_ssl.c:1122)')))


 25%|██▌       | 5889/23295 [20:49<1:39:35,  2.91it/s]

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATDFU010000002.1?from=98314&to=99447: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATDFT010000003.1?from=235993&to=237012: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AAYQRO010000001.1?from=147685&to=148338: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATDFX010000001.1?from=50442&to=50975: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATDFV010000015.1?from=30564&to=31697: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATDGA010000005.1?f

 40%|████      | 9358/23295 [59:14<39:27,  5.89it/s]    

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATEPK010000001.1?from=22502&to=27067: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATEOS010000001.1?from=214798&to=219051: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATEOY010000001.1?from=150446&to=150712: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATEOX010000001.1?from=137365&to=138723: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATEPC010000006.1?from=70338&to=72050: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AATEPE010000001.1

 50%|█████     | 11664/23295 [1:06:35<1:25:04,  2.28it/s]

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/ABDCBI010000003.1?from=17609&to=18967: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out.


 54%|█████▍    | 12621/23295 [1:12:25<21:02,  8.46it/s]   

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUPS010000007.1?from=112317&to=115421: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUPQ010000003.1?from=227&to=880: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUPP010000001.1?from=256778&to=258136: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUPR010000001.1?from=318429&to=319787: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUPN010000004.1?from=154921&to=155493: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUPK010000010.1?f

 56%|█████▌    | 13006/23295 [1:13:39<47:59,  3.57it/s]   

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASUBX010000001.1?from=158812&to=159144: 404 Client Error: Not Found for url: https://www.ncbi.nlm.nih.gov/nuccore/AASUBX010000001.1?from=158812&to=159144


 69%|██████▉   | 16078/23295 [1:23:05<19:38,  6.13it/s]  

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/DABIJL010000002.1?from=270624&to=273251: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/DABIJJ010000001.1?from=37186&to=38988: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/DABIJG010000001.1?from=289279&to=290286: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/DABIJI010000001.1?from=131136&to=133763: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/DABIJF010000001.1?from=325501&to=330057: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
Error fetching https://www.ncbi.nlm.nih.gov/nuccore/DABIJE010000001

 84%|████████▍ | 19676/23295 [1:39:10<07:53,  7.64it/s]   

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASNKJ010000001.1?from=90976&to=92397: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 84%|████████▍ | 19677/23295 [1:39:40<6:04:37,  6.05s/it]

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASNKK010000001.1?from=268757&to=270115: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)


 84%|████████▍ | 19678/23295 [1:39:42<5:08:21,  5.12s/it]

Error fetching https://www.ncbi.nlm.nih.gov/nuccore/AASNKD010000002.1?from=90981&to=92402: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)


100%|██████████| 23295/23295 [1:50:20<00:00,  3.52it/s]  
