In [1]:
import csv
import time
from Bio import Entrez
from Bio import SeqIO

In [2]:
# Each dataset is described by a pair: the source CSV with the protein IDs
# and the FASTA file the downloaded sequences are written to.

# ABC ATPase (PARIS) dataset
csv_file_0, output_fasta_0 = (
    '../data/raw/articles/ABC_ATPase_PARIS.csv',
    '../data/processed/fasta_sequences/ABC_ATPase_PARIS.fasta',
)

# AAA ATPase 15-21 family dataset
csv_file_1, output_fasta_1 = (
    '../data/raw/articles/AAA_ATPase_15-21_family.csv',
    '../data/processed/fasta_sequences/AAA_ATPase_15-21_family.fasta',
)

In [3]:
# Contact address that Biopython attaches to the Entrez requests made below.
Entrez.email = "alyona.koshkareva@gmail.com"
# Number of IDs grouped into one "batch" by fetch_protein_sequences. It only
# controls progress messages and where the longer pause happens: every ID is
# still downloaded with its own efetch call.
BATCH_SIZE = 100
# Pause, in seconds (passed to time.sleep), after each batch has been processed.
DELAY_BETWEEN_BATCHES = 5
# Pause, in seconds (passed to time.sleep), between individual downloads.
DELAY_BETWEEN_SEQUENCES = 0.1

def _read_protein_ids(csv_file):
    """Return the unique protein IDs found in the third column of ``csv_file``.

    The first row is treated as a header and skipped. Rows with fewer than
    three columns, blank IDs and IDs starting with '#' are ignored. IDs are
    returned in first-seen order so repeated runs process them identically.
    """
    # newline='' is the mode the csv module documents for files given to csv.reader.
    with open(csv_file, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        raw_ids = [row[2].strip() for row in reader if len(row) > 2]

    # dict.fromkeys de-duplicates while keeping insertion order (a set would not).
    return list(dict.fromkeys(
        pid for pid in raw_ids
        if pid and not pid.startswith('#')
    ))


def _fetch_fasta_record(protein_id):
    """Download ``protein_id`` from the Entrez 'protein' database and return its first FASTA record.

    Any exception from Entrez.efetch or SeqIO.parse propagates to the caller,
    including StopIteration when the response holds no record. The network
    handle is closed in every case.
    """
    handle = Entrez.efetch(
        db="protein",
        id=protein_id,
        rettype="fasta",
        retmode="text"
    )
    try:
        return next(SeqIO.parse(handle, "fasta"))
    finally:
        handle.close()


def fetch_protein_sequences(csv_file, output_fasta):
    """Download the protein sequences listed in a CSV file and save them as FASTA.

    Args:
        csv_file: Path to a CSV file with a header row; protein IDs are read
            from the third column.
        output_fasta: Path of the FASTA file to write. It is only created or
            overwritten when at least one sequence was downloaded.

    Returns:
        None. Progress, per-ID errors and the final count are printed.

    IDs that fail to download are reported and skipped, so one bad ID does not
    abort the run.
    """
    # Step 1: Read and clean the IDs
    protein_ids = _read_protein_ids(csv_file)
    print(f"Found {len(protein_ids)} ID. Examples: {protein_ids[:5]}...")

    if not protein_ids:
        # Nothing to request; report it the same way as a run where every download failed.
        print("Couldn't load any sequences.")
        return

    # Step 2: Download from the database, one request per ID
    records = []
    for i in range(0, len(protein_ids), BATCH_SIZE):
        batch = protein_ids[i:i + BATCH_SIZE]
        print(f"Loading the batch {i//BATCH_SIZE + 1}")

        for protein_id in batch:
            try:
                records.append(_fetch_fasta_record(protein_id))
            except Exception as e:
                # Best effort: log the failing ID and carry on with the rest.
                print(f"Error loading {protein_id}: {str(e)}")
            finally:
                # Pause after failures too, so errors are not followed by an immediate request.
                time.sleep(DELAY_BETWEEN_SEQUENCES)

        # The longer pause is only needed when another batch follows.
        if i + BATCH_SIZE < len(protein_ids):
            time.sleep(DELAY_BETWEEN_BATCHES)

    # Step 3: Save the results
    if records:
        with open(output_fasta, 'w') as output:
            SeqIO.write(records, output, "fasta")
        print(f"Total saved {len(records)} protein sequences.")
    else:
        print("Couldn't load any sequences.")


In [4]:
# Download the ABC ATPase (PARIS) sequences.
fetch_protein_sequences(csv_file=csv_file_0, output_fasta=output_fasta_0)

# Download the AAA ATPase 15-21 family sequences.
fetch_protein_sequences(csv_file=csv_file_1, output_fasta=output_fasta_1)


Found 456 ID. Examples: ['WP_143237877.1', 'WP_124340352.1', 'WP_232077070.1', 'WP_007656013.1', 'WP_084545522.1']...
Loading the batch 1
Loading the batch 2
Loading the batch 3
Loading the batch 4
Loading the batch 5
Total saved 456 protein sequences.
Found 262 ID. Examples: ['WP_181825233.1', 'WP_088720954.1', 'WP_000908413.1', 'WP_193827114.1', 'WP_114247160.1']...
Loading the batch 1
Loading the batch 2
Error loading 2562294790: HTTP Error 400: Bad Request
Loading the batch 3
Total saved 261 protein sequences.
