In [None]:
# from Bio import SeqIO
# import re
# import csv

# # Path to your downloaded bat nucleotide FASTA file
# fasta_file = "Bat-associated_viruses_nucleotide.fasta"

# # Output CSV file where the accession number and virus name will be saved
# output_file = "accession_and_virus_names.csv"

# # Open the CSV file for writing
# with open(output_file, "w", newline="") as csvfile:
#     writer = csv.writer(csvfile)
#     # Write the header row
#     writer.writerow(["Accession", "Virus Name"])
    
#     # Parse the FASTA file and extract the data
#     for record in SeqIO.parse(fasta_file, "fasta"):
#         header = record.description
        
#         # Extract GenBank accession number using regex (e.g., KX871230)
#         gb_match = re.search(r'gb\|([A-Z0-9_]+)\|', header)
#         accession = gb_match.group(1) if gb_match else None
        
#         # Extract virus name (e.g., "Bat adenovirus isolate 250-A")
#         virus_name = header.split("|")[-1].split("(")[0].strip()
        
#         # Write the extracted data to the CSV file
#         writer.writerow([accession, virus_name])


In [10]:
from Bio import Entrez, SeqIO
from datetime import datetime
import csv
import time

# Always provide your email to NCBI Entrez
Entrez.email = "your_email_id"  # Replace with your email

# Records whose date year is more than `years_threshold` calendar years before
# `current_year` are dropped by the filtering loop further down in this cell.
years_threshold = 10  # Define filtering criteria (last 10 years)
# Evaluated once when the cell runs; only the year component is compared.
current_year = datetime.now().year

def get_metadata(accession):
    """
    Retrieve the record date and host for a GenBank nucleotide accession.

    Fetches the GenBank flat file for ``accession`` through Entrez and reads
    the record-level ``date`` annotation plus the ``host`` qualifier of the
    first ``source`` feature.

    NOTE(review): Biopython's GenBank parser appears to fill ``date`` from the
    LOCUS line (the record's last-update date), which is not necessarily the
    date of publication -- confirm this is the date the filter should use.

    Args:
        accession: GenBank nucleotide accession number.

    Returns:
        A ``(pub_date, host)`` tuple:
          - pub_date: Date string from the record annotations, or None.
          - host: Host organism string, or None when the record has no
            ``source`` feature or no ``host`` qualifier.
        ``(None, None)`` is returned, after printing the error, when the
        fetch or the parse fails.
    """
    try:
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when the response cannot be parsed.
            handle.close()

        pub_date = record.annotations.get("date")

        # Only the first "source" feature is consulted.
        source = next((f for f in record.features if f.type == "source"), None)
        host = None
        if source is not None:
            # `or [None]` also covers an empty qualifier list, which would
            # otherwise raise IndexError and discard the date as well.
            host = (source.qualifiers.get("host") or [None])[0]

        return pub_date, host
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller skip the row.
        print(f"Error retrieving metadata for {accession}: {e}")
        return None, None

def get_taxonomy_id(accession):
    """Fetch the NCBI taxonomy ID for the given GenBank accession number.

    Downloads the GenBank record and scans the ``db_xref`` qualifiers of its
    ``source`` features for an entry of the form ``taxon:<id>``.

    Args:
        accession: GenBank nucleotide accession number.

    Returns:
        The taxonomy ID as a string, ``"Not found"`` when no ``taxon:``
        cross-reference exists, or ``"Error"`` (after printing the error)
        when the fetch or the parse fails.
    """
    try:
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when the response cannot be parsed.
            handle.close()

        for feature in record.features:
            if feature.type != "source":
                continue
            for xref in feature.qualifiers.get("db_xref", []):
                if xref.startswith("taxon:"):
                    # Split once so everything after the prefix is returned.
                    return xref.split(":", 1)[1]
        return "Not found"
    except Exception as e:
        # Best-effort lookup: callers treat "Error" as a sentinel value.
        print(f"Error retrieving taxonomy ID for {accession}: {e}")
        return "Error"

# Input and output files
input_csv = "sub-data.csv"  # Input file
output_csv = "combined_output.csv"  # Output file

# Date layouts tried, in order, when parsing a record's date string.
date_formats = ("%d-%b-%Y", "%Y-%m-%d", "%Y")

# newline="" on both files, as the csv module recommends, so embedded newlines
# in quoted fields are handled by the csv reader/writer rather than by open().
with open(input_csv, "r", newline="") as infile, open(output_csv, "w", newline="") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = ["Accession", "Virus Name", "Publication Date", "Host", "Taxonomy ID"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        accession = row["Accession"]
        virus_name = row["Virus Name"]

        # Fetch metadata
        pub_date, host = get_metadata(accession)

        # Parse publication date and check threshold
        within_threshold = False
        if pub_date:
            date_obj = None
            parse_error = None
            for date_format in date_formats:
                try:
                    date_obj = datetime.strptime(pub_date, date_format)
                    break
                except (ValueError, TypeError) as exc:
                    # Remember the most recent failure for the message below.
                    parse_error = exc

            if date_obj is None:
                print(f"Error parsing publication date for {accession}: {parse_error}")
            else:
                # Comparison is by calendar year only, not by exact date.
                within_threshold = current_year - date_obj.year <= years_threshold

        # Save only if within threshold
        if within_threshold:
            # Fetch the taxonomy ID only for rows that are actually written;
            # this avoids a second network request for every discarded row.
            tax_id = get_taxonomy_id(accession)
            writer.writerow({
                "Accession": accession,
                "Virus Name": virus_name,
                "Publication Date": pub_date,
                "Host": host,
                "Taxonomy ID": tax_id
            })
        time.sleep(1)  # Comply with NCBI rate limits


In [11]:
from Bio import Entrez
import pandas as pd
import time
import requests
from tqdm import tqdm
# Contact address for NCBI Entrez requests made from this cell; the value
# below is a placeholder and must be replaced before running.
Entrez.email = "your_email_id"  # Use your email

def is_zoonotic(tax_id):
    """Check if the virus has human host records in GenBank.

    Runs an Entrez nucleotide search for sequences of the given taxon that are
    also matched by ``Homo sapiens[Host]`` and reports whether the hit count
    is non-zero.

    Args:
        tax_id: NCBI taxonomy ID; interpolated as-is into the search term.

    Returns:
        True if the search reports at least one matching record, otherwise
        False. False is also returned (after printing the error) when the
        request or the response parsing fails, so a False result does not
        distinguish "no human records" from "lookup failed".
    """
    try:
        # Search for sequences from this virus in humans
        query = f"txid{tax_id}[Organism] AND Homo sapiens[Host]"
        handle = Entrez.esearch(db="nucleotide", term=query, retmax=1)
        try:
            result = Entrez.read(handle)
        finally:
            # Release the connection even when the response cannot be parsed.
            handle.close()
        return int(result["Count"]) > 0  # True if human-associated sequences exist
    except Exception as e:
        print(f"Error for tax ID {tax_id}: {e}")
        return False

# Load your dataset (with Taxonomy IDs)
df = pd.read_csv("combined_output.csv")

# Check zoonotic potential for each virus
zoonotic_flags = []
for tax_id in tqdm(df["Taxonomy ID"], desc="Checking zoonotic potential"):
    # "Not found" / "Error" are the sentinels returned by get_taxonomy_id;
    # an empty CSV cell is read back by pandas as NaN. None of these can be
    # queried, so flag them False without making a request (and without
    # spending a rate-limit pause).
    if pd.isna(tax_id) or tax_id in ["Not found", "Error"]:
        zoonotic_flags.append(False)
        continue

    # If the column contains NaN, pandas reads numeric IDs as floats
    # (9606 -> 9606.0); convert back so the search term is "txid9606".
    if isinstance(tax_id, float) and tax_id.is_integer():
        tax_id = int(tax_id)

    zoonotic_flags.append(is_zoonotic(tax_id))
    time.sleep(1)  # Rate limiting

# Add results to DataFrame
df["Zoonotic Potential"] = zoonotic_flags

# Save results
df.to_csv("zoonotic_assessment.csv", index=False)

Checking zoonotic potential: 100%|██████████| 26/26 [00:48<00:00,  1.87s/it]
