In [3]:
from Bio import SeqIO
import re
import csv

# Path to the downloaded bat nucleotide FASTA file
fasta_file = "Bat-associated_viruses_nucleotide.fasta"

# Output CSV file where the accession number and virus name will be saved
output_file = "accession_and_virus_names.csv"

# GenBank accession inside a "gb|ACCESSION|" header field (e.g., KX871230).
# The dot in the character class lets an accession.version form such as
# "KX871230.1" match as well. Compiled once instead of once per record.
accession_pattern = re.compile(r'gb\|([A-Z0-9_.]+)\|')

# Records whose header has no recognizable accession. They are left out of
# the CSV because the accession is the lookup key for the NCBI metadata step.
skipped_records = 0

# Open the CSV file for writing (newline="" as the csv module expects)
with open(output_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    # Write the header row
    writer.writerow(["Accession", "Virus Name"])

    # Parse the FASTA file and extract the data
    for record in SeqIO.parse(fasta_file, "fasta"):
        header = record.description

        gb_match = accession_pattern.search(header)
        if gb_match is None:
            skipped_records += 1
            continue
        accession = gb_match.group(1)

        # Virus name: last "|"-separated field, cut at the first "("
        # (e.g., "Bat adenovirus isolate 250-A")
        virus_name = header.split("|")[-1].split("(")[0].strip()

        # Write the extracted data to the CSV file
        writer.writerow([accession, virus_name])

if skipped_records:
    print(f"Skipped {skipped_records} record(s) with no gb|ACCESSION| field in the header")


In [None]:
from Bio import Entrez, SeqIO
from datetime import datetime

# Always provide your email to NCBI Entrez: set it on the Entrez module
# before any Entrez request is made. Replace the address with your own
# when reusing this notebook.
Entrez.email = "ayush23160@iiitd.ac.in"

def get_metadata(accession):
    """
    Retrieve metadata for a given accession number.

    Fetches the GenBank text record for ``accession`` from the Entrez
    "nucleotide" database and reads two values from it. This is a
    best-effort lookup: any failure is printed and reported as
    ``(None, None)`` rather than raised.

    Args:
      - accession: Accession number to look up (e.g., "KX871230"). An empty
        or missing value returns ``(None, None)`` without contacting NCBI.

    Returns:
      - pub_date: The record's 'date' annotation as a string (if available).
        NOTE(review): presumably the date from the GenBank LOCUS line, which
        may not be the original publication date -- confirm.
      - host: First 'host' qualifier of the first 'source' feature
        (if available)
    """
    # Nothing to look up: avoid a network round trip that can only fail.
    if not accession:
        print("Error retrieving metadata: no accession number given")
        return None, None

    try:
        # Retrieve the GenBank record for the accession
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when parsing the record fails.
            handle.close()

        # Extract publication date (often found in record.annotations)
        pub_date = record.annotations.get('date', None)

        # Extract host from the first 'source' feature's qualifiers
        host = None
        for feature in record.features:
            if feature.type == "source":
                # `or [None]` also covers a present-but-empty qualifier list,
                # which would otherwise raise IndexError and discard pub_date.
                host_values = feature.qualifiers.get("host") or [None]
                host = host_values[0]
                break

        return pub_date, host
    except Exception as e:
        print(f"Error retrieving metadata for {accession}: {e}")
        return None, None

# Example usage:
# pub_date, host = get_metadata("KX871230")
# print("Publication Date:", pub_date, "Host:", host)


In [None]:
import csv

input_csv = "accession_and_virus_names.csv"
output_csv = "filtered_metadata.csv"

# Define the filtering criteria
# For example, keep records from the last 8 years (adjust as needed)
years_threshold = 8
current_year = datetime.now().year

# Date layouts tried in order: "02-JUL-2018", "2018-07-02", or just "2018".
# Note: %b is matched against the current locale's month abbreviations.
date_formats = ("%d-%b-%Y", "%Y-%m-%d", "%Y")


def _parse_pub_date(pub_date):
    """Parse pub_date using the first layout in date_formats that fits.

    Raises the error from the last layout tried when none of them match.
    """
    last_error = None
    for date_format in date_formats:
        try:
            return datetime.strptime(pub_date, date_format)
        except (ValueError, TypeError) as exc:
            last_error = exc
    raise last_error


# newline="" on both files, as the csv module expects for reading and writing
with open(input_csv, "r", newline="") as infile, open(output_csv, "w", newline="") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = ["Accession", "Virus Name", "Publication Date", "Host"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        accession = row["Accession"]
        virus_name = row["Virus Name"]

        # A blank accession cannot be looked up; skip it instead of sending
        # a request that can only fail.
        if not accession:
            print(f"Skipping row with no accession (virus name: {virus_name!r})")
            continue

        # Retrieve metadata from NCBI
        pub_date, host = get_metadata(accession)

        if not pub_date:
            print(f"No publication date found for {accession}")
            continue

        # Only the parsing step is guarded, so a failure while writing the
        # row is not misreported as a date-parsing error.
        try:
            date_obj = _parse_pub_date(pub_date)
        except (ValueError, TypeError) as e:
            print(f"Error parsing publication date for {accession}: {e}")
            continue

        # Check if the publication year is within the threshold
        if current_year - date_obj.year <= years_threshold:
            writer.writerow({
                "Accession": accession,
                "Virus Name": virus_name,
                "Publication Date": pub_date,
                "Host": host
            })
