In [7]:
# !pip install biopython

from Bio import Entrez
from Bio import SeqIO
import time
import os
import random
import pandas

# Download Genomes

In [15]:
# This Colab notebook downloads a specific list of viral genomes from NCBI
# using a predefined dictionary of virus names and their accession numbers.


# ALWAYS set your email address for NCBI Entrez.
# This is a requirement from NCBI.
Entrez.email = "maiwald.aaron@outlook.de"
# A tool name is also recommended.
Entrez.tool = "ResearchSprint"

# A dictionary mapping virus names to their NCBI accession numbers.
# This ensures a direct and precise download for each virus on your list.
virus_accessions = {
    "Human immunodeficiency virus 1": "NC_001802.1", #checked
    "Human alphaherpesvirus 3": " NC_001348.1", # check
    "Chikungunya virus": "NC_004162.2", # check
    "Human coronavirus 229E": "NC_002645.1", # check
    "Human coronavirus NL63": "NC_005831.2", # check
    "Human coronavirus OC43": "NC_006213.1", # check
    "Human rhinovirus A": "NC_001490.1",
    "Human rhinovirus B": "NC_001490.1", # Using same as Rhinoviruses are complex. Can change as needed.
    "Human adenovirus 54": "NC_012959.1", # check
    "SARS-CoV-2": "NC_045512.2", #check
    "Ebola virus": "NC_002549.1", #check
    "Astrovirus MLB1": "	NC_011400", #check
    "Influenza A virus H1N1": "NC_026433.1", # check
    "Influenza A virus H2N2": "NC_007357.1", #check
    "Influenza A virus H3N2": "NC_007361.1", #check
    "Measles morbillivirus": "NC_001498.1",
    "MERS virus": "NC_019843.3",
    "Human metapneumovirus": "NC_004148.2",
    "Human herpesvirus 4 type 2": "NC_009334.1",
    "Monkeypox virus": "NC_063383.1",
    "Mumps orthorubulavirus": "NC_002200.1",
    "Henipavirus nipahense": "NC_002728.1",
    "Human parainfluenza virus 4a": "NC_003461.1",
    "Human respirovirus 3": "NC_001796.1",
    "Parainfluenza virus 5": "MT160087.1", # check
    "Human adenovirus 1": "NC_001405.1",
    "Human adenovirus 5": "NC_001405.1", # same as human adenovirus 1
    "Human respirovirus": "NC_001804.1", # Assuming this is RSV
    "Rubella virus": "NC_001545.1"
}

def download_specific_genomes(virus_accessions, output_dir="viral_genomes_specific"):
  """
  Downloads viral genomes for a given dictionary of virus names and their accession IDs.
  The genomes are saved in separate files, named by the virus.

  Args:
    virus_accessions (dict): A dictionary mapping virus names to accession IDs.
    output_dir (str): The directory to save the genome files.
  """
  if not virus_accessions:
    print("No virus accessions provided. Exiting.")
    return

  # Create output directory if it doesn't exist.
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)

  print(f"\nDownloading {len(virus_accessions)} specific genomes to '{output_dir}/'...")

  # The database is "nucleotide" and the format is FASTA.
  db = "nucleotide"
  rettype = "fasta"

  for virus_name, accession_id in virus_accessions.items():
    try:
      # Use Entrez.efetch to download the FASTA record for the specific accession ID.
      handle = Entrez.efetch(db=db, id=accession_id, rettype=rettype, retmode="text")
      record = handle.read()
      handle.close()

      # Clean up the virus name to be a valid filename.
      file_name = virus_name.replace(" ", "_").replace("/", "_").replace(",", "_")
      file_path = os.path.join(output_dir, f"{file_name}_{accession_id}.fasta")

      with open(file_path, "w") as f:
        f.write(record)

      print(f"Downloaded '{virus_name}' (Accession: {accession_id}) to '{file_path}'")

    except Exception as e:
      print(f"An error occurred while downloading {virus_name} (Accession: {accession_id}): {e}")

    # Pause to respect NCBI's rate limits.
    time.sleep(random.uniform(0.5, 1.0))

# --- Main script execution ---

# Step 1: Download all specified genomes.
download_specific_genomes(virus_accessions)

print("\nDownload process completed.")

# To run this in a Colab notebook, you can simply paste the code.
# The `!pip install biopython` command will handle the dependency.
# The code will create a folder named `viral_genomes_specific` and save the
# downloaded FASTA files inside it, named by the virus.



Downloading 31 specific genomes to 'viral_genomes_specific/'...
Downloaded 'Human immunodeficiency virus 1' (Accession: NC_001802.1) to 'viral_genomes_specific/Human_immunodeficiency_virus_1_NC_001802.1.fasta'
Downloaded 'Human alphaherpesvirus 3' (Accession:  NC_001348.1) to 'viral_genomes_specific/Human_alphaherpesvirus_3_ NC_001348.1.fasta'
Downloaded 'Chikungunya virus' (Accession: NC_004162.2) to 'viral_genomes_specific/Chikungunya_virus_NC_004162.2.fasta'
Downloaded 'Human coronavirus 229E' (Accession: NC_002645.1) to 'viral_genomes_specific/Human_coronavirus_229E_NC_002645.1.fasta'
Downloaded 'Human coronavirus NL63' (Accession: NC_005831.2) to 'viral_genomes_specific/Human_coronavirus_NL63_NC_005831.2.fasta'
Downloaded 'Human coronavirus OC43' (Accession: NC_006213.1) to 'viral_genomes_specific/Human_coronavirus_OC43_NC_006213.1.fasta'
Downloaded 'Human rhinovirus A' (Accession: NC_001490.1) to 'viral_genomes_specific/Human_rhinovirus_A_NC_001490.1.fasta'
Downloaded 'Human rhi

# Create CSV dataset

In [27]:
import pandas as pd

def load_genomes_as_dataframe(input_dir="viral_genomes_specific"):
    """
    Loads all downloaded FASTA files into a pandas DataFrame.
    """
    data = []

    # Get a list of all FASTA files in the input directory.
    fasta_files = [f for f in os.listdir(input_dir) if f.endswith(".fasta")]

    if not fasta_files:
        print(f"No FASTA files found in directory '{input_dir}'.")
        return pd.DataFrame()

    # Create a reverse mapping for robust lookup
    accession_to_info = {info[0]: name for name, info in virus_accessions.items()}
    print(accession_to_info)

    for filename in fasta_files:
        file_path = os.path.join(input_dir, filename)

        try:
            with open(file_path, "r") as f:
                record = SeqIO.read(f, "fasta")

                # get the header of the record
                header = record.description
                accession_id = record.id

                #split after first space " " and before first ","
                header = header.split(" ", 1)[1]
                virus_name = header.split(",")[0]

                print(header)

                data.append({
                    "accession_id": accession_id,
                    "virus_name": virus_name,
                    "genome_sequence": str(record.seq),
                    "label": "natural"
                })

        except Exception as e:
            print(f"Error processing file {filename}: {e}")

    return pd.DataFrame(data)

# Define the directory for downloads and the output file for the dataset.
download_dir = "viral_genomes_specific"

# Step 2: Load the downloaded genomes into a single DataFrame.
all_genomes_df = load_genomes_as_dataframe(input_dir=download_dir)


{'N': 'Rubella virus', ' ': 'Human alphaherpesvirus 3', '\t': 'Astrovirus MLB1', 'M': 'Parainfluenza virus 5'}
Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Parainfluenza virus 5 strain L, complete genome
Monkeypox virus, complete genome
Human herpesvirus 4, complete genome
Influenza A virus (A/Goose/Guangdong/1/96(H5N1)) neuraminidase (NA) gene, complete cds
Measles virus, complete genome
Rhinovirus B14, complete sequence
Influenza A virus (A/Goose/Guangdong/1/96(H5N1)) polymerase (PB2) gene, complete cds
Human parainfluenza virus 1, complete genome
Zaire ebolavirus isolate Ebola virus/H.sapiens-tc/COD/1976/Yambuku-Mayinga, complete genome
Rhopalomyia pomum mitochondrion, complete genome
Chikungunya virus, complete genome
Human immunodeficiency virus 1, complete genome
Nipah virus, complete genome
Human coronavirus OC43 strain ATCC VR-759, complete genome
Human adenovirus C, complete genome
Bovine parainfluenza virus 3, complete genome
Human adeno

In [55]:
name_to_fam_dict = {
    "Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1": "Coronaviridae",
    "Parainfluenza virus 5 strain L": "Paramyxoviridae",
    "Monkeypox virus": "Poxviridae",
    "Human herpesvirus 4": "Herpesviridae",
    "Influenza A virus (A/Goose/Guangdong/1/96(H5N1)) neuraminidase (NA) gene": "Orthomyxoviridae",
    "Measles virus": "Paramyxoviridae",
    "Rhinovirus B14": "Picornaviridae",
    "Influenza A virus (A/Goose/Guangdong/1/96(H5N1)) polymerase (PB2) gene": "Orthomyxoviridae",
    "Human parainfluenza virus 1": "Paramyxoviridae",
    "Zaire ebolavirus isolate Ebola virus/H.sapiens-tc/COD/1976/Yambuku-Mayinga": "Filoviridae",
    "Rhopalomyia pomum mitochondrion": "Non-viral",
    "Chikungunya virus": "Togaviridae",
    "Human immunodeficiency virus 1": "Retroviridae",
    "Nipah virus": "Paramyxoviridae",
    "Human coronavirus OC43 strain ATCC VR-759": "Coronaviridae",
    "Human adenovirus C": "Adenoviridae",
    "Bovine parainfluenza virus 3": "Paramyxoviridae",
    "Mumps orthorubulavirus genomic RNA": "Paramyxoviridae",
    "Human metapneumovirus": "Pneumoviridae",
    "Rubella virus": "Togaviridae",
    "Human parainfluenza virus 3 strain JS": "Paramyxoviridae",
    "Human Coronavirus NL63": "Coronaviridae",
    "Influenza A virus (A/California/07/2009(H1N1)) segment 4 hemagglutinin (HA) gene": "Orthomyxoviridae",
    "Middle East respiratory syndrome-related coronavirus isolate HCoV-EMC/2012": "Coronaviridae",
    "Human coronavirus 229E": "Coronaviridae",
    "Astrovirus MLB1": "Astroviridae",
    "Human herpesvirus 3": "Herpesviridae",
    "Human adenovirus 54": "Adenoviridae"
}

all_genomes_df['family'] = all_genomes_df['virus_name'].map(name_to_fam_dict)
all_genomes_df

Unnamed: 0,accession_id,virus_name,genome_sequence,label,family
0,NC_045512.2,Severe acute respiratory syndrome coronavirus ...,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,natural,Coronaviridae
1,MT160087.1,Parainfluenza virus 5 strain L,ACCAGGGGGAAAACGAAGTGGTGATTCAAATCATAGAAGACACTCG...,natural,Paramyxoviridae
2,NC_063383.1,Monkeypox virus,ATTTTACTATTTTATTTAGTGTCTAGAAAAAAATGTGTGACCCACG...,natural,Poxviridae
3,NC_009334.1,Human herpesvirus 4,AGAATTTGTCTTGCTCTATTCACCGTTACTTTTCTTCTTGCCCGTT...,natural,Herpesviridae
4,NC_007361.1,Influenza A virus (A/Goose/Guangdong/1/96(H5N1...,AGCAAAAGCAGGAGATTAAAATGAATCCAAATCAGAAGATAATAAC...,natural,Orthomyxoviridae
5,NC_001498.1,Measles virus,ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCT...,natural,Paramyxoviridae
6,NC_001490.1,Rhinovirus B14,TTAAAACAGCGGATGGGTATCCCACCATTCGACCCATTGGGTGTAG...,natural,Picornaviridae
7,NC_007357.1,Influenza A virus (A/Goose/Guangdong/1/96(H5N1...,AGCAAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAAC...,natural,Orthomyxoviridae
8,NC_003461.1,Human parainfluenza virus 1,ACCAAACAAGAGGAAAAACTTGTTTGGAATATATAATAATATTAAA...,natural,Paramyxoviridae
9,NC_002549.1,Zaire ebolavirus isolate Ebola virus/H.sapiens...,CGGACACACAAAAAGAAAGAAGAATTTTTAGGATCTTTTGTGTGCG...,natural,Filoviridae


In [35]:
print(all_genomes_df['genome_sequence'].str.len())

0      29903
1      15246
2     197209
3     172764
4       1458
5      15894
6       7212
7       2341
8      15600
9      18959
10     14503
11     11826
12      9181
13     18246
14     30741
15     35937
16     15456
17     35937
18     15384
19     13335
20      9755
21     15462
22     27553
23      1701
24     30119
25     27317
26      6171
27    124884
28     34920
29      7212
Name: genome_sequence, dtype: int64


# Split the data

In [68]:
def create_random_split_dataset(df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """
    Randomly assigns 'train', 'val', or 'test' splits to each genome.
    """
    print("\nCreating dataset with random splits...")

    # Shuffle the DataFrame rows.
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Calculate split sizes.
    total_count = len(df_shuffled)
    train_count = int(total_count * train_ratio)
    val_count = int(total_count * val_ratio)

    # Assign splits.
    df_shuffled['split_id'] = 'test'
    df_shuffled.loc[:train_count, 'split_id'] = 'train'
    df_shuffled.loc[train_count:train_count+val_count, 'split_id'] = 'val'

    print("Random split distribution:")
    print(df_shuffled['split_id'].value_counts())

    return df_shuffled

def create_family_split_dataset(df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """
    Assigns 'train', 'val', or 'test' splits based on viral family while respecting genome count ratios.
    All members of a family are in the same split.
    """
    print("\nCreating dataset with family-based splits...")

    # Group the DataFrame by family and count the number of genomes in each.
    family_counts = df.groupby('family').size().to_dict()
    unique_families = list(family_counts.keys())
    random.shuffle(unique_families)

    # Calculate target genome counts for each split.
    total_genomes = len(df)
    target_train_count = int(total_genomes * train_ratio)
    target_val_count = int(total_genomes * val_ratio)
    target_test_count = total_genomes - target_train_count - target_val_count

    # Initialize splits and current counts.
    family_splits = {}
    current_counts = {'train': 0, 'val': 0, 'test': 0}

    for family in unique_families:
        genome_count = family_counts[family]

        # Determine the best split for this family to balance the counts.
        # Find the split with the most remaining capacity.
        train_capacity = target_train_count - current_counts['train']
        val_capacity = target_val_count - current_counts['val']
        test_capacity = target_test_count - current_counts['test']

        capacities = [
            (train_capacity, 'train'),
            (val_capacity, 'val'),
            (test_capacity, 'test')
        ]

        # Sort by capacity in descending order.
        capacities.sort(key=lambda x: x[0], reverse=True)

        # Assign the family to the split with the most capacity, if it fits.
        for capacity, split_name in capacities:
            if capacity >= genome_count or capacity == max(cap[0] for cap in capacities):
                family_splits[family] = split_name
                current_counts[split_name] += genome_count
                break

    # Assign split_id to each genome based on its family.
    df['split_id'] = df['family'].map(family_splits)

    print("Family-based split distribution:")
    print(df['split_id'].value_counts())

    return df


random_split_file = "random_split_dataset.csv"
family_split_file = "family_split_dataset.csv"


if not all_genomes_df.empty:
    # Step 3: Create the random split dataset.
    random_split_df = create_random_split_dataset(all_genomes_df.copy())
    random_split_df.to_csv(random_split_file, index=False)
    print(f"\nRandom-split dataset created successfully. Saved to '{random_split_file}'.")

    # Step 4: Create the family-based split dataset.
    family_split_df = create_family_split_dataset(all_genomes_df.copy())
    family_split_df.to_csv(family_split_file, index=False)
    print(f"\nFamily-split dataset created successfully. Saved to '{family_split_file}'.")

print("\nFull process (download and dataset creation) completed.")



Creating dataset with random splits...
Random split distribution:
split_id
train    21
val       5
test      4
Name: count, dtype: int64

Random-split dataset created successfully. Saved to 'random_split_dataset.csv'.

Creating dataset with family-based splits...
Family-based split distribution:
split_id
train    21
val       5
test      4
Name: count, dtype: int64

Family-split dataset created successfully. Saved to 'family_split_dataset.csv'.

Full process (download and dataset creation) completed.


In [67]:
family_split_df

Unnamed: 0,accession_id,virus_name,genome_sequence,label,family,split_id
0,NC_045512.2,Severe acute respiratory syndrome coronavirus ...,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,natural,Coronaviridae,train
1,MT160087.1,Parainfluenza virus 5 strain L,ACCAGGGGGAAAACGAAGTGGTGATTCAAATCATAGAAGACACTCG...,natural,Paramyxoviridae,train
2,NC_063383.1,Monkeypox virus,ATTTTACTATTTTATTTAGTGTCTAGAAAAAAATGTGTGACCCACG...,natural,Poxviridae,test
3,NC_009334.1,Human herpesvirus 4,AGAATTTGTCTTGCTCTATTCACCGTTACTTTTCTTCTTGCCCGTT...,natural,Herpesviridae,train
4,NC_007361.1,Influenza A virus (A/Goose/Guangdong/1/96(H5N1...,AGCAAAAGCAGGAGATTAAAATGAATCCAAATCAGAAGATAATAAC...,natural,Orthomyxoviridae,train
5,NC_001498.1,Measles virus,ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCT...,natural,Paramyxoviridae,train
6,NC_001490.1,Rhinovirus B14,TTAAAACAGCGGATGGGTATCCCACCATTCGACCCATTGGGTGTAG...,natural,Picornaviridae,train
7,NC_007357.1,Influenza A virus (A/Goose/Guangdong/1/96(H5N1...,AGCAAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAAC...,natural,Orthomyxoviridae,train
8,NC_003461.1,Human parainfluenza virus 1,ACCAAACAAGAGGAAAAACTTGTTTGGAATATATAATAATATTAAA...,natural,Paramyxoviridae,train
9,NC_002549.1,Zaire ebolavirus isolate Ebola virus/H.sapiens...,CGGACACACAAAAAGAAAGAAGAATTTTTAGGATCTTTTGTGTGCG...,natural,Filoviridae,train


In [79]:
!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
!conda install -c bioconda -c conda-forge insilicoseq --yes

--2025-09-12 19:46:41--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.32.241, 104.16.191.158, 2606:4700::6810:bf9e, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.32.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 162129736 (155M) [application/octet-stream]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh.1’


2025-09-12 19:46:42 (181 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh.1’ saved [162129736/162129736]

PREFIX=/usr/local
Unpacking bootstrapper...
Unpacking payload...

Installing base environment...

Preparing transaction: ...working... done
Executing transaction: ...working... done
installation finished.
    You currently have a PYTHONPATH environment variable set. This may cause
    unexpected behavior when running the Python interpreter in Miniconda3.
    For best results, please verify that your PYTHONPATH only points to
    directories of packages that 

In [83]:
import subprocess

def generate_simulated_reads(input_dir="viral_genomes_specific", output_dir="simulated_reads", model="MiSeq", n_reads=10000):
  """
  Generates simulated sequencing reads for each genome in a directory.
  The reads are saved as FASTQ files, organized by virus.

  Args:
    input_dir (str): The directory containing the FASTA genome files.
    output_dir (str): The directory to save the simulated reads.
    model (str): The InSilicoSeq error model to use (e.g., MiSeq, HiSeq).
    n_reads (int): The number of reads to generate per genome.
  """
  # Get a list of all FASTA files.
  fasta_files = [f for f in os.listdir(input_dir) if f.endswith(".fasta")]
  if not fasta_files:
      print(f"No FASTA files found in directory '{input_dir}'. Skipping read generation.")
      return

  # Create output directory if it doesn't exist.
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)

  print("\nGenerating simulated reads...")

  for fname in fasta_files:
    file_path = os.path.join(input_dir, fname)
    virus_name = os.path.splitext(fname)[0] # e.g., Human_coronavirus_OC43_NC_005147.1

    # Create a unique output subdirectory for each virus.
    virus_output_dir = os.path.join(output_dir, virus_name)
    if not os.path.exists(virus_output_dir):
        os.makedirs(virus_output_dir)

    # The `iss generate` command for a single genome.
    command = [
        "iss", "generate",
        "--genomes", file_path,
        "--model", model,
        "--n_reads", str(n_reads),
        "--output", os.path.join(virus_output_dir, "reads"),
        "--cpus", "2"
    ]

    try:
        # Use subprocess to run the command and capture output
        print(f"  Generating reads for {fname}...")
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        print(f"  Read generation for {fname} completed.")

    except subprocess.CalledProcessError as e:
        print(f"An error occurred while running InSilicoSeq for {fname}:")
        print(f"Command: {e.cmd}")
        print(f"Return code: {e.returncode}")
        print(f"Standard output:\n{e.stdout}")
        print(f"Standard error:\n{e.stderr}")
    except FileNotFoundError:
        print("Error: InSilicoSeq command not found. Please ensure it is installed correctly (e.g., via `!conda install -c bioconda -c conda-forge insilicoseq --yes`).")


generate_simulated_reads()



Generating simulated reads...
  Generating reads for SARS-CoV-2_NC_045512.2.fasta...
  Read generation for SARS-CoV-2_NC_045512.2.fasta completed.
  Generating reads for Parainfluenza_virus_5_MT160087.1.fasta...
  Read generation for Parainfluenza_virus_5_MT160087.1.fasta completed.
  Generating reads for Monkeypox_virus_NC_063383.1.fasta...
  Read generation for Monkeypox_virus_NC_063383.1.fasta completed.
  Generating reads for Human_herpesvirus_4_type_2_NC_009334.1.fasta...
  Read generation for Human_herpesvirus_4_type_2_NC_009334.1.fasta completed.
  Generating reads for Influenza_A_virus_H3N2_NC_007361.1.fasta...
  Read generation for Influenza_A_virus_H3N2_NC_007361.1.fasta completed.
  Generating reads for Measles_morbillivirus_NC_001498.1.fasta...
  Read generation for Measles_morbillivirus_NC_001498.1.fasta completed.
  Generating reads for Human_rhinovirus_A_NC_001490.1.fasta...
  Read generation for Human_rhinovirus_A_NC_001490.1.fasta completed.
  Generating reads for Inf

In [84]:
import os

# Define the path to a sample FASTQ file.
# You can change the virus name to inspect a different file.
virus_dir = "simulated_reads/Ebola_virus_NC_002549.1"
fastq_file = os.path.join(virus_dir, "reads_R1.fastq")

if os.path.exists(fastq_file):
    print(f"Inspecting the first 8 lines of {fastq_file}...")
    with open(fastq_file, 'r') as f:
        for i in range(8): # Read the first two full reads
            line = f.readline().strip()
            if line:
                print(line)
else:
    print(f"File not found: {fastq_file}. Please ensure the reads have been generated.")

Inspecting the first 8 lines of simulated_reads/Ebola_virus_NC_002549.1/reads_R1.fastq...
@NC_002549.1_0_0/1
TCTCCTTTTAGCAAAGTACTATTTCAGGGTAGTCCAATTAGTGGCACGTCTTTTAGCTGTATATCAGTCGCCCCTGAGATACGCCACAAAAGTGTCTCTAAGCTAAATTGGTCTGTACACATCCCATACATTGTATTAGGGGCAATAATATCTAATTGAACTTAGCCGTTTAAAATTTAGTGCATAAATCTGGGCTAACACCACCAGGTCAACTCCATTGGCTGAAAAGAAGCTTACCTACAACGAACATCACTTTGAGCGCCCTCACAATTAAAAAATAGGAAAGTCGTTCCAGCAATCG
+
CCCCCGGGGGGGGGGGCGGGGGGGGGGGGGGCGGGGGGGGGFGGGGGCGGGGGGGGGGGCGGGFGFEGGGGGGGGGGGGGGGGGGFGGGGGCGGGGEGGGGEGGGFGGGGGEGFGGGGFFG7GGGGGGGFGFGFGFGGGFGGGGGGG,FFGG8GGGFFGGFGGFEGG8GGGFGGFFGGCEGFGGGGGGCGEFGGFE*CGF,GEGGGGGGCFG>FCGGGCGEGFG=GCGG;FG=GEF+5GFCG=GCGC<=E,GGGGG+7:C<CG5?@AF4G*4ECD=)FF0>9C:);;GF@GG<*+:F9FF)
@NC_002549.1_1_0/1
TTAAGAAATTACCAGAGTTGATTAGTGTGTGCAATAGGTTCTACCATATTAGAGATTGCAATTGTGAAGAACGTTTCTTAGTTCAAACCTTATATTTACATAGAATGCAGGATTCTGAAGTTAAGCTTATCGAAAGGCTGACAGGGCTTCTGAGTTTATTTCCGGATGGTCTCTACAGGTTTGATTGAACTACCGTGCATAGTATCCTGATACTTGCAAAGGTTGGTTATTAACATACAGATTATAAAAAACTCATAAATTGCTCT