In [3]:
from collections import Counter
from typing import Dict, List

import numpy as np
from Bio import SeqIO

In [4]:
def describe_fasta(fasta_file: str) -> None:
    """Print summary statistics for the sequences in a FASTA file.

    The report lists the number of distinct classes, the number of
    sequences, the mean and standard deviation of the sequence lengths
    (as computed by ``np.mean`` / ``np.std``), and the number of sequences
    in each class, ordered from the most to the least frequent class.

    The class of a record is the text following the last ``'|'`` in its
    header description; when the description contains no ``'|'`` the whole
    description is used as the class.

    Args:
        fasta_file: Path of the FASTA file to summarise.

    Returns:
        None. The report is written to standard output, followed by a
        blank line so consecutive reports stay visually separated.
    """
    seq_lengths: List[int] = []
    class_counts: Counter[str] = Counter()

    for record in SeqIO.parse(fasta_file, "fasta"):
        # rpartition yields the full string as its last element when the
        # separator is absent, matching split('|')[-1].
        seq_class = record.description.rpartition('|')[2]
        class_counts[seq_class] += 1
        seq_lengths.append(len(record.seq))

    total_sequences = len(seq_lengths)
    if total_sequences:
        mean_length = float(np.mean(seq_lengths))
        std_dev_length = float(np.std(seq_lengths))
    else:
        # NumPy returns nan for an empty input but also emits
        # RuntimeWarnings; report nan directly so an empty file produces
        # the same text without the warning noise.
        mean_length = std_dev_length = float("nan")

    print(f"File: {fasta_file}")
    print(f"Total number of classes: {len(class_counts)}")
    print(f"Total number of sequences: {total_sequences}")
    print(f"Mean sequence length: {mean_length:.2f}")
    print(f"Standard deviation of sequence lengths: {std_dev_length:.2f}")
    print("Number of sequences per class (sorted by count):")

    # most_common() orders by descending count and keeps first-seen order
    # among classes with equal counts.
    for seq_class, count in class_counts.most_common():
        print(f"  {seq_class}: {count}")
    print()  # For better separation between file outputs

# Usage: print a summary report for every nucleotide FASTA dataset below.
# Paths are relative to the directory the notebook is run from.
fasta_files = [
    # Human betaherpesvirus 5, one file per gene directory
    '../data/Human_betaherpesvirus_5/UL55/UL55_nucleotide_sequences.fasta',
    '../data/Human_betaherpesvirus_5/UL73/UL73_nucleotide_sequences.fasta',
    '../data/Human_betaherpesvirus_5/US28/US28_nucleotide_sequences.fasta',
    # Severe acute respiratory syndrome coronavirus 2
    '../data/Severe_acute_respiratory_syndrome_coronavirus_2/Severe_acute_respiratory_syndrome_coronavirus_2_nucleotide_sequences.fasta',
    # Human immunodeficiency virus 1
    '../data/Human_immunodeficiency_virus_1/Human_immunodeficiency_virus_1_nucleotide_sequences.fasta',
]

# Reports are emitted in the order the files are listed above.
for fasta_file in fasta_files:
    describe_fasta(fasta_file)


File: ../data/Human_betaherpesvirus_5/UL55/UL55_nucleotide_sequences.fasta
Total number of classes: 4
Total number of sequences: 367
Mean sequence length: 2721.65
Standard deviation of sequence lengths: 2.71
Number of sequences per class (sorted by count):
  1: 190
  3: 75
  2: 69
  4: 33

File: ../data/Human_betaherpesvirus_5/UL73/UL73_nucleotide_sequences.fasta
Total number of classes: 7
Total number of sequences: 524
Mean sequence length: 412.54
Standard deviation of sequence lengths: 3.79
Number of sequences per class (sorted by count):
  1: 102
  4a: 100
  4c: 89
  3a: 89
  4b: 57
  3b: 44
  2: 43

File: ../data/Human_betaherpesvirus_5/US28/US28_nucleotide_sequences.fasta
Total number of classes: 6
Total number of sequences: 423
Mean sequence length: 1064.92
Standard deviation of sequence lengths: 0.60
Number of sequences per class (sorted by count):
  A1: 159
  A2: 131
  D: 46
  C: 33
  B1: 29
  B2: 25

File: ../data/Severe_acute_respiratory_syndrome_coronavirus_2/Severe_acute_re