In [1]:
import os

# Ensure the output directory is present (no error if it already exists)
os.makedirs("day5", exist_ok=True)

# Sample sequences used as test input; sequence_a also contains letters
# outside A/C/G/T (V, V, D)
sequence_a = "AACCTTGGAACGVVD"
sequence_b = "GGTTCGTAAGT"

# Write each sequence verbatim (no trailing newline) to its own test file
with open("day5/a_seq.txt", "w") as file_a, open("day5/b_seq.txt", "w") as file_b:
    file_a.write(sequence_a)
    file_b.write(sequence_b)

# Count the digit characters contained in a string
def count_digits_in_string(s):
    """Return the number of characters in ``s`` for which ``str.isdigit`` is true.

    Args:
        s: The string (or any iterable of characters) to scan.

    Returns:
        The digit count as an int; 0 when ``s`` is empty or has no digits.
    """
    return sum(1 for ch in s if ch.isdigit())

# Count the digit characters contained in a text file
def count_digits_in_file(filename):
    """Return how many characters in the file satisfy ``str.isdigit``.

    The file is opened in text mode and scanned one line at a time.

    Args:
        filename: Path of the file to read.

    Returns:
        The total digit count across all lines as an int.
    """
    digits = 0
    with open(filename, 'r') as handle:
        for text_line in handle:
            # Each True from str.isdigit contributes 1 to the sum
            digits += sum(map(str.isdigit, text_line))
    return digits



# Tally how often each base appears in a sequence
def count_bases(sequence):
    """Count the occurrences of A, C, G and T in ``sequence``.

    Any item that is not one of the tally's keys is added to the
    ``'Unknown'`` bucket. Matching is case-sensitive, so lowercase letters
    count as unknown.

    Args:
        sequence: Iterable of single-character bases, typically a string.

    Returns:
        A ``(counts, total)`` tuple: ``counts`` is a dict with the keys
        ``'A'``, ``'C'``, ``'G'``, ``'T'`` and ``'Unknown'`` (in that order),
        and ``total`` is the number of items seen.
    """
    tally = dict.fromkeys(('A', 'C', 'G', 'T'), 0)
    tally['Unknown'] = 0
    seen = 0

    for symbol in sequence:
        bucket = symbol if symbol in tally else 'Unknown'
        tally[bucket] += 1
        seen += 1

    return tally, seen

# Helper: print one statistics section (header, per-base lines, total)
def _print_base_stats(header, counts, total):
    """Print ``header``, one ``base: count pct%`` line per entry, and the total."""
    print(header)
    for base, count in counts.items():
        # With nothing counted, report 0.0% instead of dividing by zero
        percentage = (count / total) * 100 if total > 0 else 0
        print(f"{base}: {count} {percentage:.1f}%")
    print(f"Total: {total}")


# Function to calculate and display statistics for sequences from files
def calculate_statistics(files):
    """Print base statistics for each file and for all files combined.

    For every path in ``files`` the file is read, all whitespace is removed,
    and the bases are counted with ``count_bases``. A "Stats for <file>:"
    section is printed per file (followed by a blank line), then a final
    "All:" section with the accumulated counts.

    Args:
        files: Iterable of paths to text files containing sequences.

    Returns:
        A dict with the keys ``'A'``, ``'C'``, ``'G'``, ``'T'`` and
        ``'Unknown'`` holding the counts summed over all files.

    Raises:
        OSError: If one of the files cannot be opened or read.
    """
    all_counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'Unknown': 0}
    total_count = 0

    for file in files:
        with open(file, 'r') as f:
            # Remove *all* whitespace, not only leading/trailing: a sequence
            # wrapped over several lines would otherwise have every interior
            # newline or space counted as an 'Unknown' base.
            sequence = "".join(f.read().split())

        base_counts, total_bases = count_bases(sequence)

        # Per-file section, separated from the next one by a blank line
        _print_base_stats(f"Stats for {file}:", base_counts, total_bases)
        print()

        # Accumulate counts for the overall statistics
        for base, count in base_counts.items():
            all_counts[base] += count
        total_count += total_bases

    _print_base_stats("All:", all_counts, total_count)
    return all_counts



if __name__ == "__main__":
    # Run the statistics report over the two sample sequence files
    files = [
        "day5/a_seq.txt",
        "day5/b_seq.txt",
    ]
    calculate_statistics(files)


Stats for day5/a_seq.txt:
A: 4 26.7%
C: 3 20.0%
G: 3 20.0%
T: 2 13.3%
Unknown: 3 20.0%
Total: 15

Stats for day5/b_seq.txt:
A: 2 18.2%
C: 1 9.1%
G: 4 36.4%
T: 4 36.4%
Unknown: 0 0.0%
Total: 11

All:
A: 6 23.1%
C: 4 15.4%
G: 7 26.9%
T: 6 23.1%
Unknown: 3 11.5%
Total: 26
