Assignment 2 student 2 BINP16 HT25

Author: Ariane Neumann

Date: 2025-10-16

Version:1.0

System: #!/usr/bin/env python3 - coding: utf-8 -

Description: This code processes a FASTA file containing DNA sequences, translates them into protein sequences, counts the abundances of the individual amino acids and writes an output file. The following steps are taken to achieve this:

- Inspecting all available files
- Creating a dictionary for the standard codons
- Reading the files
- Writing the script
- Handling errors

Functions created: generate_genetic_code, read_fasta, translate_DNA, exec


---------------------------------------------------------------------------------------------------------------------------------------

In [58]:
# Preparation of the standard genetic code
import sys
import os

# Unless exactly two arguments (input and output path) were supplied, replace
# the argument vector with default file names -- presumably so the code also
# runs from a notebook cell, where sys.argv holds unrelated values.
sys.argv = (
    sys.argv
    if len(sys.argv) == 3
    else ["dna2aa.py", "DNA_seq.fna", "output_file.txt"]
)
# After the step above sys.argv always has three entries: script, input, output.
DNA_seq, output_file = sys.argv[1:]

def generate_genetic_code():
    """Return the standard genetic code as a codon lookup table.

    Returns:
        dict: Maps each of the 64 DNA codons (upper-case, three letters from
        T, C, A, G) to its one-letter amino acid code. The three stop codons
        map to '*'.
    """
    # Dictionary with the codons (key) and the corresponding amino acid (value)
    genetic_code = {
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',  # Valine
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',  # Alanine
        'GAT': 'D', 'GAC': 'D',  # Aspartic Acid
        'GAA': 'E', 'GAG': 'E',  # Glutamic Acid
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',  # Glycine
        'TTT': 'F', 'TTC': 'F',  # Phenylalanine
        'TTA': 'L', 'TTG': 'L',  # Leucine
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',  # Serine
        'TAT': 'Y', 'TAC': 'Y',  # Tyrosine
        'TGT': 'C', 'TGC': 'C',  # Cysteine
        'TGG': 'W',  # Tryptophan
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',  # Leucine
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',  # Proline
        'CAT': 'H', 'CAC': 'H',  # Histidine
        'CAA': 'Q', 'CAG': 'Q',  # Glutamine
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',  # Arginine
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I',  # Isoleucine
        'ATG': 'M',  # Methionine (Start)
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',  # Threonine
        'AAT': 'N', 'AAC': 'N',  # Asparagine
        'AAA': 'K', 'AAG': 'K',  # Lysine
        'AGT': 'S', 'AGC': 'S',  # Serine
        'AGA': 'R', 'AGG': 'R',  # Arginine
        'TAA': '*', 'TAG': '*', 'TGA': '*',  # Stop (*)
    }
    # The table was previously built but never handed back to the caller.
    return genetic_code

#--------------------------------------------------------------------------------------------------------------------------

# Define "Reading the fasta file" function
def read_fasta(file_path):
    """Read a DNA FASTA file into a dictionary of sequences.

    Header lines start with '>'; the text after '>' (stripped) is used as the
    sequence ID. All following lines up to the next header are upper-cased and
    concatenated into one sequence. Lines that appear before the first header
    are ignored.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict mapping sequence ID to sequence string. The dict is empty (and an
        error is printed to stderr) when the file holds no sequences.
        None when the file does not exist or contains a character other than
        A, C, G, T or N; the reason is printed to stderr.
    """
    valid_DNA_char = {'A', 'C', 'T', 'G', 'N'}  # allowed bases, N = unknown base
    sequences = {}
    current_ID = None
    current_sequence_lines = []

    # Using 'with open' so the file is closed even if an error occurs
    try:
        with open(file_path, 'r') as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()  # removing whitespace
                if line.startswith(">"):
                    # A new header closes the record that was being collected
                    if current_ID:
                        sequences[current_ID] = "".join(current_sequence_lines)
                    current_ID = line[1:].strip()
                    current_sequence_lines = []
                elif current_ID:
                    clean_line = line.upper()
                    for char in clean_line:
                        if char not in valid_DNA_char:
                            raise ValueError(
                                f"Found invalid character '{char}' in sequence "
                                f"{current_ID} (line {line_number}). "
                                "Only A, T, C, G, N are allowed."
                            )
                    # Append the validated line once, not once per character
                    current_sequence_lines.append(clean_line)

        # The last record has no following header, so store it after the loop
        if current_ID and current_sequence_lines:
            sequences[current_ID] = "".join(current_sequence_lines)

    except FileNotFoundError:
        print(f"Error:Input file not found at '{file_path}'", file=sys.stderr)
        return None
    except ValueError as error:
        print(f"Error in file content: {error}", file=sys.stderr)
        return None

    if not sequences:
        print(" Error: No sequences found in input file.", file=sys.stderr)
    return sequences
        
#--------------------------------------------------------------------------------------------------------------------------"

# Define " Translate DNA" function
def translate_DNA(DNA_sequence, codon_table):  # -> str
    """Translate a DNA string into a one-letter amino acid string.

    The sequence is read codon by codon starting at position 0. One or two
    leftover bases at the end are not translated.

    Args:
        DNA_sequence: DNA string to translate.
        codon_table: Mapping from three-letter codon to amino acid letter.

    Returns:
        The translated sequence. A codon that contains 'N' or that is missing
        from codon_table is written as 'X'.
    """
    residues = []
    # Stopping two short of the end guarantees every slice is a full codon
    for start in range(0, len(DNA_sequence) - 2, 3):
        triplet = DNA_sequence[start:start + 3]
        # An unknown base (N) makes the whole codon ambiguous
        residues.append(
            "X" if "N" in triplet else codon_table.get(triplet, "X")
        )
    return "".join(residues)

#---------------------------------------------------------------------------------------------------------------------------

# Maximum number of characters per sequence line in the output file,
# so that each line can be read on one screen
maximum_line_length =60

# Define "execution of translation" function
def exec():
    """Translate the DNA FASTA file given on the command line into proteins.

    Reads the input path from sys.argv[1] and the output path from sys.argv[2]
    (default "translated_aa.faa"). Every sequence of at least three bases is
    translated with translate_DNA and written to the output file in FASTA
    format, wrapped at maximum_line_length characters per line.

    Exits with status 1 when no input path is given, when the codon table is
    unavailable, or when read_fasta reports an error. Returns None otherwise,
    also when the output file could not be written (the reason is printed to
    stderr).

    NOTE(review): the name shadows the builtin exec(); it is kept unchanged so
    existing callers keep working.
    """
    if len(sys.argv) < 2:
        print(f"Usage: python3 {os.path.basename(sys.argv[0])} <input_DNA_fasta> [output_aa_fasta]", file=sys.stderr)
        sys.exit(1)  # safe and gentle exit

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else "translated_aa.faa"
    print(f"Start DNA translation from '{input_file}' to '{output_file}' ...")

    # Build the codon table here; no module-level 'genetic_code' exists
    codon_table = generate_genetic_code()
    if not codon_table:
        print("Error: The genetic code table is empty or missing.", file=sys.stderr)
        sys.exit(1)

    # Read fasta input file
    dna_sequences = read_fasta(input_file)
    if dna_sequences is None:
        sys.exit(1)  # reading failed (error already reported), exit safely

    # Translate the sequences, skipping those shorter than one codon
    translated_sequences = {}
    for seq_ID, DNA_seq in dna_sequences.items():
        if len(DNA_seq) < 3:
            print(f" Warning: Sequence '{seq_ID}' is too short for a translation (length: {len(DNA_seq)}). Skip this.", file=sys.stderr)
            continue
        translated_sequences[seq_ID] = translate_DNA(DNA_seq, codon_table)

    # Write the output file
    try:
        with open(output_file, "w") as output_f:
            for seq_ID, aa_seq in translated_sequences.items():
                output_f.write(f">{seq_ID}\n")
                for start in range(0, len(aa_seq), maximum_line_length):
                    output_f.write(aa_seq[start:start + maximum_line_length] + '\n')
    except (OSError, ValueError) as error:
        # OSError covers a missing directory or missing write permission;
        # ValueError is kept because the original handler caught it
        print(f"Error: Could not write output file '{output_file}': {error}", file=sys.stderr)
        return None