<a href="https://colab.research.google.com/github/animesh-11/AI_ML/blob/main/DNA_Mutations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def filter_dna_mutations(bases, length, harmful_patterns):
    """
    Return every non-harmful DNA sequence of the given length, sorted.

    All sequences made of exactly ``length`` symbols drawn (with repetition)
    from ``bases`` are generated. Any sequence that is exactly equal to one of
    ``harmful_patterns`` is dropped, and the survivors are returned in
    lexicographic order.

    Args:
        bases (str): The letters that may appear in a sequence. A letter that
            occurs more than once is used only once, so the result never
            contains the same sequence twice because of a repeated base.
        length (int): The target length of each sequence. A value of 0 or
            less yields only the empty sequence ``''``.
        harmful_patterns (list of str): Sequences to exclude. Only
            whole-sequence matches are removed; a pattern that merely occurs
            inside a longer sequence does not exclude that sequence.

    Returns:
        list of str: The valid, non-harmful DNA sequences, sorted in
                     lexicographic order.
    """
    # Imported locally so the function stays self-contained in its cell.
    from itertools import product

    # Drop repeated bases while keeping first-seen order; otherwise an
    # alphabet such as 'AAT' would emit the same sequence several times.
    unique_bases = tuple(dict.fromkeys(bases))

    # A set gives constant-time membership tests while filtering.
    harmful_set = set(harmful_patterns)

    # product() rejects a negative repeat count, so clamp it to 0: a
    # non-positive length then yields only the empty sequence.
    candidates = map(''.join, product(unique_bases, repeat=max(length, 0)))

    # Filter lazily and sort once; no intermediate lists are materialised.
    return sorted(seq for seq in candidates if seq not in harmful_set)

# Example Cases
# Each case binds its inputs to module-level names, runs the filter, and
# echoes the inputs followed by the resulting list of sequences.
print("Example Case 1:")
bases_1, length_1, harmful_patterns_1 = 'AT', 2, ['AA', 'TT']
result_1 = filter_dna_mutations(bases_1, length_1, harmful_patterns_1)
print(
    f"Input: ('{bases_1}', {length_1}, {harmful_patterns_1})",
    f"Output: {result_1}\n",
    sep="\n",
)

print("Example Case 2:")
bases_2, length_2, harmful_patterns_2 = 'AGC', 2, ['GC']
result_2 = filter_dna_mutations(bases_2, length_2, harmful_patterns_2)
print(
    f"Input: ('{bases_2}', {length_2}, {harmful_patterns_2})",
    f"Output: {result_2}\n",
    sep="\n",
)


Example Case 1:
Input: ('AT', 2, ['AA', 'TT'])
Output: ['AT', 'TA']

Example Case 2:
Input: ('AGC', 2, ['GC'])
Output: ['AA', 'AC', 'AG', 'CA', 'CC', 'CG', 'GA', 'GG']



In [None]:
def filter_dna_mutations_verbose(bases, length, harmful_patterns):
    """
    Return every non-harmful DNA sequence of the given length, sorted,
    printing a step-by-step trace of how the result is produced.

    All sequences made of exactly ``length`` symbols drawn (with repetition)
    from ``bases`` are generated one position at a time. Any sequence that is
    exactly equal to one of ``harmful_patterns`` is dropped, and the
    survivors are sorted. Every step is reported on standard output.

    Args:
        bases (str): The letters that may appear in a sequence. A letter that
            occurs more than once is used only once, so the result never
            contains the same sequence twice because of a repeated base.
        length (int): The target length of each sequence. A value of 0 or
            less yields only the empty sequence ``''``.
        harmful_patterns (list of str): Sequences to exclude. Only
            whole-sequence matches are removed.

    Returns:
        list of str: The valid, non-harmful DNA sequences, sorted in
                     lexicographic order.
    """
    print(f"Starting sequence generation with bases: {bases} and length: {length}")

    # Drop repeated bases while keeping first-seen order; otherwise an
    # alphabet such as 'AAT' would emit the same sequence several times.
    unique_bases = tuple(dict.fromkeys(bases))

    # Step 1: Generate all possible sequences of the given length.
    # Start from the empty sequence and extend every sequence by one base
    # per iteration.
    sequences = ['']
    print(f"Initial sequences: {sequences}")

    for step in range(1, length + 1):
        print(f"\nIteration {step} (building sequences of length {step})")
        print(f"Current sequences: {sequences}")

        next_sequences = []
        for seq in sequences:
            for base in unique_bases:
                new_seq = seq + base
                next_sequences.append(new_seq)
                print(f"  Appending '{new_seq}' (from '{seq}' + '{base}')")

        sequences = next_sequences
        print(f"Sequences after iteration {step}: {sequences}")

    # Step 2: Drop sequences that exactly match a harmful pattern.
    print("\nStep 2: Filtering harmful patterns")
    # A set gives constant-time membership tests while filtering.
    harmful_set = set(harmful_patterns)
    print(f"Harmful patterns set: {harmful_set}")

    valid_sequences = []
    print(f"All generated sequences: {sequences}")
    for seq in sequences:
        if seq in harmful_set:
            print(f"  '{seq}' IS harmful, skipping.")
            continue
        valid_sequences.append(seq)
        print(f"  '{seq}' is NOT harmful, adding to valid sequences.")

    # Step 3: Sort the survivors in lexicographic order.
    print("\nStep 3: Sorting valid sequences")
    print(f"Valid sequences before sorting: {valid_sequences}")
    valid_sequences.sort()
    print(f"Valid sequences after sorting: {valid_sequences}")

    return valid_sequences

# Walk through Example Case 1 with the tracing variant so every generation,
# filtering and sorting step is printed before the final result.
print("Running verbose explanation with Example Case 1:")
bases_1, length_1, harmful_patterns_1 = 'AT', 2, ['AA', 'TT']
result_1_verbose = filter_dna_mutations_verbose(
    bases_1, length_1, harmful_patterns_1
)
print(f"\nFinal Output for Example Case 1: {result_1_verbose}")

Running verbose explanation with Example Case 1:
Starting sequence generation with bases: AT and length: 2
Initial sequences: ['']

Iteration 1 (building sequences of length 1)
Current sequences: ['']
  Appending 'A' (from '' + 'A')
  Appending 'T' (from '' + 'T')
Sequences after iteration 1: ['A', 'T']

Iteration 2 (building sequences of length 2)
Current sequences: ['A', 'T']
  Appending 'AA' (from 'A' + 'A')
  Appending 'AT' (from 'A' + 'T')
  Appending 'TA' (from 'T' + 'A')
  Appending 'TT' (from 'T' + 'T')
Sequences after iteration 2: ['AA', 'AT', 'TA', 'TT']

Step 2: Filtering harmful patterns
Harmful patterns set: {'AA', 'TT'}
All generated sequences: ['AA', 'AT', 'TA', 'TT']
  'AA' IS harmful, skipping.
  'AT' is NOT harmful, adding to valid sequences.
  'TA' is NOT harmful, adding to valid sequences.
  'TT' IS harmful, skipping.

Step 3: Sorting valid sequences
Valid sequences before sorting: ['AT', 'TA']
Valid sequences after sorting: ['AT', 'TA']

Final Output for Example Ca