In [502]:
# Import the random module for random data
import random
import numpy as np
import math

In [501]:
print('''
`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_
''')


`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_



## DNA
---------
This is a DNA data structure that has a left and right backbone which mimic the sugar and phosphate backbone. Then there are two vectors consisting of $\mathbf{AT}$, adenine and thymine, and $\mathbf{CG}$, Guanine and Cytosine respectively.

The backbones consist of quadruply linked sentinels which allow for easy manuevering of the DNA structure.

The $\mathbf{AT}$ and $\mathbf{CG}$ classes cosist of an entry that stores the value and is quadruply linked with connections to the other $\mathbf{AT}$ or $\mathbf{CG}$ node respectively and then is connected to the respective backbone.

In [558]:
class DNA:
    def __init__(self):
        # I thought this was how you instantiated inner classes but maybe im just stupid
        # self.right_backbone = self.right_backbone()
        # self.left_backbone = self.left_backbone()
        # self.AT = self.AT(entry=None)
        # self.CG = self.CG(entry = None)
        self.headL =  None
        self.headR = None
        self.fitness = -1
        self.length = 0
    #This is the right sugar-phosphate backbone
    class right_backbone:
        def __init__(self, entry=None):
            self.entry = None
            self.next = None
            self.prev = None
            self.left = None
    # This is the left sugar-phosphat backbone
    class left_backbone:
        def __init__(self,entry=None):
            self.entry = None
            self.next = None
            self.prev = None
            self.right = None
    #Central nodes (i.e. Adenine and Thymine)
    class AT:
        def __init__(self, entry):
            self.entry = entry
            self.left = None
            self.right = None

    # Central nodes (i.e. Guanine and Cytosine)
    class CG:
        def __init__(self, entry):
            self.entry = entry
            self.right = None
            self.left = None

    def insert_from_front(self, leftAT, rightCG):
        """This inserts a new row of genetic information to the front of the DNA"""
        #This defines the sentinel backbone
        self.length += 1
        new_sentinel_left = self.left_backbone()
        new_sentinel_right = self.right_backbone()
        new_AT = self.AT(leftAT)
        new_CG = self.CG(rightCG)
        new_sentinel_left.right = new_AT
        new_sentinel_right.left = new_CG
        new_AT.left = new_sentinel_left
        new_AT.right = new_CG
        new_CG.right  = new_sentinel_right
        new_CG.left = new_AT
        if (self.headL == None and self.headR == None ):
            self.headL = new_sentinel_left
            self.headR = new_sentinel_right
        else:
            self.headL.prev = new_sentinel_left
            self.headR.prev = new_sentinel_right 
            new_sentinel_left.next = self.headL
            new_sentinel_right.next = self.headR
            self.headL = new_sentinel_left
            self.headR = new_sentinel_right

    def print_dna(self):
        '''This loops throguh and prints the DNA'''
        if (self.headL == None and self.headR == None):
            print("DNA strand is empty")
            return
        currL = self.left_backbone()
        currR = self.right_backbone()
        currL = self.headL
        currR = self.headR
        while (currL != None and currR != None):
            print(f"|== {currL.right.entry} == {currR.left.entry} ==|")
            # print(f"O == {currR.left.entry} == O")
            currL = currL.next
            currR = currR.next
    def get_inner_product(self):
        '''inner product is defined as /theta = arccos( (x * y ) / ||x||||y|| ) with * being defined as the dot product in the VS) ''' 
        currL = self.headL 
        currR = self.headR
        dot_product = 0 
        norm_AT = 0
        norm_CG = 0
        while currL != None and currR != None:
            dot_product += np.dot(currL.right.entry, currR.left.entry)
            norm_AT += (abs((currL.right.entry))**2)
            norm_CG += (abs((currR.left.entry))**2)
            currL = currL.next
            currR = currR.next
        norm = math.sqrt(norm_AT) * math.sqrt(norm_CG)
        numerator = dot_product
        denominator = norm

        # Calculate cosine of the angle between the vectors
        if denominator > 0:
            cosine_theta = numerator / denominator
        else:
            # Handle division by zero or very small norms
            cosine_theta = 0.0  # Set default value to avoid invalid input to arccos
    
        # Clip cosine_theta to ensure it's within the valid range [-1, 1]
        cosine_theta = np.clip(cosine_theta, -1.0, 1.0)
        theta = np.arccos(cosine_theta)
        # print(f"The algorithm computed an inner product angle of θ={theta}")
        return theta




## Inner Product and Angle Calculation
---------
Generally inner product is any calculation on a vector space s.t. $f: \mathbb{R}^n \to \mathbb{R}$

The inner product between two vectors $\mathbf{x}$ and $\mathbf{y}$ in a vector space can be used to calculate the angle $\theta$ between them using the formula:

In this project $\mathbf{x}$ and $\mathbf{y}$ signify the $\mathbf{AT}$ and $\mathbf{CG}$ sides respectively. These are the vectors that correspond to each vertical column of information in the DNA.

$$
\theta = \arccos\left( \frac{\mathbf{x} \cdot \mathbf{y}}{\|\mathbf{x}\| \|\mathbf{y}\|} \right)
$$

Where:
- $\mathbf{x} \cdot \mathbf{y}$ denotes the dot product (or inner product) between vectors $\mathbf{x}$ and $\mathbf{y}$.
- $\|\mathbf{x}\|$ and $\|\mathbf{y}\|$ represent the norms (or magnitudes) of vectors $\mathbf{x}$ and $\mathbf{y}$, respectively.

The dot product $\mathbf{x} \cdot \mathbf{y}$ is calculated as:

$$
\mathbf{x} \cdot \mathbf{y} = \sum_{i=1}^{n} x_i y_i
$$

Where $x_i$ and $y_i$ are the components of vectors $\mathbf{x}$ and $\mathbf{y}$, respectively.

The norms $\|\mathbf{x}\|$ and $\|\mathbf{y}\|$ are calculated as:

$$
\|\mathbf{x}\| = \sqrt{\sum_{i=1}^{n} x_i^2}
$$
$$
\|\mathbf{y}\| = \sqrt{\sum_{i=1}^{n} y_i^2}
$$

This was implemented in this project by using angles closer to 0 to signify a relatedness in vectors that was positive.


## Fitness
-----------
Fitness is calculated by the $\theta$ value that is closest to $0.0$ based on the intial population.

The fitness is used to optimize for the angle to find a solution of the most connected DNA strand possible.

## Crossover
-------------
This function crosses two parents DNA and produces two children. This function occurs in nature during meiosis when chromosomes are lined up, which in this case are the $\mathbf{AT}$ and $\mathbf{CG}$ vectors respectively.

In [559]:
def crossover(sequence1, sequence2):
    currL1 = sequence1.headL
    currL2 = sequence2.headL

    while currL1 is not None and currL2 is not None:
        # Swap the entry values in the left backbone nodes
        temp_entry = currL1.right.entry
        currL1.right.entry = currL2.right.entry
        currL2.right.entry = temp_entry
        
        # Move to the next nodes in both sequences
        currL1 = currL1.next
        currL2 = currL2.next

    return sequence1, sequence2

In [560]:
new_DNA1 = DNA()
new_DNA1.insert_from_front(1,1)
new_DNA1.insert_from_front(1,1)
new_DNA2 = DNA()
new_DNA2.insert_from_front(2,2)
new_DNA2.insert_from_front(2,3)

print("Example crossover of Genes")
print()
print('''

O       o O       o O       o
| O   o | | O   o | | O   o |
| | O | | | | O | | | | O | |
| o   O | | o   O | | o   O |
o       O o       O o       O


''')
print("Normal Sequence 1: ")
new_DNA1.print_dna()

print("Normal Sequence 2: ")
new_DNA2.print_dna()

new_DNA3, new_DNA4 = crossover(new_DNA1,new_DNA2)
print("Crossover Sequence 1 ")
new_DNA3.print_dna()

print("Crossover Sequence 2 ")
new_DNA4.print_dna()



      
      
      
      
      

Example crossover of Genes



O       o O       o O       o
| O   o | | O   o | | O   o |
| | O | | | | O | | | | O | |
| o   O | | o   O | | o   O |
o       O o       O o       O



Normal Sequence 1: 
|== 1 == 1 ==|
|== 1 == 1 ==|
Normal Sequence 2: 
|== 2 == 3 ==|
|== 2 == 2 ==|
Crossover Sequence 1 
|== 2 == 1 ==|
|== 2 == 1 ==|
Crossover Sequence 2 
|== 1 == 3 ==|
|== 1 == 2 ==|


## Intialize Population
-----------
This creates a population with a predetermined population size populated with DNA strands.

In [561]:
def initialize_population(pop_size, dna_length):
    
    # This is a list comprehension for making a population of DNA
    population = [DNA() for _ in range(pop_size)]
    for dna in population:
        for _ in range(dna_length):
            left_val = random.randint(1, 10)  # Example: random initialization of values
            right_val = random.randint(1, 10)
            dna.insert_from_front(left_val, right_val)
    return population

## Evaluating Fitness
-----------
Evaluating fitness stores the inner value product with the strand.

In [562]:
def evaluate_fitness(population):
    '''This is evaluating the fitness of the population'''
    for dna in population:
        fitness_value = dna.get_inner_product()
        #We are using the inner product value to determine the fitness
        # In this the lower the score indicates a better fitness for surviing
        # A lower score means it is more closely related
        dna.fitness = fitness_value

## Selection 
-------------
Selection takes the top 75% most fit of the population and creates a new population with them. This ensures that each next population is generally more fit than the previous generation.

In [563]:
def selection(population, pop_size):
    sub_pop = math.floor(pop_size*0.75)
    # We are sorting by fitness here
    sorted_population = sorted(population, key=lambda x: x.fitness)
    # as a result we are taking 75 percent of the population
    return sorted_population[:sub_pop]


## Mutation
----------------------

Mutation uses a standard mutation rate of 5% or a random rate if smaller against random values to determine the percentage of the population that is mutated.

If random.random() generates a number less than mutation_rate, the condition evaluates to True, indicating that a mutation should occur.

If random.random() generates a number greater than or equal to mutation_rate, the condition evaluates to False, indicating that no mutation should occur.

The mutation then performs a multiplication of the curent entry against a value between 0.1 and 1.0 which will keep it inclusive within the sin domain.

In [564]:
def mutate(dna, mutation_rate):
    if random.random() < mutation_rate:
        # mutation_position = random.randint(0, len(dna_length) - 1)  # Select a random position in the DNA sequence
        # curr_dna = dna[mutation_position]
        currL = dna.headL
        currR = dna.headR
        while (currL != None and currR != None):
            #This finds a random value between 0.1 and 1.0 included up to 10 decimal places
            rand1 = round(random.uniform(0.1,1.0), 10)
            currL.right.entry = ((currL.right.entry)*rand1)
            currR.left.entry = ((currR.left.entry)*rand1)
            currL = currL.next
            currR = currR.next

## Complete Genetic Algorithm
----------
This section utilizes all the previous methods and the DNA data structure to construct a genetic algorithm that uses generations to tend towards a more fit population. 

The length of a DNA strand, the size of the population, the number of generations, and the mutation rate are all passed as parameters.
From here an intial population is generated and then and then from that the fitness of each strand is evaluated and parents are generated. 
Next, the parents are bread to produce two children and the children are possibly mutated and then appended to the new population.
From this new population we repreat this until the end of the selected number of generations. 

Over this time the population grows more fit and the most fit is then selected from the final population completing the genetic algorithm.

In [590]:
def genetic_algorithm(dna_length,pop_size, num_generations, mutation_rate):
    print("Beginning Genetic Algorithm.....")
    print('''

`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_


''')
    population = initialize_population(pop_size, dna_length)

    for generation in range(num_generations):
        evaluate_fitness(population)
        parents = selection(population, pop_size)
        new_population = []
        while len(new_population) < pop_size:
            parent1 = random.choice(parents)
            parent2 = random.choice(parents)
            child1, child2 = crossover(parent1, parent2)
            mutate(child1, mutation_rate)
            mutate(child2, mutation_rate)
            new_population.append(child1)
            new_population.append(child2)
        population = new_population
          # Return the best individual 
    best_individual = max(population, key=lambda x: x.fitness)
    best_output = best_individual.print_dna()
    print()
    print("Algorithm Complete...")
    return best_individual.get_inner_product()

## Fitness Over Generations
------- 
Based on the results below you can see how as the number of generations increase so does the fitness level, leading to enhacned optimization.

In [605]:
pop_size = 10
num_generations = 1
mutation_rate = 0.05
dna_length = 3
final_gene = genetic_algorithm(dna_length, pop_size, num_generations,mutation_rate)
print(f"The fittest sequence had a inner product of {final_gene} which means that the DNA consists of the most closely related data by way of angle.")


Beginning Genetic Algorithm.....


`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_



|== 10 == 8 ==|
|== 10 == 1 ==|
|== 2 == 4 ==|

Algorithm Complete...
The fittest sequence had a inner product of 0.7038215280488772 which means that the DNA consists of the most closely related data by way of angle.


In [607]:
pop_size = 10
num_generations = 10
mutation_rate = 0.05
dna_length = 3
final_gene = genetic_algorithm(dna_length, pop_size, num_generations,mutation_rate)
print(f"The fittest sequence had a inner product of {final_gene} which means that the DNA consists of the most closely related data by way of angle.")


Beginning Genetic Algorithm.....


`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_



|== 0.019511565551648923 == 0.11448363568182796 ==|
|== 0.31420924601684946 == 0.4016100604044393 ==|
|== 0.005975126347673892 == 0.013430727610163598 ==|

Algorithm Complete...
The fittest sequence had a inner product of 0.2160081660534121 which means that the DNA consists of the most closely related data by way of angle.


In [614]:
pop_size = 10
num_generations = 100
mutation_rate = 0.05
dna_length = 3
final_gene = genetic_algorithm(dna_length, pop_size, num_generations,mutation_rate)
print(f"The fittest sequence had a inner product of {final_gene} which means that the DNA consists of the most closely related data by way of angle.")


Beginning Genetic Algorithm.....


`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_



|== 1.820748506567055e-17 == 4.85532935084548e-17 ==|
|== 3.049975631783936e-19 == 1.0674914711243775e-18 ==|
|== 7.454106823647831e-15 == 1.4908213647295663e-14 ==|

Algorithm Complete...
The fittest sequence had a inner product of 0.0008147751864158268 which means that the DNA consists of the most closely related data by way of angle.
