### Prologue: Importing packages

In [1]:
import numpy as np
import copy as cp
import scipy, os, time
import random
import params_file as pf
from scipy import stats
import math
import matplotlib.pyplot as plt
import dill
from datetime import datetime,date
%matplotlib inline

In [3]:
import cProfile #This is to benchmark the code. Recommended by Djole.

In [4]:
from concurrent.futures import ProcessPoolExecutor # for using multiple cores.

# Chapter 1: Global declarations

In [5]:
# Lookup table of the 64 DNA codons. The three stop codons (TAA, TAG, TGA) sit at
# the end (indices 61-63); trans_aas holds the translation of each codon at the
# same index.
dna_codons=np.array(['ATA', 'ATC', 'ATT', 'ATG', 'ACA', 'ACC', 'ACG', 'ACT', 'AAC',
       'AAT', 'AAA', 'AAG', 'AGC', 'AGT', 'AGA', 'AGG', 'CTA', 'CTC',
       'CTG', 'CTT', 'CCA', 'CCC', 'CCG', 'CCT', 'CAC', 'CAT', 'CAA',
       'CAG', 'CGA', 'CGC', 'CGG', 'CGT', 'GTA', 'GTC', 'GTG', 'GTT',
       'GCA', 'GCC', 'GCG', 'GCT', 'GAC', 'GAT', 'GAA', 'GAG', 'GGA',
       'GGC', 'GGG', 'GGT', 'TCA', 'TCC', 'TCG', 'TCT', 'TTC', 'TTT',
       'TTA', 'TTG', 'TAC', 'TAT', 'TGC', 'TGT', 'TGG','TAA','TAG','TGA'], dtype=object)

In [6]:
# One-letter amino acid for each codon, index-aligned with dna_codons
# (translate_codon() looks a codon up in dna_codons and reads this array at the
# same index). '_' marks a stop codon.
trans_aas=np.array(['I', 'I', 'I', 'M', 'T', 'T', 'T', 'T', 'N', 'N', 'K', 'K', 'S',
       'S', 'R', 'R', 'L', 'L', 'L', 'L', 'P', 'P', 'P', 'P', 'H', 'H',
       'Q', 'Q', 'R', 'R', 'R', 'R', 'V', 'V', 'V', 'V', 'A', 'A', 'A',
       'A', 'D', 'D', 'E', 'E', 'G', 'G', 'G', 'G', 'S', 'S', 'S', 'S',
       'F', 'F', 'L', 'L', 'Y', 'Y', 'C', 'C', 'W','_','_','_'], dtype=object)

In [7]:
# Names of the ten slots of an organism array, in slot order (slot 0 = generation, ..., slot 9 = fitness).
data_fields=np.array(["generation","genome","proteome","grn","thresholds","decays","start_vect","development","genes_on","fitness"])

# Chapter 2. Main functions to create an organism and a population

Each organism is a ragged numpy array combining several other numpy arrays, with the following contents:
* \[0\] generation number (always '0' for the founder)
* \[1\] the genome
* \[2\] the proteome
* \[3\] the grn
* \[4\] the expression threshold levels
* \[5\] the decay lambdas
* \[6\] the starting vector
* \[7\] the development
* \[8\] the 'genes_on' vector
* \[9\] the fitness

The output now is a population i.e. an array where each 'row' is an organism array, and because this is a single organism only, it has only one 'row' - a single first dimension. This should make sense later.

In [8]:
def founder_miner(min_fitness=0.6):
    """Generate random organisms until one is not below ``min_fitness``.

    All sizes and value ranges are read from the ``params_file`` module (``pf``).

    Parameters
    ----------
    min_fitness : float
        Candidates are regenerated while their fitness is below this value.

    Returns
    -------
    numpy.ndarray
        A one-organism population: an object array whose single row holds, in
        order, generation number (0), genome, proteome, GRN, thresholds,
        decays, start vector, development, genes-on vector and fitness.
    """
    # Run parameters do not change between attempts, so read them once.
    n_generation=0
    n_genes=pf.num_genes
    seq_len=pf.seq_length
    prop_off=pf.prop_unlinked # thresholds and decays are zeroed with the converse of this probability.
    thresh_boundaries=pf.thresh_boundaries # tuple of 2 values.
    decay_boundaries=pf.decay_boundaries # tuple of 2 values.
    dev_steps=pf.dev_steps
    # Development starts with only the first gene expressed.
    start_vect=np.array([1]+[0]*(n_genes-1))
    # At least one candidate is always built, so a min_fitness of 0 (or less)
    # returns the first organism instead of failing on an unbound result.
    while True:
        genome,proteome=makeGenomeandProteome(seq_len,n_genes,dna_codons,trans_aas)
        grn=makeGRN(n_genes,prop_off)
        thresholds=randomMaskedVector(n_genes,(1-prop_off),thresh_boundaries[0],thresh_boundaries[1])
        decays=randomMaskedVector(n_genes,(1-prop_off),decay_boundaries[0],decay_boundaries[1])
        development=develop(start_vect,grn,decays,thresholds,dev_steps)
        fitness=calcFitness(development)
        if not (fitness < min_fitness):
            break
    # A gene counts as "on" if its column of the development sums to a nonzero value.
    genes_on=(development.sum(axis=0) != 0).astype(int)
    out_arr=np.array([np.array((n_generation,genome,proteome,grn,thresholds,decays,start_vect,development,genes_on,fitness),dtype=object)])
    return(out_arr)

The translate_codon() function well...it takes a codon (in DNA format) and outputs the corresponding amino acid. I've also re-written it in lambda-function format (now deleted), out of curiosity mainly, because changing it in the code I think will result in reduced readability.

In [9]:
def translate_codon(codon):
    """Return the one-letter amino acid (or '_' for a stop) encoded by a DNA codon.

    The codon is located in ``dna_codons`` and the entry at the same index of
    ``trans_aas`` is returned. An unknown codon raises ``IndexError``.
    """
    table_position=np.flatnonzero(dna_codons == codon)[0]
    return(trans_aas[table_position])

# Chapter 3
## Support functions for making an organism from scratch

#### GRN RELATED
The makeGRN() function below will create a GRN as a numpy array of random regulatory interactions, with a user-defined proportion of interactions set to zero (the "unlinked" ones).

In [10]:
def makeGRN(numGenes,prop_unlinked):
    """Build a random numGenes x numGenes matrix of regulatory weights.

    Weights are drawn between the two values of ``pf.new_link_bounds``;
    ``prop_unlinked`` is passed to randomMaskedVector() as the proportion of
    entries set to zero.
    """
    lower_bound,upper_bound=pf.new_link_bounds[0],pf.new_link_bounds[1]
    flat_weights=randomMaskedVector(numGenes ** 2,prop_unlinked,lower_bound,upper_bound)
    return(flat_weights.reshape((numGenes,numGenes)))

#### SEQUENCE RELATED
**Below _were_ the TWO functions that create the sequence arrays. Now they're numpy string arrays of n genes by m codons.**
Must trace back the function calls and correct any possible bugs. _makeRandomSequence()_, and _makeRandomSequenceArray()_.

In [11]:
def makeGenomeandProteome(seq_length,num_genes,dna_codons=dna_codons,trans_aas=trans_aas):
    """Create a random genome and its translation.

    Every gene consists of random sense codons followed by one random stop
    codon as its last codon.

    Parameters
    ----------
    seq_length : int
        Gene length in nucleotides; an incomplete trailing codon is dropped.
    num_genes : int
        Number of genes to generate.
    dna_codons, trans_aas : numpy.ndarray
        Index-aligned codon and amino-acid tables; stop codons are the entries
        translated as ``'_'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(genome, proteome)``, both object arrays of shape
        ``(num_genes, num_codons)``.

    Raises
    ------
    ValueError
        If ``seq_length`` is too short to hold a single codon.
    """
    num_codons=int(seq_length // 3)
    if num_codons < 1:
        raise ValueError("seq_length must be at least 3 to hold one codon")
    dna_codons=np.asarray(dna_codons,dtype=object)
    trans_aas=np.asarray(trans_aas,dtype=object)
    # Stop and sense codons are taken from the translation table itself instead
    # of assuming that the stops live at indices 61-63.
    is_stop=(trans_aas == "_")
    stop_idx=np.flatnonzero(is_stop)
    sense_idx=np.flatnonzero(~is_stop)
    genome_arr=np.empty((num_genes,num_codons),dtype=object)
    proteome_arr=np.empty((num_genes,num_codons),dtype=object)
    for i in range(num_genes):
        # num_codons-1 random sense codons, closed by one random stop codon.
        rand_codon_idx=np.hstack((np.random.choice(sense_idx,(num_codons-1)),np.random.choice(stop_idx,1)))
        genome_arr[i]=dna_codons[rand_codon_idx]
        proteome_arr[i]=trans_aas[rand_codon_idx]
    return(genome_arr,proteome_arr)

#### OTHER SUPPORTING FUNCTIONS

In [12]:
def randomMaskedVector(num_vals,prop_zero=0,min_val=0,max_val=1):
    """Return ``num_vals`` uniform random values with a share of them zeroed.

    Values are drawn uniformly between ``min_val`` and ``max_val``. When
    ``prop_zero`` is nonzero, each value is independently set to 0 with
    probability ``prop_zero``. If ``min_val`` exceeds ``max_val`` an error
    message is printed and ``None`` is returned.
    """
    if min_val > max_val:
        print("Error: minimum value greater than maximum value")
        return None
    # The keep/zero mask is drawn before the values themselves.
    keep_mask=None
    if prop_zero != 0:
        keep_mask=np.random.choice((0,1),num_vals,p=(prop_zero,1-prop_zero))
    span=max_val - min_val
    values=np.array(span * np.random.random(num_vals) + min_val)
    if keep_mask is not None:
        # Adding 0 turns any -0.0 produced by the masking into 0.0.
        values=(values * keep_mask) + 0
    return(values)

# Chapter 4:
## Mutation Functions

### Genome Mutation
Essentially, what genome_mutation() does now is:
* Takes in the genome and proteome that will be mutated
* Takes in an array with 3D coordinates for each mutation, in the format \[gene_number,codon_number_ingene,codon_position\]
* For each mutation:
   1. mutate the correct nucleotide
   2. translate the mutated codon
   3. determine what kind of mutation it was (nonsense, non-syn, syn)
   4. add the gene number and the mutation type into the muttype_vect object
* Once this is done, the function outputs the mutated genome, the mutated proteome, and the array that says what type of mutation happened where (nonsense = 0, nonsyn=1, syn=2).

In [13]:
def mutate_genome(gnome,prome,mut_coords):
    """Apply point mutations to copies of a genome and its proteome.

    Parameters
    ----------
    gnome, prome : numpy.ndarray
        Genome (codons) and proteome (amino acids), indexed as [gene, codon].
        Neither input is modified; deep copies are mutated and returned.
    mut_coords : numpy.ndarray
        One row per mutation: [gene, codon index within the gene, position
        within the codon].

    Returns
    -------
    tuple
        ``(genome, proteome, muttype_vect)`` where ``muttype_vect`` has one
        ``[gene, type]`` row per mutation, with type 0 = nonsense,
        1 = non-synonymous, 2 = synonymous.
    """
    gnome=cp.deepcopy(gnome)
    prome=cp.deepcopy(prome)
    mut_num=mut_coords.shape[0] # one row per mutation
    muttype_vect=np.ndarray((mut_num,2),dtype=object)
    for i in range(mut_num):
        selected_gene,selected_codon_from_gene,selected_codpos=mut_coords[i,:]
        selected_codon=gnome[selected_gene,selected_codon_from_gene]
        prev_aacid=translate_codon(selected_codon)
        mutated_codon=pointMutateCodon(selected_codon,selected_codpos)
        gnome[selected_gene,selected_codon_from_gene]=mutated_codon
        new_aacid=translate_codon(mutated_codon)
        if prev_aacid == new_aacid: # Synonymous mutations are coded as '2'
            muttype=2
        elif new_aacid == "_": # Nonsense mutations are coded as '0'
            muttype=0
        else: # Non-synonymous mutations are coded as '1'
            muttype=1
        # The proteome shares the genome's [gene, codon] layout, so the new
        # residue belongs at the codon's index, not at the position in the codon.
        prome[selected_gene,selected_codon_from_gene]=new_aacid
        muttype_vect[i]=(selected_gene,muttype)
    return(gnome,prome,muttype_vect)

#### codPos function
This function takes in an array of mutation sites, given as base positions along the whole genome (i.e., a genome of 5 genes with 500 codons each has 5\*500\*3=7500 bases, so positions range from 0 to 7499), and outputs a 2D array that, for each mutation, gives a 3-number coordinate in the format: \[gene_number,codon_number(in gene),codon_position]. This is then passed to the mutation function, where each mutated base can be accessed through genome[gene_number,codon_number][codon_position]. It also has the obvious benefit of pinpointing exactly where each mutation is occurring.
In the end, I decided to produce 'keys': arrays of the same size of the genome, that for each base have, respectively, the gene number, the codon number in the gene, and the codon position.

In [14]:
def codPos(muts,num_genes,num_codons):
    """Convert flat base positions into [gene, codon, codon position] coordinates.

    Parameters
    ----------
    muts : numpy.ndarray
        1-D array of base positions counted along the whole genome (genes laid
        end to end, three bases per codon). Negative positions count from the
        end of the genome.
    num_genes, num_codons : int
        Number of genes and number of codons per gene.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(muts.size, 3)``; each row is
        ``[gene number, codon number within the gene, position within the codon]``.

    Raises
    ------
    IndexError
        If a position lies outside the genome.
    """
    gene_bps=num_codons*3
    genome_bps=gene_bps*num_genes
    out_array=np.ndarray((muts.size,3),dtype=object)
    if muts.size == 0:
        return(out_array)
    positions=np.asarray(muts).reshape(-1)
    if np.any((positions < -genome_bps) | (positions >= genome_bps)):
        raise IndexError(f"mutation position outside a genome of {genome_bps} bases")
    positions=positions % genome_bps # map negative positions onto their base
    # The coordinates follow from integer arithmetic; no genome-sized lookup
    # tables are needed.
    gene_idx,within_gene=np.divmod(positions,gene_bps)
    codon_idx,codon_pos=np.divmod(within_gene,3)
    out_array[:,0]=gene_idx
    out_array[:,1]=codon_idx
    out_array[:,2]=codon_pos
    return(out_array)
    

In [15]:
def randomMutations(in_genome,mut_rateseq):
    """Pick the bases of a genome that mutate.

    Each base (three per codon of ``in_genome``) is hit independently with
    probability ``mut_rateseq``. Returns the array of hit base positions, or
    ``False`` when no base was hit.
    """
    n_bases=3*in_genome.size # every genome entry is a codon of three bases
    hit_flags=np.random.choice((0,1),n_bases,p=(1-mut_rateseq,mut_rateseq))
    hit_positions=np.flatnonzero(hit_flags)
    return(hit_positions if hit_positions.size else False)

In [16]:
def mutation_wrapper(orgarr,mut_rateseq):
    """Produce one (possibly mutated) offspring of the first organism in ``orgarr``.

    Parameters
    ----------
    orgarr : numpy.ndarray
        A population as produced by founder_miner(); only ``orgarr[0]`` is used
        and it is deep-copied first, so the parent is left untouched.
    mut_rateseq : float
        Mutation probability per base.

    Returns
    -------
    numpy.ndarray
        A one-organism population with the same ten slots as the parent and the
        generation number increased by one. Without mutations the parent's
        regulatory data, development and fitness are carried over.
    """
    orgarrcp=cp.deepcopy(orgarr[0])
    in_gen_num=orgarrcp[0]
    in_genome=orgarrcp[1]
    in_proteome=orgarrcp[2]
    in_grn=orgarrcp[3]
    in_thresh=orgarrcp[4]
    in_decs=orgarrcp[5]
    in_start_vect=orgarrcp[6]
    in_dev=orgarrcp[7]
    in_genes_on=(in_dev.sum(axis=0) != 0).astype(int)
    in_fitness=orgarrcp[9]
    mutations=randomMutations(in_genome,mut_rateseq)
    # randomMutations() returns False when nothing mutated. Testing the values
    # with np.any() would also discard a lone mutation at base position 0.
    has_mutations=isinstance(mutations,np.ndarray) and mutations.size > 0
    if has_mutations:
        mut_coords=codPos(mutations,in_genome.shape[0],in_genome.shape[1])
        out_genome,out_proteome,mutlocs=mutate_genome(in_genome,in_proteome,mut_coords)
        out_grn,out_thresh,out_decs=regulator_mutator(in_grn,in_genes_on,in_decs,in_thresh,mutlocs)
        out_dev=develop(in_start_vect,out_grn,out_decs,out_thresh,pf.dev_steps)
        out_fitness=calcFitness(out_dev)
    else:
        out_genome=in_genome
        out_proteome=in_proteome
        out_grn=in_grn
        out_thresh=in_thresh
        out_decs=in_decs
        out_dev=in_dev
        out_fitness=in_fitness
    out_genes_on=(out_dev.sum(axis=0) != 0).astype(int)
    out_gen_num=in_gen_num+1
    # Slot 6 is the start vector; it is inherited unchanged from the parent.
    out_org=np.array([[out_gen_num,out_genome,out_proteome,out_grn,out_thresh,out_decs,in_start_vect,out_dev,out_genes_on,out_fitness]],dtype=object)
    return(out_org)

In [17]:
def pointMutateCodon(codon,pos_to_mutate):
    """Return ``codon`` with the base at ``pos_to_mutate`` replaced by a different, randomly chosen base."""
    current_base=codon[pos_to_mutate]
    alternatives=[nt for nt in ("T","C","A","G") if nt != current_base]
    letters=list(codon)
    letters[pos_to_mutate]=np.random.choice(alternatives)
    return("".join(letters))

### GRN mutation

##### weight_mut():
This function was built for the special case of when all the genes are expressed in the genome, so no *actual* synonymous mutation can be done. In such cases, I'll assume that a synonymous mutation is simply a **very minor** change in the GRN, even in an interaction that exists already. However, I realized that this function can be generalized to do any change, using the arguments to control it. For example, non-synonymous changes could also use the same function, but with a higher scale value and activations can be done by giving as a value the average weight, and adding a small percentage as a scaler.

For this, _weight_mut()_ takes in the value and returns it plus a random offset drawn from a uniform distribution that goes from -|value|·_scaler_ to +|value|·_scaler_, where _scaler_ is a scaling factor that by default is 0.01 (i.e. 1/_x_ with _x_ = 100).

So, a synonymous change in weights shifts the current value by a random amount that is smaller than a hundredth of that value. Links that are 'off' (=0) have no magnitude to scale, so for them the function uses 1 as the base magnitude instead.

In [18]:
def weight_mut(value,scaler=0.01):
    """Return ``value`` shifted by a random amount proportional to its magnitude.

    The shift is drawn uniformly between ``-|value|*scaler`` and
    ``+|value|*scaler``. A ``value`` of exactly 0 has no magnitude to scale, so
    1 is used as the base magnitude; this lets zeroed weights or thresholds
    become nonzero.

    Parameters
    ----------
    value : float
        The weight to mutate.
    scaler : float
        Maximum size of the shift relative to the base magnitude.
    """
    magnitude=abs(value)
    if magnitude == 0:
        # Use the constant directly: deriving it as scaler/scaler would fail
        # for a scaler of 0.
        magnitude=1.0
    half_width=magnitude*scaler
    newVal=value+np.random.uniform(-half_width,half_width)
    return(newVal)

In [19]:
def threshs_and_decs_mutator(in_thresh,in_dec,mutarr):
    """Mutate expression thresholds and/or decay rates of the listed genes.

    For every row of ``mutarr`` a fair coin decides whether the gene's
    threshold or its decay rate is changed; the chosen value is replaced by the
    absolute value of ``weight_mut(value)``.

    Parameters
    ----------
    in_thresh, in_dec : numpy.ndarray
        Per-gene thresholds and decay rates. They are copied, so the caller's
        arrays are not modified.
    mutarr : numpy.ndarray
        Mutation table whose first column holds the gene numbers to mutate.

    Returns
    -------
    tuple
        ``(thresholds, decays)`` after mutation.
    """
    # Work on copies; index 0 = thresholds, index 1 = decays, so the coin toss
    # below can be used directly as an index.
    targets=(cp.deepcopy(in_thresh),cp.deepcopy(in_dec))
    genes=mutarr[:,0]
    for i in np.arange(len(genes)):
        tuple_idx=np.random.choice((0,1))
        gene_num=genes[i]
        targets[tuple_idx][gene_num]=abs(weight_mut(targets[tuple_idx][gene_num]))
    out_thresh,out_decs=targets
    return(out_thresh,out_decs)

### Regulation mutation function
This function looks quite big because it's in charge of translating the sequence mutations into mutations in any of the regulatory interactions (GRN weights, gene decay rates, gene expression thresholds).
Inputs:
* A parental organism (but could be just the GRN)
* The 'muttype_vect' array
    * Came out of the genome mutator function
        * For each mutation:
            * First col: gene number
            * Second col: type of mutation:
                * 0=nonsense
                * 1=non-synonymous
                * 2=synonymous mutations
                
First off, it decides with a biased coin toss whether any of the mutated genes passed in the muttype_vect array will be mutated in their thresholds or decays. The coin toss is biased because the thresholds and decay rates represent only a small fraction of the regulatory interactions present. Precisely, they represent only 2N/(2N+N^2) of the interactions, where N is the number of genes. In this function, I've algebraically simplified the expression to 2/(2+N). If it ends up choosing any number of mutations to be in the thresholds or the decay rates, it calls the "threshs_and_decs_mutator()" function (declared above), and sends the chosen mutations over there, while removing them from the original muttype_vect, so as not to mutate the same genes again in the GRN.

Then, it goes through each remaining entry in the muttype_vect and mutates the regulatory link/weight according to the following set of rules:
* If the mutation is nonsense ('0'):
    1. Multiply the column and the row of that gene by 0
* If the gene is ON:
    1. Identify all the nonzero values for that gene
    2. If the mutation is non-synonymous ('1'):
        * mutate a random nonzero value with weight_mut(orig_value,0.5). weight_mut() mutates _orig_value_ by adding or subtracting any amount in between the -_p_ and +_p_, _p_ being in this case 0.5, or __half__ the amount of _orig_value_.
    3. If the mutation is synonymous ('2'):
        * mutate a random nonzero a tiiiiiny little with weight_mut(orig_value,0.001). See? here the mutation will never be more than a thousandth of the value. Tiiiiiiiiiny.
    4. Otherwise return 'None' (in case some botched code sends a value that's neither 0,1,or 2)
* If the gene is OFF:
    1. If the mutation is non-synonymous ('1'):
        * identify all __zero/inactive__ values for that gene
        * turn a random *inactive* value __ON__, choosing the number from the mean expressed value, and randomly choosing the sign
    2. If the mutation is synonymous ('2'):
        * get __all__ values for that gene
        * choose one at random, and mutate it with weight_mut(orig_value,0.5)
    3. Otherwise also return 'None'
* Return the modified grn, decay rates vector, and expression thresholds vector.


In [20]:
# Human-readable labels for the mutation type codes: 0 = nonsense, 1 = non-synonymous, 2 = synonymous.
mut_kinds=np.array(["nonsense","non-synonymous","synonymous"])

In [21]:
def _links_touching_gene(link_mask,gene):
    """Return the [row, col] coordinates flagged in ``link_mask`` that involve ``gene``.

    Matches in the gene's column come first, then matches in its row, so a
    flagged self-link (gene, gene) is listed twice. The result always has two
    columns, even when it is empty.
    """
    coords=np.argwhere(link_mask)
    return(np.concatenate((coords[coords[:,1] == gene],coords[coords[:,0] == gene]),axis=0))


def regulator_mutator(in_grn,genes_on,in_dec,in_thresh,muttype_vect):
    """Translate sequence mutations into changes of the regulatory data.

    Each mutation first gets a chance of 2/(2+N) (N = number of genes) of
    hitting the gene's threshold or decay rate instead of the GRN. The
    remaining mutations change the GRN:

    * nonsense (0): the gene's row and column are zeroed and the gene is
      marked as off for the rest of the call;
    * gene ON: one of the gene's nonzero links is rescaled with weight_mut(),
      by up to 50% (non-synonymous, 1) or 0.1% (synonymous, 2);
    * gene OFF, non-synonymous: one of the gene's zero links is switched on at
      the mean absolute weight of the GRN, with a random sign;
    * gene OFF, synonymous: any one link of the gene is changed with
      weight_mut(value, 0.5).

    A mutation that has no candidate link to act on is skipped. None of the
    inputs is modified.

    Parameters
    ----------
    in_grn : numpy.ndarray
        Square matrix of regulatory weights.
    genes_on : numpy.ndarray
        Per-gene flags, nonzero where the gene is expressed.
    in_dec, in_thresh : numpy.ndarray
        Per-gene decay rates and expression thresholds.
    muttype_vect : numpy.ndarray
        One ``[gene, mutation type]`` row per mutation.

    Returns
    -------
    tuple
        ``(grn, thresholds, decays)`` after mutation.
    """
    curr_grn=cp.deepcopy(in_grn)
    curr_thr=cp.deepcopy(in_thresh)
    curr_genes_on=cp.deepcopy(genes_on)
    curr_dec=cp.deepcopy(in_dec)
    curr_muttype_vect=cp.deepcopy(muttype_vect)
    num_genes=curr_grn.shape[0]
    # Thresholds and decays are 2N of the 2N+N^2 mutable values, i.e. 2/(2+N).
    prop=2/(2+num_genes)
    hits=np.nonzero(np.random.choice((0,1),len(curr_muttype_vect),p=(1-prop,prop)))[0]
    if hits.size > 0:
        # Hand over the copies so the caller's arrays stay untouched, then drop
        # those mutations so they are not applied to the GRN as well.
        out_threshs,out_decs=threshs_and_decs_mutator(curr_thr,curr_dec,curr_muttype_vect[hits])
        curr_muttype_vect=np.delete(curr_muttype_vect,hits,axis=0)
    else:
        out_threshs,out_decs=curr_thr,curr_dec
    for gene,mtype in curr_muttype_vect:
        if mtype == 0: # knock-out
            curr_grn[gene,:]=0
            curr_grn[:,gene]=0
            curr_genes_on[gene]=0 # later mutations of this gene must see it as off
            continue
        if mtype not in (1,2): # unknown mutation code: nothing to do
            continue
        gene_is_on=bool(curr_genes_on[gene])
        if not gene_is_on and mtype == 1:
            # Switch on one of the gene's silent links.
            candidates=_links_touching_gene(curr_grn == 0,gene)
            active_weights=curr_grn[np.nonzero(curr_grn)]
            if len(candidates) == 0 or active_weights.size == 0:
                continue # no silent link, or no active weight to take the mean of
            coordinates=tuple(candidates[np.random.choice(len(candidates))])
            sign=np.random.choice((-1,1))
            curr_grn[coordinates]=np.mean(np.abs(active_weights))*sign
            continue
        if gene_is_on:
            candidates=_links_touching_gene(curr_grn != 0,gene)
            scaler=0.5 if mtype == 1 else 0.001
        else:
            # Gene is off and the mutation synonymous: any link of the gene may change.
            candidates=_links_touching_gene(np.ones(curr_grn.shape,dtype=bool),gene)
            scaler=0.5
        if len(candidates) == 0:
            continue
        coordinates=tuple(candidates[np.random.choice(len(candidates))])
        curr_grn[coordinates]=weight_mut(curr_grn[coordinates],scaler)
    out_grn=cp.deepcopy(curr_grn)
    return(out_grn,out_threshs,out_decs)

## Development function
Gets the GRN and simulates the development of it through an iterative matrix dot product.

In [22]:
def develop(start_vect,grn,decays,thresholds,dev_steps):
    """Simulate development by iterating the GRN on a gene-expression vector.

    At every step the current expression decays by ``exp(-decays)``, the
    regulatory effect ``grn @ decayed`` is added to the decayed values, and
    every gene whose result is not strictly above its threshold is set to 0.

    Parameters
    ----------
    start_vect : array-like
        Initial expression level of each gene.
    grn : numpy.ndarray
        Square matrix of regulatory weights.
    decays, thresholds : numpy.ndarray
        Per-gene decay constants and expression thresholds.
    dev_steps : int
        Number of developmental steps to simulate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dev_steps + 1, number of genes)``; row 0 is
        ``start_vect`` and row ``i`` the expression after step ``i``.
    """
    invect=np.asarray(start_vect,dtype=float)
    # Size the profile from the arguments actually used by the loop, and start
    # from zeros so that no row can be left uninitialised.
    geneExpressionProfile=np.zeros((dev_steps+1,invect.size))
    geneExpressionProfile[0]=invect
    decay_factor=np.exp(-np.asarray(decays,dtype=float)) # constant across steps
    for step in range(dev_steps):
        decayed_invect=invect*decay_factor # apply decay to all gene quantities
        exp_change=np.matmul(grn,decayed_invect) # regulatory effect of the decayed values
        pre_thresholds=exp_change+decayed_invect
        # Keep only the genes that exceed their threshold; the rest drop to 0.
        currV=pre_thresholds*(pre_thresholds > thresholds)
        geneExpressionProfile[step+1]=currV
        invect=currV
    return(geneExpressionProfile)

### Fitness functions
The function that calculates the fitness of an organism based on its development, and the accessory functions that calculate parts of it.

In [23]:
def calcFitness(development):
    """Return the fitness of an organism from its development array.

    An organism whose last gene is not expressed (see lastGeneExpressed(),
    using ``pf.min_reproducin``) has fitness 0. Otherwise fitness is the mean
    of the proportion of genes on, the expression stability and the
    exponential similarity.
    """
    if not lastGeneExpressed(development,pf.min_reproducin):
        return(0)
    components=[
        propGenesOn(development),
        expressionStability(development),
        exponentialSimilarity(development),
    ]
    return(np.mean(components))
def lastGeneExpressed(development,min_reproducin):
    """Tell whether the last gene was sufficiently expressed.

    Returns True when the last gene's expression exceeds ``min_reproducin`` at
    some step and is still above 0 at the final step, False otherwise.
    """
    n_steps,n_genes=development.shape
    last_gene=development[:,n_genes - 1]
    if not (last_gene > min_reproducin).any():
        return(False)
    return(bool(development[n_steps - 1,n_genes - 1] > 0))
def propGenesOn(development):
    """Return the fraction of genes whose summed expression over development is positive."""
    column_totals=development.sum(axis=0)
    return((column_totals > 0).mean())
def expressionStability(development):
    """Return the standard deviation of total expression per step, relative to its range.

    The total expression of each developmental step (row sum) is computed and
    the standard deviation of those totals is divided by their range
    (max - min). When every step has the same total, the range is 0 and 0.0 is
    returned instead of the NaN that 0/0 would give.

    NOTE(review): an earlier comment on this function said "Less = better",
    but calcFitness() averages this value in directly, so a larger value
    raises fitness - confirm the intended direction.
    """
    row_sums=development.sum(axis=1)
    spread=row_sums.max() - row_sums.min()
    if spread == 0:
        return(0.0)
    stab_val=row_sums.std() / spread
    return(stab_val)
def exponentialSimilarity(development):
    """Return the R^2 of a straight-line fit to the log of mean expression per step.

    The mean expression of every developmental step is log-transformed and
    regressed against the step number; the squared correlation coefficient of
    that fit is returned.
    """
    n_steps,_=development.shape
    log_means=np.log(development.mean(axis=1))
    regression=scipy.stats.linregress(range(n_steps),log_means)
    return(regression.rvalue ** 2)

### Chapter 5: Population-making functions.
These functions are meant to create a population from an arbitrary number of organisms. Thus, grow_pop() takes as input an array of organism arrays, the final total number of organisms in the population, and the reproductive strategy: 'equal' means that each organism will be reproduced in equal amounts to fill up the total number of final individuals; 'fitness_linked' is meant to let the organisms' ranking by fitness value determine the proportion of their offspring present in the final population (this strategy is not implemented yet: the code only prints a message for it); and [insert any other desired strategy here].

In [24]:
def grow_pop(in_orgs,out_pop_size,strategy='equal',mut_rate=0.00001):
    """Grow a population from the given parent organisms.

    Every parent produces ``out_pop_size // number_of_parents`` offspring
    through mutation_wrapper(); dead offspring (fitness 0) are then removed by
    cleanup_deads(), so the result can be smaller than ``out_pop_size``.

    Parameters
    ----------
    in_orgs : numpy.ndarray
        Population, i.e. an array whose first dimension indexes organisms.
    out_pop_size : int
        Requested number of offspring before dead ones are removed.
    strategy : str
        ``'equal'`` gives every parent the same number of offspring.
        ``'fitness_linked'`` is accepted but does not weight by fitness yet.
    mut_rate : float
        Per-base mutation probability handed to mutation_wrapper().

    Returns
    -------
    numpy.ndarray
        Object array of the surviving offspring organisms.

    Raises
    ------
    ValueError
        If ``strategy`` is not recognised.
    NotImplementedError
        If ``'fitness_linked'`` is requested for more than one parent.
    """
    in_orgs=cp.deepcopy(in_orgs)
    num_in_orgs=in_orgs.shape[0]
    orgs_per_org=np.array([np.floor_divide(out_pop_size,num_in_orgs)])
    corr_pop_size=orgs_per_org*num_in_orgs # size actually reachable with equal shares
    if strategy == 'equal':
        orgs_per_org=np.repeat(orgs_per_org,num_in_orgs)
    elif strategy == 'fitness_linked':
        print("Reproduction is fitness bound.")
        # No fitness weighting exists yet; a single parent simply receives all
        # offspring, more parents cannot be served.
        if num_in_orgs > 1:
            raise NotImplementedError("'fitness_linked' reproduction is not implemented for more than one organism")
    else:
        print(f"Reproductive strategy {strategy} not recognized")
        raise ValueError("Invalid reproductive strategy")
    counter=0
    out_pop=np.ndarray((corr_pop_size[0],),dtype=object)
    for k in range(num_in_orgs): # each parent contributes its share of offspring
        num_offsp=orgs_per_org[k]
        # mutation_wrapper() reproduces the first organism of what it is given,
        # so hand it a one-organism slice holding parent k.
        parent=in_orgs[k:k+1]
        for i in range(num_offsp):
            out_pop[counter]=mutation_wrapper(parent,mut_rate)[0]
            counter=counter+1
    out_pop=cleanup_deads(out_pop) # removing any dead organisms.
    return(out_pop)

In [25]:
def clean_exit():
    """Save the whole interpreter session with dill to a timestamped file in the current directory."""
    stamp=datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
    filename="".join(("./EvolRun_",stamp,".dill"))
    dill.dump_session(filename)
    print(f"Your session was saved in {filename}.")

In [26]:
def cleanup_deads(in_pop):
    """Return a copy of the population without its dead organisms.

    An organism counts as dead when its fitness (element [9]) is zero.
    The input is never modified: the function works on a deep copy.

    Returns the full copy when nobody is dead, the surviving subset when
    some are dead, and an empty array (after printing a notice) when the
    whole population is dead.
    """
    pop_copy=cp.deepcopy(in_pop)
    fitness_vals=np.array([ organism[9] for organism in pop_copy ])
    alive_idcs=np.nonzero(fitness_vals)[0]
    num_alive=alive_idcs.size
    if num_alive == pop_copy.shape[0]:
        # Everybody is alive (this also covers an already-empty population).
        return(pop_copy)
    if num_alive == 0:
        print("Your population went extinct. Sorry for your loss.")
        return(np.array([]))
    return(pop_copy[alive_idcs])

In [27]:
def select(in_pop,p=0.1,strategy='high pressure'):
    """Pick the organisms that survive to reproduce.

    Parameters
    ----------
    in_pop : numpy array
        Population; element [9] of each organism is its fitness.
    p : float
        Proportion of the population that survives. The resulting count is
        rounded down, but kept between 1 and the population size.
    strategy : str
        'high pressure'   - the fittest organisms survive.
        'low pressure'    - survivors are drawn at random from the fitter
                            half of the population (only when p < 0.5,
                            otherwise it falls back to 'totally relaxed').
        'totally relaxed' - survivors are drawn at random.

    Returns
    -------
    numpy array
        A (deep-copied) sub-population with the survivors. An empty input
        population is returned as an empty copy.

    Raises
    ------
    ValueError
        If the strategy name is not recognized (this used to surface as an
        UnboundLocalError further down).
    """
    if strategy not in ("high pressure","low pressure","totally relaxed"):
        raise ValueError(f"Invalid selection strategy: {strategy!r}")
    in_pop=cp.deepcopy(in_pop)
    pop_size=in_pop.shape[0]
    if pop_size == 0:
        # Nothing to select from.
        return(in_pop)
    # Rounding down to zero survivors used to behave inconsistently: the
    # slice [-0:] kept the *whole* population under high pressure, while the
    # random strategies returned nobody. Keep at least one survivor, and never
    # ask for more survivors than there are organisms.
    num_survivors=min(pop_size,max(1,int(pop_size*p)))
    if strategy == "high pressure":
        fitnesses=np.array([ x[9] for x in in_pop ])
        out_idcs=np.argpartition(fitnesses,-num_survivors)[-num_survivors:] # returns the **indices** for the top 'num_survivors' fitnesses.
    elif strategy == "low pressure" and p < 0.5:
        fitnesses=np.array([ x[9] for x in in_pop ])
        # The candidate pool must be able to supply every survivor.
        half=max(num_survivors,pop_size//2)
        top_half=np.argpartition(fitnesses,-half)[-half:]
        out_idcs=np.random.choice(top_half,num_survivors,replace=False)
    else:
        if strategy == "low pressure":
            print(f"Low pressure strategy is not recommended for offspring populations\nresulting from more than half the parental population\nDefaulting to total relaxation of selection...")
        out_idcs=np.random.choice(range(pop_size),num_survivors,replace=False)
    return(in_pop[out_idcs])
    

In [28]:
def randsplit(in_pop,out_pop_size):
    """Randomly split a population in two and grow each half into a lineage.

    The organisms are divided at random into two disjoint groups (the first
    gets half of them, rounded down, the second gets the rest) and each group
    is expanded to `out_pop_size` with grow_pop(..., 'equal').

    Previously the two index sets were computed but never used, so both
    lineages were grown from the complete input population.

    A population with fewer than two organisms cannot be split; in that case
    both lineages are grown from the whole population.

    Returns
    -------
    tuple
        (lineage_a, lineage_b)
    """
    in_pop=cp.deepcopy(in_pop)
    inpopsize=in_pop.shape[0]
    if inpopsize < 2:
        return(grow_pop(in_pop,out_pop_size,'equal'),grow_pop(in_pop,out_pop_size,'equal'))
    idcs_lina=np.random.choice(range(inpopsize),inpopsize//2,replace=False)
    # Everything not drawn for lineage A goes to lineage B.
    idcs_linb=np.setdiff1d(np.arange(inpopsize),idcs_lina)
    lina=grow_pop(in_pop[idcs_lina],out_pop_size,'equal')
    linb=grow_pop(in_pop[idcs_linb],out_pop_size,'equal')
    return(lina,linb)

In [29]:
def _evolve_branches(branches,n_gens):
    """Run branch_evol on every population in `branches`, one worker process per branch."""
    if __name__ != "__main__":
        # The original guard simply skipped the pool here and then failed on an
        # unbound variable; fail with an explicit message instead.
        raise RuntimeError("main() has to be run from the top-level script or notebook to start worker processes")
    with ProcessPoolExecutor() as pool:
        # Plain lists are used on purpose: wrapping populations of different
        # sizes in np.array() would fail or reshape them.
        return(list(pool.map(branch_evol,branches,[n_gens]*len(branches))))

def main(n_gens=10000):
    """Run the whole branching evolution experiment and return every stage.

    A founder population is split into two lineages, each lineage is evolved
    for `n_gens` generations, and each resulting tip is split and evolved
    again, giving four final lineages.

    Parameters
    ----------
    n_gens : int
        Number of generations to evolve each branch for.

    Returns
    -------
    numpy array of 13 populations
        [0]     founder population
        [1-2]   first-level stem lineages
        [3-4]   first-level tip lineages
        [5-6]   stems split from tip [3]
        [7-8]   stems split from tip [4]
        [9-12]  final tip lineages, in the order of stems [5-8]
        (Previously nothing was returned, so the results were lost.)
    """
    results_array=np.ndarray(13,dtype=object)
    founder=founder_miner()
    founder_pop=grow_pop(founder,pf.pop_size,'equal')
    results_array[0]=cp.deepcopy(founder_pop)

    stem_lin1,stem_lin2=randsplit(founder_pop,pf.pop_size)
    results_array[1]=cp.deepcopy(stem_lin1)
    results_array[2]=cp.deepcopy(stem_lin2)

    tip_lin1,tip_lin2=_evolve_branches([stem_lin1,stem_lin2],n_gens)
    results_array[3]=tip_lin1
    results_array[4]=tip_lin2

    stem_lin3,stem_lin4=randsplit(tip_lin1,pf.pop_size)
    results_array[5],results_array[6]=cp.deepcopy(stem_lin3),cp.deepcopy(stem_lin4)
    stem_lin5,stem_lin6=randsplit(tip_lin2,pf.pop_size)
    results_array[7],results_array[8]=cp.deepcopy(stem_lin5),cp.deepcopy(stem_lin6)

    final_tips=_evolve_branches([stem_lin3,stem_lin4,stem_lin5,stem_lin6],n_gens)
    for slot,tip in zip(range(9,13),final_tips):
        results_array[slot]=tip
    return(results_array)

In [30]:
def branch_evol(in_pop,ngens):
    """Evolve a population for up to `ngens` generations of selection and reproduction.

    Each generation selects survivors with select() and grows them back to
    pf.pop_size with grow_pop(), using the strategies configured in the
    params file. The input population is not modified.

    Evolution stops as soon as the population is empty. Extinction used to be
    checked only before the first generation, so a population dying mid-run
    was still pushed through every remaining generation.

    Returns
    -------
    numpy array
        The population after the last generation that was run (empty if the
        lineage went extinct).
    """
    in_pop=cp.deepcopy(in_pop)
    for gen in np.arange(ngens):
        if not in_pop.size:
            # Extinct lineage: nothing left to select or reproduce.
            break
        print(f"producing generation {gen}")
        survivors=select(in_pop,pf.prop_survivors,pf.select_strategy)
        in_pop=grow_pop(survivors,pf.pop_size,pf.reproductive_strategy)
    return(in_pop)

### Helper functions: doing multi-core analyses...
These functions are from a course I did with the ACRC at the UoB, on parallelizing python. They're meant to prove a point and show an example, not blow anyone's mind.

In [None]:
def slow_add(nsecs, x, y):
    """Sleep for `nsecs` seconds, then return x + y.

    Demo helper for the multi-process examples: it reports the id of the
    process it runs in before and after sleeping.
    """
    print(f"Process {os.getpid()} going to sleep for {nsecs} second(s)")
    time.sleep(nsecs)

    total = x + y
    print(f"Process {os.getpid()} waking up")
    return total

In [None]:
# Inputs for the parallel demo: two populations and the number of generations
# each one should be evolved for.
# NOTE(review): pop1 and pop2 are not defined in any cell visible here - confirm
# they exist when running on a fresh kernel.
inpops=np.array([pop1,pop2])
ingens=np.array([10,5])

In [None]:
# Evolve both demo populations at the same time, one worker process each:
# pool.map pairs every population in inpops with its generation count in ingens.
if __name__ == "__main__":
    with ProcessPoolExecutor() as pool:
        result = pool.map(branch_evol,inpops,ingens)
        
    # Collect the evolved populations, in the same order as the inputs.
    ca1,ca2=np.array(list(result),dtype=object)

producing generation 0
producing generation 0
producing generation 1
producing generation 1
producing generation 2
producing generation 2
producing generation 3
producing generation 3
producing generation 4
producing generation 4
producing generation 5
producing generation 6
producing generation 7
producing generation 8
producing generation 9


In [None]:
# Is element [1] (the genome, per the organism layout) of organism 9 identical in the two evolved lineages?
np.all(ca2[9][1] == ca1[9][1])

False

# TESTING GROUND:
### Make the following steps into a function that produces a population
**[ x[9] for x in F1_pop_array[:] ]**  to access element [9] of all members of the population 'F1_pop_array'

In [31]:
# Run the full experiment and keep whatever main() returns.
result_arr=main()

producing generation 0
producing generation 0
producing generation 1
producing generation 1
producing generation 2
producing generation 2
producing generation 3
producing generation 3


KeyboardInterrupt: 

In [None]:
# Element [9] (the fitness, per the organism layout) of the first organism in org1.
# NOTE(review): org1 is not defined in any cell visible here.
org1[0][9]

0.5566625185147221

array([ 0.        ,  0.        ,  0.        ,  1.00109397,  0.        ,
        0.        ,  0.        , -0.42804692,  0.        ,  0.        ])

In [None]:
# Grow org1 into a population of pf.pop_size with equal reproduction and check how many organisms came out.
popx=grow_pop(org1,pf.pop_size,'equal')
popx.shape

(100,)

In [None]:
# Split popx into two lineages, each grown to pf.pop_size.
popy,popz=randsplit(popx,pf.pop_size)

In [None]:
# Check whether the two lineages compare equal anywhere.
np.any(popy == popz)

  np.any(popy == popz)


False

In [None]:
# For every organism in tip, record whether its element [1] (the genome, per the
# organism layout) is identical to that of the first organism in tip.
# NOTE(review): tip is not defined in any cell visible here.
result=np.ndarray(tip.shape[0],dtype=object)
for i in np.arange(tip.shape[0]):
    result[i]=np.all(tip[0][1] == tip[i][1])
# Indices of the organisms whose genome differs from the first one.
l8r_offsp=np.where(result == False)[0]

In [None]:
# An empty array, the same value cleanup_deads returns for an extinct population.
arr=np.array([])

In [None]:
# The size of an empty array is 0, which is what branch_evol tests with `if in_pop.size`.
arr.size

0

In [None]:
# Draw 5 values at random from 0..19 (replace is left at its default here).
x=np.array(range(20))
np.random.choice(x,5)

array([12,  9,  4,  0,  3])

In [None]:
# Positions at which element [1] (the genome, per the organism layout) of organism 4 is the same in tip and pop1.
# NOTE(review): tip and pop1 are not defined in any cell visible here.
np.where(tip[4][1] == pop1[4][1])

(array([0, 0, 0, ..., 9, 9, 9]), array([  0,   1,   2, ..., 330, 331, 332]))

In [None]:
# Indices of the non-zero entries of fitnesses, the same call cleanup_deads uses to find live organisms.
# NOTE(review): fitnesses is not defined at notebook level in any cell visible here.
np.nonzero(fitnesses)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [None]:
# Fitness values at the indices held in z.
# NOTE(review): z is not defined in any cell visible here - presumably an index array; confirm.
fitnesses[z]

array([0.55773542, 0.55773542, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55773542, 0.55773542,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.56278479, 0.55773542, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55773542, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55773542, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55773542, 0.55787606,
       0.55787606, 0.55787606, 0.55787606, 0.55787606, 0.55787

In [None]:
# Same index selection as the 'high pressure' branch of select(): indices of the
# num_survivors largest fitness values (here 1% of 1000).
num_survivors=int(1000*0.01)
np.argpartition(fitnesses,-num_survivors)[-num_survivors:]

array([337, 754, 727, 278, 276, 389, 193, 587, 642, 930])

In [None]:
# Element [7] (the development, per the organism layout) of the first organism in y.
# NOTE(review): y is not defined in any cell visible here.
y[0][7]

array([[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         1.00000000e+00],
       [ 0.00000000e+00,  4.90802077e+00,  1.92032851e+00,
         2.72168102e+00,  1.28491513e+00,  2.16789378e+00,
         4.49500815e+00,  1.78714166e+00,  1.32005717e+00,
         3.42566426e+00],
       [-0.00000000e+00,  1.16438972e+01,  6.72315950e+00,
         1.74215939e+00,  6.35888774e+00,  0.00000000e+00,
         1.28271295e+01,  2.79855190e+00,  2.02688099e+00,
         8.08654564e+00],
       [-0.00000000e+00,  3.01725195e+01,  1.67495382e+01,
         5.87656074e+00,  1.93338949e+01,  0.00000000e+00,
         3.98479476e+01,  7.80389738e+00,  2.62741691e+00,
         2.61244769e+01],
       [-0.00000000e+00,  8.90134016e+01,  5.28381176e+01,
         1.62815265e+01,  6.12510852e+01,  6.14982045e+00,
         1.22367573e+02,  2.30224115e+01,  6.42979188e+00,
         7.

In [None]:
# Fitness (element [9]) of every organism in subsel.
# NOTE(review): subsel is not defined in any cell visible here.
[x[9] for x in subsel]

[0.7488968851985286,
 0.748906839833955,
 0.7489042964594449,
 0.7489314685263534,
 0.7489549086396666,
 0.7490591115637336,
 0.7488971958022853,
 0.7489151224345402,
 0.7489176914659404,
 0.7489439055554271]

In [None]:
# Toy index array 0..9 for trying out a random split.
x=np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# Draw 4 distinct indices from x (replace=False forbids repeats).
c=np.random.choice(x,4,replace=False)
c

array([3, 6, 5, 7])

In [None]:
# The complement: every index in x that was not drawn into c.
[ g for g in x[:] if g not in c ]

[0, 1, 2, 4, 8, 9]