In [17]:
# make sure to have pip installed in your system 
!pip install import_ipynb 
import import_ipynb 
import GLV_Mthd 





In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import numpy as np
import random 
import csv
import statistics as stat
import math
import time


In [None]:
# %load C:/Users/Patron/Desktop/model_project/Lab/genDataFromGLV/model_sim.py
#!/usr/bin/env python3
"""
Created on Thu Jun  4 10:52:38 2020

@author: chenzhengyi
"""



def a_matrices(aijs, num_species, num_selected, num_matrices, vari, save=False):
    '''Build a list of randomly perturbed gLV interaction-coefficient matrices.

    The unperturbed matrix is obtained from GLV_Mthd.to_matrix(aijs, num_species,
    num_selected) and converted to a numpy array. Every returned matrix is that
    array plus independent Gaussian noise with mean 0 and standard deviation
    vari * sigma, where sigma is the sample standard deviation
    (statistics.stdev) of all entries of the unperturbed matrix.

    Parameters:
        aijs: flat list of interaction coefficients, passed through to
            GLV_Mthd.to_matrix
        num_species: total number of species, passed through to GLV_Mthd.to_matrix
        num_selected: number of species used; the noise has shape
            (num_selected, num_selected), so the unperturbed matrix is presumably
            num_selected x num_selected as well -- TODO confirm against GLV_Mthd
        num_matrices: how many perturbed matrices to generate
        vari: level of variation, as a fraction of sigma (0 gives unperturbed copies)
        save: when True, write the unperturbed matrix to "./interm/gLV_aij.csv"
            as comma-separated values; the "./interm" directory is created if
            it does not exist yet

    Returns:
        list of num_matrices numpy arrays

    The noise is drawn from numpy's global random state, so call
    np.random.seed beforehand for reproducible results.
    '''
    import os  # local import: only needed for the optional save step

    # matrix with no perturbation
    a_matrix = np.array(GLV_Mthd.to_matrix(aijs, num_species, num_selected))
    if save:
        out_file = "./interm/gLV_aij.csv"
        # np.savetxt does not create missing parent directories
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        np.savetxt(out_file, a_matrix, delimiter=',')

    # stdev of all interaction coefficients
    sigma = stat.stdev(np.concatenate(a_matrix).flat)

    m_list = list()
    for _ in range(num_matrices):
        noise = np.random.randn(num_selected, num_selected) * vari * sigma
        m_list.append(np.add(noise, a_matrix))

    return m_list
    


def generateCommunity(total_size, minsz=1, maxsz=-1):
    '''Enumerate every community (species combination) within a size range.

    Species are labelled 0 .. total_size-1. A community is a tuple of species
    labels in increasing order.

    Parameters:
        total_size: total number of participating species
        minsz: smallest community size to include
        maxsz: largest community size to include; -1 means "use total_size"

    Returns:
        list of tuples, grouped by community size in increasing order; an empty
        list when minsz is larger than the effective maximum size
    '''
    upper = total_size if maxsz == -1 else maxsz
    members = range(total_size)
    return [
        combo
        for size in range(minsz, upper + 1)
        for combo in itertools.combinations(members, size)
    ]

def genCommunity_distr(dtr):
    '''Draw a random set of communities whose size mix follows `dtr`.

    Communities are combinations of 10 species labelled 0..9, split in three
    groups: small (2 or 3 members), medium (5 members), large (8 or 9 members).

    dtr: a list of 3 proportions [small, medium, large] wanted in the data set.

    For each group the function computes how many communities of every group
    would be needed if that group were used completely. Candidates that ask
    for more communities than a group contains are discarded; among the
    remaining candidates the one with the largest total is used (counts are
    rounded down). Communities are then sampled without replacement from each
    group with np.random.choice, so seed numpy's global random state for
    reproducible output.

    Returns a list of tuples ordered small, then medium, then large.
    '''
    species = list(range(10))

    def _combos(*member_counts):
        # all combinations of `species` for each of the requested sizes, in order
        found = []
        for count in member_counts:
            found.extend(itertools.combinations(species, count))
        return found

    pool = [_combos(2, 3), _combos(5), _combos(8, 9)]
    sz = np.array([len(group) for group in pool])

    # nums[i, j]: communities wanted from group j when group i is used in full
    nums = np.zeros((3, 3))
    for i in range(len(dtr)):
        unit = sz[i] / dtr[i]
        for j in range(len(dtr)):
            nums[i, j] = sz[i] if j == i else unit * dtr[j]

    # keep only the candidates that no group is too small to satisfy
    overdrawn = np.any(np.subtract(sz, nums) < 0, axis=1)
    nums = nums[~overdrawn]

    idx = np.argmax(np.sum(nums, axis=1)) if nums.shape[0] > 1 else 0
    finalSz = np.floor(nums[idx])

    community = []
    for group, quota in zip(pool, finalSz):
        picks = np.random.choice(len(group), int(quota), replace=False)
        community.extend(group[p] for p in picks)

    return community
            
        
    
        


In [None]:


# Settings that drive the data generation:
#   - num_select: number of participating species chosen from the 14
#   - number of mutated matrices
#   - level of variation used when sampling the mutated matrices' parameters
#   - community max and min member count (max <= num_select)
#   - output data max size
#   - file names to be saved


# ---- species information -----------------------------------------------------
num_species = 14
num_select = 10  # run the model with 10 species; can be raised up to 14

# 1-based species id -> species abbreviation
sp_dict = {
    1: 'CD', 2: 'ER', 3: 'DP', 4: 'FP', 5: 'BH', 6: 'CA', 7: 'PC',
    8: 'EL', 9: 'CH', 10: 'BO', 11: 'BT', 12: 'BU', 13: 'BV', 14: 'CS',
}

# species abbreviation -> plotting colour
color_map = {
    "ABB": "#1f78b4",
    "BH": "orange",
    "CA": "yellow",
    "BU": "teal",
    "PC": "#a6cee3",
    "BO": "#ff0000",
    "BV": "#b2df8a",
    "BT": "#33a02c",
    "EL": "#fb9a99",
    "FP": "#b31a1c",
    "CH": "#fdbf6f",
    "DP": "#ff7f00",
    "ER": "#cab2d6",
    "CS": "#6a3d9a",
    "CD": 'black',
}

# output path of the interaction-parameter matrix csv file
output_path = f"a_matrix_{num_select}.csv"


# ---- load the gLV parameters -------------------------------------------------
# The first column of the csv is read as one flat list. Every
# (num_species + 1)-th entry, starting with the first, is a growth rate; the
# remaining entries are interaction coefficients.
param_path = 'gLVParamAll.csv'
params_df = pd.read_csv(param_path, header=None, index_col=None)
aijs = params_df[0].tolist()
u_vector = aijs[::num_species + 1][:num_select]  # growth rates of the selected species
del aijs[::num_species + 1]  # drop the growth rates, leaving interaction coefficients only


# ---- other parameters used to generate the training set ----------------------
my_initial_condition = [0.01] * num_select  # initial OD of every selected species
my_time = list(range(72))


In [None]:
class GenData:
    """Generate simulated abundance data for a collection of communities.

    Attributes set by the constructor:
        uv: growth rate of every selected species
        initCond: initial condition of every selected species
        aijs: flat list of interaction coefficients (handed to a_matrices)
        community: list of communities, each a sequence of species indices
        n: number of selected species (number of output columns)
        mytime: time points handed to GLV_Mthd.simulate
        sp_dict: species id -> name mapping (stored, not used by genData)
    """

    def __init__(self, u_vector, initial_condition, aijs, community, n_selected, my_time, sp_dict):
        self.uv = u_vector
        self.initCond = initial_condition
        self.aijs = aijs
        self.community = community
        self.n = n_selected
        self.mytime = my_time
        self.sp_dict = sp_dict

    def genData(self, n_tr, vari, saveAij=False):
        '''Simulate communities and collect one output row per simulation.

        parameters:
            n_tr: number of rows (simulations) to produce; -1 means one row per
                community using a single coefficient matrix
            vari: level of variation forwarded to a_matrices
            saveAij: forwarded to a_matrices as `save`
        return:
            output: a n_tr * n_selected np array; row r holds, for every member
            of the simulated community, the simulated value read at row label
            self.mytime[-2]; columns of species not in the community stay 0

        Enough coefficient matrices are requested from a_matrices to cover n_tr
        rows; every matrix is combined with the communities in order until
        n_tr rows are filled. Uses the module-level `num_species`.

        NOTE(review): the value is read at self.mytime[-2] (the second-to-last
        time entry, used as a DataFrame row label) -- confirm this is the
        intended end point.
        '''
        rows_per_matrix = len(self.community)
        if n_tr == -1:
            n_tr = rows_per_matrix
            num_mtr = 1
        else:
            num_mtr = math.ceil(n_tr / rows_per_matrix)

        aml = a_matrices(self.aijs, num_species, self.n, num_mtr, vari, save=saveAij)
        output = np.zeros((n_tr, self.n))

        # every (matrix, community) pair in matrix-major order, capped at n_tr runs
        runs = ((coeffs, members) for coeffs in aml for members in self.community)
        for row, (coeffs, members) in enumerate(itertools.islice(runs, n_tr)):
            start = [self.initCond[s] for s in members]
            growth = np.array([self.uv[s] for s in members])
            # sub-matrix of the interactions among the community members
            interactions = coeffs[np.ix_(members, members)]

            sim_df = pd.DataFrame(GLV_Mthd.simulate(start, self.mytime, growth, interactions))
            for col, species in enumerate(members):
                output[row][species] = sim_df[col][self.mytime[-2]]

        return output
    
    
def communityDistrPipe(num_species, num_selected, dtr, mtr_vari, n_tr, nameIdx, save, outpath, saveAij_local=False):
    """Generate species end OD data for a given small/medium/large community mix.

    num_species: total number of species (14 based on Susan's data); kept for
        interface compatibility, not read by this function
    num_selected: number of distinct species participating in the modeling for
        all communities; this is the number of columns of the output
    dtr: a list of 3 proportions [small, medium, large] wanted in the data set
        (small: 2,3 member communities; medium: 5; large: 8,9)
    mtr_vari: level of variation of the gLV parameters when running the model
    n_tr: number of training rows to generate
    nameIdx: text used in the name of the saved file
    save: when True, write the output to
        outpath + 'outDtr<nameIdx>_<mtr_vari>v.csv' (comma separated)
    outpath: prefix of the saved file; it is concatenated as-is, so include a
        trailing path separator when it is a directory
    saveAij_local: when True, the gLV interaction coefficient matrix used to
        generate the raw data is saved as well

    Returns the generated data as a numpy matrix.

    Raises ValueError when a generated community refers to a species index
    that does not fit in num_selected columns.

    Growth rates, initial conditions, coefficients and time points are taken
    from the module-level u_vector, my_initial_condition, aijs and my_time.
    numpy's global random state is re-seeded with 10 on every call.
    """
    np.random.seed(10)
    community = genCommunity_distr(dtr)

    # Fail early with a clear message instead of an IndexError deep in the simulation loop.
    highest = max((max(cm, default=-1) for cm in community), default=-1)
    if highest >= num_selected:
        raise ValueError(
            f"community contains species index {highest}, "
            f"which does not fit num_selected={num_selected}"
        )

    # Use the caller's num_selected; the module-level num_select was used here
    # before, which silently ignored the argument.
    data2 = GenData(u_vector, my_initial_condition, aijs, community, num_selected, my_time, sp_dict)
    output = data2.genData(n_tr, mtr_vari, saveAij=saveAij_local)

    if save:
        cmInfo2 = outpath + f'outDtr{nameIdx}'
        outname2 = cmInfo2 + f'_{mtr_vari}v.csv'
        np.savetxt(outname2, output, delimiter=',', newline='\n')

    return output  # a np mtr



def modelPipe(num_species, num_select, cm_sz_min, cm_sz_max, mtr_vari, n_tr, saveAij_local=False):
    """Generate end OD data when no community size distribution is specified.

    num_species: total number of species (14 based on Susan's data); kept for
        interface compatibility, not read by this function
    num_select: number of distinct species participating in the modeling for
        all communities
    cm_sz_min and cm_sz_max: minimal and maximal number of species in each community
    mtr_vari: level of variation of the gLV parameters when running the model
    n_tr: number of training rows; when 0 < n_tr < number of possible
        communities, n_tr communities are sampled at random
    saveAij_local: when True, the gLV interaction coefficient matrix used to
        generate the raw data is saved as well

    Returns the generated data as a numpy 2d array and prints the time taken.

    Both sources of randomness are seeded with 10, so repeated calls with the
    same arguments select the same communities: numpy's global random state
    is re-seeded, and the community sub-sampling uses its own seeded
    generator (it leaves the global `random` state untouched).

    Growth rates, initial conditions, coefficients and time points are taken
    from the module-level u_vector, my_initial_condition, aijs and my_time.
    """
    seed = 10
    np.random.seed(seed)
    startgen = time.perf_counter()  # used to track time taken to generate the output

    # list of distinct communities, i.e. all species combinations in the size range
    community = generateCommunity(num_select, minsz=cm_sz_min, maxsz=cm_sz_max)

    # random sample of communities if a smaller sample size is required;
    # seeded locally because np.random.seed does not affect the random module
    if 0 < n_tr < len(community):
        community = random.Random(seed).sample(community, n_tr)

    data1 = GenData(u_vector, my_initial_condition, aijs, community, num_select, my_time, sp_dict)
    # a (number of training rows) * (num_select) matrix of output data (np 2d array)
    output = data1.genData(n_tr, mtr_vari, saveAij=saveAij_local)

    elapsed = time.perf_counter() - startgen
    timeStr2 = f'time used to generate data: {elapsed : .2f}'
    print(timeStr2)

    return output
    

  
