用Python实现随机产生k个个体的函数gen_individuals

In [1]:
def gen_individuals(k, gen_num, input_data, featureIdx, nMax=10):
    """Generate k individuals, each made of gen_num randomly built genes.

    Every gene is produced by one call to random_get_tree(input_data,
    featureIdx, nMax). Its evaluated values become column "g<j>" (j counted
    from 1) of the individual's DataFrame and its expression string is
    recorded alongside.

    Returns a dict with two parallel lists of length k:
      "df":   one pandas DataFrame per individual (columns g1..g<gen_num>)
      "gene": one list of expression strings per individual
    """
    frames = []
    expressions = []
    for _ in range(k):
        columns = {}
        exprs = []
        for pos in range(1, gen_num + 1):
            tree = random_get_tree(input_data, featureIdx, nMax)
            columns["g{}".format(pos)] = tree['f_value']
            exprs.append(tree['tree_exp'])
        frames.append(pd.DataFrame(columns))
        expressions.append(exprs)
    return {"df": frames, "gene": expressions}

In [2]:
def random_get_tree(input_data,featureIdx,nMax=10):
    """
    Randomly build a feature-expression tree over the raw features and evaluate it.

    input_data: object indexed via .iloc[:, idx-1] for every idx in featureIdx
                (e.g. a pandas DataFrame)
    featureIdx: indices of the raw features; the smallest valid index is 1
    nMax: maximum number of draws (with replacement) from the features
          in one call, 10 by default

    Returns a dict with
      "f_value":  the result of evaluating the generated expression
      "tree_exp": the expression with the "data." / ".values" decorations removed
    """
    # This local MUST stay named `data`: the generated expression strings refer
    # to `data.X<idx>.values` and are eval'ed inside this function's scope.
    data = pd.DataFrame({"X{}".format(idx): input_data.iloc[:, idx - 1].values for idx in featureIdx})

    # Number of features to draw, somewhere in 2..nMax
    N = random.choice(range(2, nMax + 1))

    # Coin flip between a full binary tree (1) and a one-sided tree (0);
    # it is drawn before the features themselves are sampled.
    use_full_tree = random.choice([0, 1]) == 1
    drawn = [random.choice(featureIdx) for _ in range(N)]

    if use_full_tree:
        # Pad with 0 placeholders up to the next power of two, then shuffle
        pad = int(2 ** np.ceil(np.log2(N)) - N)
        slots = drawn + [0] * pad
        random.shuffle(slots)
        leaves = []
        for idx in slots:
            leaves.append("data.X{}.values".format(idx) if idx > 0 else '0')
        tree_exp = gen_full_tree_exp(leaves)
    else:
        leaves = ["data.X{}.values".format(idx) for idx in drawn]
        tree_exp = gen_side_tree_exp(leaves)

    # The expression is generated internally from the operator names and the
    # `data` columns above, and is evaluated with this function's locals.
    f_value = eval(tree_exp)
    readable = tree_exp.replace("data.", "").replace(".values", "")
    return {"f_value": f_value, "tree_exp": readable}

In [3]:
# Build a one-sided (left-leaning) binary tree and render it as an expression
def gen_side_tree_exp(var_flag_array):
    """Fold a list of operand strings into one nested 'g(op,left,right)' string.

    The first two operands are combined with a random binary operator from
    two_group; the result is then combined with the third operand, and so on.
    Each operand (and each intermediate result) is passed through
    add_one_group, which may wrap it in a random unary operator.

    var_flag_array: non-empty sequence of operand strings. It is NOT modified.

    Returns the expression string.
    Raises ValueError if var_flag_array is empty.
    """
    if len(var_flag_array) == 0:
        raise ValueError("gen_side_tree_exp needs at least one operand")

    # Accumulate left to right without touching the caller's list
    acc = var_flag_array[0]
    for operand in var_flag_array[1:]:
        # Draw order: binary operator, then left wrapper, then right wrapper
        acc = 'g('+random.choice(two_group)+','+add_one_group(acc)+','+add_one_group(operand)+')'
    return add_one_group(acc)

In [4]:
# Build a full binary tree and render it as an expression
def gen_full_tree_exp(var_flag_array):
    """Combine operand strings level by level into one nested 'g(...)' string.

    On each level, element i is paired with element i + len//2:
      * both are real operands -> 'g(<binary op>,left,right)'
      * exactly one is the placeholder '0' -> 'g(<unary op>,operand)'
      * both are '0' -> the pair contributes nothing
    Operands may additionally be wrapped by add_one_group. The resulting list
    forms the next level, until a single expression remains.

    Because pairs of placeholders vanish, a level can have odd length; its
    last element has no partner and is carried over unchanged to the next
    level so that no subtree is dropped.

    var_flag_array: sequence of operand strings, '0' marking an empty slot.

    Returns the expression string.
    Raises ValueError if var_flag_array holds no real operand (empty or all '0').
    """
    if all(flag == '0' for flag in var_flag_array):
        raise ValueError("gen_full_tree_exp needs at least one non-'0' operand")
    if len(var_flag_array) == 1:
        # Nothing to pair with; halving a single element would never terminate
        return add_one_group(var_flag_array[0])

    half_n = len(var_flag_array)//2
    middle_array = []
    for i in range(half_n):
        left = var_flag_array[i]
        right = var_flag_array[i+half_n]
        if left == '0' and right != '0':
            middle_array.append('g('+random.choice(one_group)+','+add_one_group(right)+')')
        elif left != '0' and right == '0':
            middle_array.append('g('+random.choice(one_group)+','+add_one_group(left)+')')
        elif left != '0' and right != '0':
            middle_array.append('g('+random.choice(two_group)+','+add_one_group(left)+','+add_one_group(right)+')')

    # Odd length: the final element was not visited by the pairing loop
    if len(var_flag_array) % 2 == 1 and var_flag_array[-1] != '0':
        middle_array.append(var_flag_array[-1])

    if len(middle_array) == 1:
        return add_one_group(middle_array[0])
    return gen_full_tree_exp(middle_array)

In [5]:
# Randomly wrap an expression in a unary operator
def add_one_group(feature_string, prob=0.3):
    """Return feature_string, wrapped in a random unary operator with probability prob.

    A uniform draw in [0, 1) is compared against prob. When the draw is below
    prob, the result is 'g(<op>,<feature_string>)' with <op> picked at random
    from one_group; otherwise feature_string is returned as is.
    """
    if np.random.uniform(0, 1) < prob:
        return ''.join(['g(', random.choice(one_group), ',', feature_string, ')'])
    return feature_string

In [6]:
# Small positive constant: keeps denominators and sqrt arguments away from
# zero, and is the threshold below which a standard deviation is treated as
# "no spread".
min_number = 0.01


def _standardize(x):
    """Return (x - mean) / std, or None when std(x) is below min_number."""
    spread = np.std(x)
    if spread < min_number:
        return None
    return (x - np.mean(x)) / spread


# Unary operators
def log(x):
    """Sign-preserving log: sign(x) * log2(|x| + 1)."""
    return np.sign(x)*np.log2(np.abs(x)+1)


def sqrt(x):
    """Square root of x shifted so that its minimum maps to min_number."""
    return np.sqrt(x-np.min(x)+min_number)


def pow2(x):
    """Element-wise square."""
    return x**2


def pow3(x):
    """Element-wise cube."""
    return x**3


def inv(x):
    """Sign-preserving reciprocal: sign(x) / (|x| + min_number)."""
    return 1*np.sign(x)/(np.abs(x)+min_number)


def sigmoid(x):
    """Logistic function of standardized x; x is returned unchanged if std(x) < min_number."""
    z = _standardize(x)
    if z is None:
        return x
    return 1.0 / (1.0 + np.exp(-z))


def tanh(x):
    """Hyperbolic tangent of standardized x; x is returned unchanged if std(x) < min_number."""
    z = _standardize(x)
    if z is None:
        return x
    # np.tanh avoids the overflow of the explicit exp(z) / exp(-z) quotient
    return np.tanh(z)


def relu(x):
    """max(z, 0) of standardized x as an ndarray; x is returned unchanged if std(x) < min_number."""
    z = _standardize(x)
    if z is None:
        return x
    # Vectorized equivalent of "e if e > 0 else 0" for every element
    return np.where(z > 0, z, 0)


def binary(x):
    """1 where standardized x is positive, else 0, as an ndarray; x is returned unchanged if std(x) < min_number."""
    z = _standardize(x)
    if z is None:
        return x
    # Vectorized equivalent of "1 if e > 0 else 0" for every element
    return np.where(z > 0, 1, 0)


# Binary operators
def add(x,y):
    """Element-wise sum."""
    return x + y


def sub(x,y):
    """Element-wise difference."""
    return x - y


def times(x,y):
    """Element-wise product."""
    return x * y


def div(x,y):
    """Sign-preserving protected division: x * sign(y) / (|y| + min_number)."""
    return x*np.sign(y)/(np.abs(y)+min_number)


# Operator names available to the expression generators, grouped by arity
two_group = ['add', 'sub', 'times', 'div']
one_group = ['log', 'sqrt', 'pow2', 'pow3', 'inv', 'sigmoid', 'tanh', 'relu', 'binary']

In [7]:
def g(f, a, b=None):
    """
    Apply an operator to one or two operands.

    f: a unary or binary operator function
    a: the first operand
    b: the second operand of a binary operator; leave as None when f is unary
    """
    operands = (a,) if b is None else (a, b)
    return f(*operands)

使用gen_individuals得到随机产生的种群数据和特征表达式

In [8]:
import pandas as pd
import numpy as np
import random

# Load the iris data set, build 5 individuals of 4 genes each from its first
# four columns, and show the first two rows of every individual.
iris = pd.read_csv("http://image.cador.cn/data/iris.csv")
gen_out = gen_individuals(k=5, gen_num=4, input_data=iris, featureIdx=[1,2,3,4])
for frame in gen_out['df']:
    print("____________________________________________")
    print(frame.head(2))

____________________________________________
    g1         g2         g3        g4
0  1.2 -46.499130 -60.126805  5.528692
1  1.2  17.889969 -60.479861  6.373917
____________________________________________
         g1        g2   g3        g4
0  0.020907  1.032495  0.0 -4.758767
1  0.020907  0.131816  0.0 -4.550616
____________________________________________
    g1        g2   g3        g4
0 -0.4  1.946099  6.5  6.019048
1 -1.4  1.946099  6.3  6.019048
____________________________________________
         g1        g2        g3   g4
0 -2.804965  1.202852 -0.000304  2.8
1 -2.375887  0.980971 -0.000000  2.8
____________________________________________
         g1         g2        g3        g4
0  0.894949  43.589623  0.809632  0.934779
1  0.729297  27.714623  4.277071  0.667053
