In [1]:
import pandas as pd
import numpy as np
import os

def getCsvDf(readFunc=None):
    def read_csv(path):
        if readFunc:
            return readFunc(path)
        else:
            return pd.read_csv(path, index_col=0)
    
    class CsvDf:
        """
        创建一个存储于csv的表格， 封装了表格的读取， 删除， 存档， 并且支持对表格的指定行， 进行md5码存储后， 作为index， 方便后续查询
        """

        def __init__(self, fileName, path="."):
            self.path = path
            self.fileName = fileName
            self.df = None

        def get_df(self):
            if self.df is None:
                self.df = self.read()

            if self.df is None:
                self.df = pd.DataFrame()

            return self.df

        def read(self):
            if os.path.exists(os.path.join(self.path, self.fileName)):
#                 print("read from csv", os.path.join(self.path, self.fileName))
                return readFunc(os.path.join(self.path, self.fileName))
            else:
                return None

        def save(self, new_history):
#             print("保存", os.path.join(self.path, self.fileName))
            new_history.to_csv(os.path.join(self.path, self.fileName))
            self.df = new_history
            return self.df

        def clear(self):
            if os.path.exists(os.path.join(self.path, self.fileName)):
                os.remove(os.path.join(self.path, self.fileName))
            self.df = pd.DataFrame()
        
    return CsvDf

In [2]:
#KDense
from keras.layers import Dense, Activation, Input, BatchNormalization
from keras.models import Model


def direct_pip(x):
    return x


act_table = {
    0: None,
    1: 'relu',
    2: 'sigmoid',
    3: 'tanh',
    4: 'softmax'
}

batch_norm_table = {
    0: None,
    1: BatchNormalization
}

def make_dense(inputs, layer, act, norm, _debug_layer=False):
    if _debug_layer:
        print("make layer from {} {} {}".format(layer, act, norm))
    layer = int(layer)
    act = int(act)
    norm = int(norm)
    
    if layer <= 0:
        return inputs
    dense = Dense(layer)(inputs)
    if _debug_layer:
        print(dense)
    norm = get_norm(norm)(dense)
    if _debug_layer:
        print(norm)
    act = get_act(act)(norm)
    if _debug_layer:
        print(act)
    return act


def get_norm(code):
    norm = batch_norm_table[code]
    if norm is None:
        return direct_pip
    else:
        return norm()


def get_act(code):
    act = act_table[code]
    if act is None:
        return direct_pip
    else:
        return Activation(act)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from DenseDNAs import DenseDNAs

In [4]:
import numpy as np
import pandas as pd

from keras.models import Model #泛型模型  
from keras.layers import Dense, Input  
import matplotlib.pyplot as plt  

from DNA import IDNA

inputs = Input(shape=(1,))

x = np.linspace(0, 1, 256)
y = x ** 2 + x * 0.11


In [5]:
# %load NNDNAs.py
from DNA import IDNA, match
import numpy as np
import pandas as pd

from DenseDNAs import DenseDNAs

def _read_rna_history(path):
    pf = pd.read_csv(path, index_col=0, header=[0, 1])
    return pf.astype(np.float64) #转换成64是为了后续和nan合并的时候， 数据类型一致

def NNDNAs(model_maker, train_model, fitness_params):
    class _NNDNAs(IDNA):
        """
        构成普通的NN， 或者DNN的网络结构
        其核心提供了对DNA进行操作的方法， 和一些基因算法中的超参数的设置和修改
        """
        def __init__(self, target_name, max_layer_n,n_mean_stds=None, denseDNAs=None):
            self.denseDNAs = denseDNAs if denseDNAs else [DenseDNAs(model_maker)() for _ in range(max_layer_n)]
            self.max_layer_n = max_layer_n
            self.n_mean_stds = n_mean_stds if n_mean_stds else [(10, 2) for _ in range(self.max_layer_n)]
            self.target_name = target_name
            self.rna_csv = getCsvDf(_read_rna_history)("{}_rna_fitness.csv".format(target_name))

        def create(self, max_pop_n=20):
            """
            :param n_mean_stds: 每一层的初始化的（均值， 方差)
            :return:返回所有的DNA的pandas列表， 每一行一个DNA， 对应一个层
            """
            dnas_list = [self.denseDNAs[i].create(num=max_pop_n, n_mean_std=self.n_mean_stds[i]) for i in
                         range(self.max_layer_n)]

            return pd.concat(dnas_list, axis=1, keys=self.get_dna_columns())

        def evolve(self, dnas, fitness):
            """
            优胜劣汰， 变异， 包括了有性生殖"
            :param dnas:  重要的是， dnas是一个下层dnas的列表， 列表的元素是某个结构的dnas
            :param fitness:
            :return:
            """
            # fitness转换成排名计分
            fitness = fitness.value #TODO: 目前只是使用一个评判标准， 就是原始的-1 * loss
            fitness = fitness.rank()
            fitness += 1  # 避免排位很低的， 一点机会都没有
            #TODO 现在排位的方式， 不利于在复杂的情况下寻找最优解， 因为当有20个结果差不多的fitness出现的时候，
            # 他们结构的差异化较大， 理想结果是， 让他们都产生变异， 而不是只让头部的几个结果生存下来。
            dnas['fitness'] = fitness

            # 优胜劣汰
            surviors = dnas.sample(frac=0.2, weights=dnas.fitness)

            # 有性生殖
            father = dnas.sample(frac=0.3, weights=dnas.fitness)
            mother = dnas.sample(frac=0.4, weights=dnas.fitness)
            father, mother = match(father, mother, childsize=int(len(fitness) * 0.8), father_weight=father.fitness,
                                   mother_weight=mother.fitness)

            children = self.sex_propagation(father, mother)
            dnas = surviors.append(children).reset_index(drop=True)
            
            #变异
            dnas = self.mutate(dnas)
            
            #保存到磁盘
            self.save_dnas(dnas)
            return dnas
        
        def save_dnas(self, dnas):
            dnas.to_csv("{}_dnas.csv".format(self.target_name))

        def load_dnas(self):
            return pd.read_csv("{}_dnas.csv".format(self.target_name), index_col=0,  header=[0, 1])
        
        def mutate(self, dnas):
            """
            变异
            :param dnas:
            :return: 变异后的dnas列表
            """
            dnas = [self.denseDNAs[i].mutate(dnas.loc[:, "layer{}".format(i)].copy()) for i in range(self.max_layer_n)]
            return pd.concat(dnas, axis=1, keys=self.get_dna_columns())

        def to_RNA(self, dna):
            def nans(shape, dtype=float):
                a = np.empty(shape, dtype)
                a.fill(np.nan)
                return a

            """翻译成RNA， RNA是可以表达， 并且也可以被保存的"""
            rna = [self.denseDNAs[i].to_RNA(dna["layer{}".format(i)]) for i in range(self.max_layer_n)]
            rna = pd.concat(rna, axis=1, keys=self.get_dna_columns())

            # 压缩rna, 避免因为0 10 10和10 0 10 而出现的两次重复的计算, 实际压缩就是把类似 [nan, 1, nan, 2, 3] 变成 [1, 2, 3, nan, nan]
            _nans = nans(rna.shape[1])
            rna = rna.apply(lambda row: np.hstack([row.dropna().values, _nans])[:rna.shape[1]], axis=1)
            return rna

        def to_model(self, rna_row, inputs):
            model = inputs
            for i in range(self.max_layer_n):
                model = self.denseDNAs[i].to_dense(rna_row["layer{}".format(i)], model)
            return model
        
        def do_cal_fitness(self, row, inputs):
            row = row.fillna(0)
            model = self.to_model(row, inputs)
            fitness = train_model(model)
            for key in fitness:
#                 print(key, fitness[key])
                row["fitness", key] = fitness[key]
            return row
    
        def update_history(self, update_method):
            rna_fitness_history = self.rna_csv.get_df()
            rna_fitness_history = update_method(rna_fitness_history)
            return self.rna_csv.save(rna_fitness_history)
        
        def get_fitness(self, df_rna, inputs):
            """
            如果有历史数据， 就从历史数据中获取， 如果没有， 就计算结果， 并把结果合并到历史数据中
            """
            df_rna = pd.concat([df_rna, pd.DataFrame(index=df_rna.index,
                                                     columns=pd.MultiIndex.from_product([["fitness"], fitness_params]))], axis=1)
            df_rna = df_rna.astype(np.float64)
            
            # 获取历史
            rna_fitness_history = self.rna_csv.get_df()
            if rna_fitness_history.empty:
                rna_fitness_history = pd.DataFrame(columns=df_rna.columns)
            
            # 如果有历史数据， 就从历史数据中merge结果， 如果没有， 再计算
            df_rna = pd.merge(df_rna.iloc[:, 0:-len(df_rna.fitness.columns)], 
                              rna_fitness_history, how='left', on=df_rna.columns[0:-len(df_rna.fitness.columns)].tolist())
            # 显式的指定df_rna值的类型， 不然的话， 和带有nan的组合并的时候， 结果会失败
            df_rna = df_rna.astype(np.float64)
            
            # 需要计算的数据
            df_rna_cal = df_rna[np.isnan(df_rna.fitness.value)].drop_duplicates()
            
            # 一行一行的计算结果
            # 应该有个单独保存计算结果的地方， 因为这个可能也是非常耗费时间的， 一次计算非常耗费时间
            def save_row(row):
                rna_fitness_history = rna_fitness_history.append(row, ignore_index=True)
                rna_fitness_history = self.rna_csv.save(rna_fitness_history)
            
            df_rna_result = pd.DataFrame(columns=df_rna_cal.columns)
            for _, row in df_rna_cal.iterrows():
                row = self.do_cal_fitness(row, inputs)
                df_rna_result = df_rna_result.append(row)
                rna_fitness_history = rna_fitness_history.append(row, ignore_index=True)
                rna_fitness_history = self.rna_csv.save(rna_fitness_history)
                
            df_rna_cal = df_rna_result
#             print("get_fitness>df_rna_cal------------------\n", df_rna_cal)
            # 把结果数据合并回到df_rna
#             print("get_fitness>df_rna------------------\n", df_rna)
            df_rna = pd.merge(df_rna,
                              df_rna_cal,
                              how='left',
                              on=df_rna.columns[0:-len(df_rna.fitness.columns)].tolist()
                             ).fillna(0)
#             print("get_fitness>df_rna------------------\n", df_rna)
            merge_fitness = df_rna.fitness_x + df_rna.fitness_y
#             print("get_fitness>merge_fitness------------------\n", merge_fitness)
            merge_fitness = pd.DataFrame(merge_fitness.values, 
                                         index=df_rna.index, 
                                         columns=pd.MultiIndex.from_product([["fitness"], merge_fitness.columns.tolist()])
                                        )
#             print("get_fitness>merge_fitness------------------\n", merge_fitness)
            df_rna = pd.concat([df_rna.iloc[:, :-2*len(rna_fitness_history.fitness.columns)], merge_fitness],
                               axis=1)
#             print("get_fitness>df_rna------------------\n", df_rna)
            return df_rna.fitness

        def sex_propagation(self, father, mother):
            # 用denseDNA对每个Dense进行sex_propagation
            children = [
                self.denseDNAs[i].sex_propagation(father.loc[:, "layer{}".format(i)],
                                                  mother.loc[:, "layer{}".format(i)])
                for i in range(self.max_layer_n)]
            return pd.concat(children, axis=1, keys=self.get_dna_columns())

        def get_dna_columns(self):
            return ["layer{}".format(i) for i in range(self.max_layer_n)]

    return _NNDNAs

# if __name__ == '__main__':
#     nnDNAs = NNDNAs(print)(5)
#     dnas = nnDNAs.create()
#     print(dnas)
#     # dnas = nnDNAs.evolve(dnas, fitness)
#     for i in range(5):
#         dnas = nnDNAs.evolve(dnas, nnDNAs.get_fitness(nnDNAs.to_RNA(dnas)))


In [None]:
def train_model(model):
    ouput = Dense(1)(model)
    model = Model(inputs=inputs, outputs=ouput) #暂时使用这个来计算结果
    model.compile(loss='mse', optimizer='adam') # TODO: 这个应该根据最后一个激活函数的类型来进行选择， loss
    params = model.count_params()
    cost = cal_net_cost(params)
    train_history = model.fit(x, y, epochs=50, batch_size=64, verbose=0)
    #这个取值也过于粗糙，其实应该考虑收敛速度， 收敛的平滑程度等等的， 还包括了， 可能出现的初始参数导致不收敛的情况
    loss = train_history.history['loss'] # loss > 0, 归一化处理后， 一般小于1
    min_loss = np.min(np.array(loss))
    #映射到0~1之间， 越小， fitness越逼近1， 变化较大； 越大越逼近0， 变化减小
    fitness = -2 / (1 + (np.e ** (-min_loss))) + 2
    fitness = fitness * (1 - 0.1 * cost) # 使用fitness的10之一作为cost的代价， 当loss差不多的时候， cost就会发挥作用
    return {"value": fitness, "loss": min_loss, "params": params}

def _cal_fitness(loss, params):
#     print("_cal_fitness", loss, params)
    cost = cal_net_cost(params)
    fitness = -2 / (1 + (np.e ** (-loss))) + 2
    fitness = fitness - 0.1 * cost
#     print("_cal_fitness", fitness)
    return fitness

def cal_rna_fitness(rna_history):
    # 或许， 应该用矩阵的方式来操作， 但是这样， 计算fitness的代码就变成了两块， 考虑到不是经常调整参数， 
    # 就暂时先不弄这个, 或者values就该最后一起计算， 而不是分开计算
    _rna_h = rna_history.copy()
    values = _rna_h.fitness.apply(lambda fitness: _cal_fitness(fitness.loss, fitness.params), axis=1)
    _rna_h["fitness", "value"] = values
    return _rna_h

def cal_net_cost(params):
    """
    计算网络的复杂程度， 如果网络越复杂， 就应该给予一些惩罚， 而越简单的网络，得到更好的奖励
    """
    # TODO: 现在是最简单的方式, 先简单的计算参数的多少， 而且只是实用与DNN, 有个好处是， 结果可以复现， 而不会因为机器当时的训练数据收到变化
    # 映射到 0~1之间， 1000为超参数， 当params， 位于0~4k的时候， 值变化较线性， 超过4k，变化会减缓
    cost =  2 / (1 + (np.e ** (-params / 1000))) - 1
    return cost


# 这个score不能和fitness聪明， 不然， fitness.fitness合并的时候， 下层的fitness也会被加上_x, _y
# 这个算是pandas的bug吧
nnDNAs = NNDNAs(make_dense, train_model, ["value", "loss", "params"])("linear_test", 3)
dnas = nnDNAs.create(10)
# dnas = nnDNAs.load_dnas()
nnDNAs.update_history(cal_rna_fitness)
print("update over")
for i in range(20):
    fitness = nnDNAs.get_fitness(nnDNAs.to_RNA(dnas), inputs)
    #输出必要的fitness信息， 以供参考
    print("{}th: max={} top 3 mean:{}".format(i, fitness.value.max(), np.mean(fitness.value.sort_values(ascending=False)[:3])))
    dnas = nnDNAs.evolve(dnas, fitness)


update over
0th: max=0.8999877547562731 top 3 mean:0.8999792794219131
1th: max=0.8999732397783113 top 3 mean:0.8999417420591126
2th: max=0.9025602396177899 top 3 mean:0.9008483528569565
3th: max=0.9231218897475812 top 3 mean:0.913090612522272
4th: max=0.9592016492891315 top 3 mean:0.9388815559447409
5th: max=0.906757604458172 top 3 mean:0.9041358752360441
6th: max=0.9176515157731023 top 3 mean:0.9110941469875442
7th: max=0.954031674220027 top 3 mean:0.9262762766874143
8th: max=0.9497629608614921 top 3 mean:0.6157095734349377
9th: max=0.9004457773400815 top 3 mean:0.6001490485370141
10th: max=0.8999853650867135 top 3 mean:0.8997492569330046


In [None]:
cat ./linear_test_rna_fitness.csv

In [None]:
1/0

In [None]:
!rm -f ./linear_test_rna_fitness.csv

In [None]:
import numpy as np
import pandas as pd

from keras.datasets import mnist  
from keras.models import Model #泛型模型  
from keras.layers import Dense, Input  
import matplotlib.pyplot as plt

#制造数据
x = np.linspace(0, 1, 256)
y = x ** 2 + x * 0.11

#简单的网络
inputs = Input(shape=(1,))
layer = Dense(32)(inputs)
# layer = Dense(32, activation='relu')(layer)
# layer = Dense(32, activation='relu')(layer)
layer = Dense(1, activation='relu')(layer)

model = Model(inputs=inputs, outputs=layer)
model.compile(loss='mse', optimizer='adam')


In [None]:
train_history = model.fit(x, y, batch_size=32, epochs=10, shuffle=True)

In [None]:
predict = model.predict(x)

plt.plot(x, y)
plt.plot(x, predict)
plt.show()

In [None]:
plt.plot(train_history.history['loss'])
plt.show()
# 我们想法把结果

In [None]:
model.count_params()

In [None]:
x

# 优化代码， 让代码更具有普适性
# 优化rank的算法， 避免一个网络通吃的情况
# 加上网络结构和网络速度的loss计算
# 还有很多事情要做， 首先第一步就是， 随机搜寻数据， 当数据的计算速度较慢的时候， 整个算法的寻址速度太慢了， 是不是应该引入更多的规则或者什么trick
# 需要做的事情
1. 随机搜寻最优解， 当计算一次很慢的时候， 是否应该引入更多的trick，或者人工干预
2. 加入更多的监控， 能让我们知道当前网络的状态
3. 加入更多的人工干预， 并且及时生效的机制， 让我们知道， 网络是否可用
4. 针对已有的网络， 对网络中已经比较好的结果， 进行细调整的方式， 比如我固住前后几层的参数， 看是否能够， 把其中部分参数给去掉， 类似于减枝的操作， 让我们的网络更加快速