In [163]:
"""
BP算法在小麦种子分类中的应用
"""

from random import seed
from random import randrange
from random import random
from math import exp
import pandas as pd
import numpy as np

In [164]:
# #预处理
# def data_fix(filename):
#     dataset = []
#     with open(filename, 'r') as file:
#         csv_reader = reader(file)
#         for row in csv_reader:
#             print(row)
#             if not row:  # 判定是否有空行，如有，则跳入到下一行
#                 continue
#             dataset.append(row)
#     print(dataset)
#     #print(type(csv_reader))
#     from sklearn.preprocessing import LabelEncoder
#     labelencoder = LabelEncoder()
#     dataset[:, -1] = labelencoder.fit_transform(dataset[:, -1])
#     dataset.Class.value_counts()
#     print(csv_reader)

In [165]:
# data_fix('./gandou.csv')

In [166]:

# 数据读取
class Database():
    def __init__(self, db_file):
        self.filename = db_file
        self.dataset = list()
        
    # 导入文件
    def load_xls(self):
        # with open(filename, 'r') as file:
        #     csv_reader = reader(file)
        #     for row in csv_reader:
        #         if not row:  # 判定是否有空行，如有，则跳入到下一行
        #             continue
        #         self.dataset.append(row)
        # print(self.dataset[0])
        self.dataset = pd.read_excel(io = self.filename,sheet_name= 'Pistachio_28_Features_Dataset')

    # 将n-1列的属性字符串列转换为浮点数，第n列为分类的类别
    def dataset_to_float(self):
        # col_len = len(self.dataset[0]) - 1
        # for row in self.dataset:
        #     for column in range(col_len):
        #         row[column] = float(row[column].strip())
        Pistachio_columns = self.dataset.columns
        for i in range(len(self.dataset.columns)-1):
            self.dataset= self.dataset.astype({'%s'%Pistachio_columns[i] : 'float64'})
        

    # 将最后一列（n）的类别，转换为整型，并提取有多少个类（最后一列就是实际类型）
    def str_class_to_int(self):
        # class_values = [row[column] for row in self.dataset]  # 读取指定列的数字
        # unique = set(class_values)  # 用集合来合并类
        # lookup = dict()
        # for i, value in enumerate(unique):
        #     lookup[value] = i
        # for row in self.dataset:
        #     row[column] = lookup[row[column]]
        Class_list = []
        for i in self.dataset.index:
            if(self.dataset['Class'][i] not in Class_list):
                Class_list.append(self.dataset['Class'][i])
        for i in self.dataset.index:
            for j in range(len(Class_list)):
                if self.dataset.loc[i , 'Class'] == Class_list[j]:
                    self.dataset.loc[i , 'Class'] = int(j)
        self.dataset= self.dataset.astype({'Class' : 'int'})

    # 找到每一列（属性）的最小和最大值
    def dataset_minmax(self):
        self.minmax = list()
        self.minmax = [[min(column), max(column)] for column in zip(*self.dataset)]

    # 将数据集合中的每个（列）属性都规整化到0-1
    def normalize_dataset(self):
        # self.dataset_minmax()
        # for row in self.dataset:
        #     for i in range(len(row)-1):
        #         row[i] = (row[i] - self.minmax[i][0]) / (self.minmax[i][1] - self.minmax[i][0])
        self.dataset = (self.dataset-self.dataset.min())/(self.dataset.max()-self.dataset.min())

    def get_dataset(self):
        # 构建训练数据
        self.load_xls()
        self.dataset_to_float()
        self.str_class_to_int()
        self.normalize_dataset()
        self.dataset = self.dataset.values.tolist()
        print(self.dataset[0])
        return self.dataset

In [167]:

# BP网络训练
class BP_Network():
    # 初始化神经网络
    def __init__(self, n_inputs, n_hidden, n_outputs):
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs
        self.network = list()
        hidden_layer = [{'weights': [random() for i in range(self.n_inputs + 1)]} for i in range(self.n_hidden)]
        self.network.append(hidden_layer)
        output_layer = [{'weights': [random() for i in range(self.n_hidden + 1)]} for i in range(self.n_outputs)]
        self.network.append(output_layer)

    # 计算神经元的激活值（加权之和）
    def activate(self, weights, inputs):
        activation = weights[-1]
        for i in range(len(weights)-1):
            activation += weights[i] * inputs[i]
        return activation

    # 定义激活函数
    def transfer(self, activation):
        return 1.0 / (1.0 + exp(-activation))

    # 计算神经网络的正向传播
    def forward_propagate(self, row):
        inputs = row
        for layer in self.network:
            new_inputs = []
            for neuron in layer:
                activation = self.activate(neuron['weights'], inputs)
                neuron['output'] = self.transfer(activation)
                new_inputs.append(neuron['output'])
            inputs = new_inputs
        return inputs

    # 计算激活函数的导数
    def transfer_derivative(self, output):
        return output * (1.0 - output)

    # 反向传播误差信息，并将纠偏责任存储在神经元中
    def backward_propagate_error(self, expected):
        for i in reversed(range(len(self.network))):
            layer = self.network[i]
            errors = list()
            if i != len(self.network)-1:
                for j in range(len(layer)):
                    error = 0.0
                    for neuron in self.network[i + 1]:
                        error += (neuron['weights'][j] * neuron['responsibility'])
                    errors.append(error)
            else:
                for j in range(len(layer)):
                    neuron = layer[j]
                    errors.append(expected[j] - neuron['output'])
            for j in range(len(layer)):
                neuron = layer[j]
                neuron['responsibility'] = errors[j] * self.transfer_derivative(neuron['output'])

    # 根据误差，更新网络权重
    def _update_weights(self, row):
        for i in range(len(self.network)):
            inputs = row[:-1]
            if i != 0:
                inputs = [neuron['output'] for neuron in self.network[i - 1]]
            for neuron in self.network[i]:
                for j in range(len(inputs)):
                    neuron['weights'][j] += self.l_rate * neuron['responsibility'] * inputs[j]
                neuron['weights'][-1] += self.l_rate * neuron['responsibility']

    # 根据指定的训练周期训练网络
    def train_network(self, train):
        for epoch in range(self.n_epoch):
            sum_error = 0
            for row in train:
                outputs = self.forward_propagate(row)
                expected = [0 for i in range(self.n_outputs)]
                
                expected[int(row[-1])] = 1
                sum_error += sum([(expected[i]-outputs[i])**2 for i in range(len(expected))])
                self.backward_propagate_error(expected)
                self._update_weights(row)
            print('>周期=%d, 误差=%.3f' % (epoch, sum_error))

    # 利用训练好的网络，预测“新”数据
    def predict(self, row):
        outputs = self.forward_propagate(row)
        return outputs.index(max(outputs))

    # 利用随机梯度递减策略，训练网络
    def back_propagation(self, train, test):
        self.train_network(train)
        predictions = list()
        for row in test:
            prediction = self.predict(row)
            predictions.append(prediction)
        return(predictions)

    # 将数据库分割为 n_folds等份
    def cross_validation_split(self, n_folds):
        dataset_split = list()  # 含有划分成n_folds等份的列表的列表
        dataset_copy = list(self.dataset)
        # dataset_copy = self.dataset.values.tolist()
        fold_size = int(len(self.dataset) / n_folds)
        # print(dataset_copy)
        for i in range(n_folds):
            fold = list()
            while len(fold) < fold_size:
                # 输出dataset_copy长度内的随机数
                index = randrange(len(dataset_copy))
                # print('index',index)
                fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
        return dataset_split

    # 用预测正确百分比来衡量正确率
    def accuracy_metric(self, actual, predicted):
        correct = 0
        for i in range(len(actual)):
            if actual[i] == predicted[i]:
                correct += 1
        return correct / float(len(actual)) * 100.0

    # 用每一个交叉分割的块（训练集合，试集合）来评估BP算法
    def evaluate_algorithm(self, dataset, n_folds, l_rate, n_epoch):
        self.l_rate = l_rate
        self.n_epoch = n_epoch
        self.dataset = dataset
        folds = self.cross_validation_split(n_folds)
        scores = list()
        for fold in folds:  # 数据一等份一等份来
            train_set = list(folds)
            train_set.remove(fold)  # 移出这等份
            # sum函数的本意是求和，但在一些特殊的场景下，它还可以完成连接可迭代对象的功能 。
            # 完成数据的合并
            train_set = sum(train_set, [])  # 剩下的几等份合并
            test_set = list()
            for row in fold:  # 开始的一等份生成测试集合，并把最后的实际结果设置为None
                row_copy = list(row)
                test_set.append(row_copy)  # 是添加的本身
                row_copy[-1] = None  # 改变了本身的最后一个元素
            predicted = self.back_propagation(train_set, test_set)  # 通过训练集合训练后的神经网络，再通过测试集合得到预测值
            actual = [row[-1] for row in fold]  # 这是真实值
            accuracy = self.accuracy_metric(actual, predicted)  # 通过真实值和预测值评估神经网络
            scores.append(accuracy)
        return scores

In [168]:
if __name__ == '__main__':
    # 设置随机种子
    seed(1)
    # 构建训练数据
    filename = './Pistachio_Dataset/Pistachio_28_Features_Dataset/Pistachio_28_Features_Dataset.xls'
    #filename = './housing.csv'
    db = Database(filename)
    dataset = db.get_dataset()
    # 设置网络初始化参数
    n_inputs = len(dataset[0]) - 1
    n_hidden = 8  #15
    n_outputs = len(set([row[-1] for row in dataset]))
    BP = BP_Network(n_inputs, n_hidden, n_outputs)
    l_rate = 0.1 
    n_folds = 8 #8
    n_epoch = 2000
    scores = BP.evaluate_algorithm(dataset, n_folds, l_rate, n_epoch)
    print('评估算法正交验证得分: %s' % scores)
    print('平均准确率: %.3f%%' % (sum(scores)/float(len(scores))))

[0.35650743099787685, 0.37435925744381204, 0.315831579593325, 0.41371302394639664, 0.6579007027884834, 0.44081731398256657, 0.6841070989928766, 0.3725817881810393, 0.5396744659206509, 0.2543973434338193, 0.29972439136426277, 0.626524010947997, 0.2417582417582417, 0.44827586206896564, 0.5571691176470588, 0.6682514527205493, 0.3887740791184048, 0.22654734314713662, 0.21004930106752853, 0.3514821626324949, 0.352162816932665, 0.3148687399990494, 0.6293652883855676, 0.5929056642320853, 0.7398052705501464, 0.18079189839794138, 0.15840198443402664, 0.14300697205186091, 0.0]
>周期=0, 误差=1017.418
>周期=1, 误差=941.876
>周期=2, 误差=941.806
>周期=3, 误差=941.707
>周期=4, 误差=941.557
>周期=5, 误差=941.301
>周期=6, 误差=940.756
>周期=7, 误差=938.808
>周期=8, 误差=894.811
>周期=9, 误差=648.315
>周期=10, 误差=449.677
>周期=11, 误差=387.331
>周期=12, 误差=362.988
>周期=13, 误差=350.027
>周期=14, 误差=341.652
>周期=15, 误差=335.516
>周期=16, 误差=330.626
>周期=17, 误差=326.507
>周期=18, 误差=322.907
>周期=19, 误差=319.679
>周期=20, 误差=316.737
>周期=21, 误差=314.021
>周期=22, 误差=311.49