# 测试数据预处理+模型预测 

### 引入相关库文件

In [1]:
import numpy as np
import os
import pandas as pd  # 读取csv储存为字典
from collections import defaultdict
from sklearn import preprocessing  # 0-1编码
from keras.models import load_model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### 首先定义数据预处理部分的函数

####   读取数据

In [2]:
    def capture(original_path, names=None):
        """Read the ``DE_time`` column of CSV files found in a folder.

        :param original_path: directory that holds the CSV files
        :param names: optional iterable of file names (relative to
            ``original_path``) to read; defaults to every entry returned by
            ``os.listdir(original_path)``
        :return: dict mapping each file name to the values of its
            ``DE_time`` column (pandas ``.values``)
        """
        # List the directory here instead of depending on a module-level
        # ``filenames`` variable that only exists once a later cell has run.
        if names is None:
            names = os.listdir(original_path)

        files = {}
        for name in names:
            file_path = os.path.join(original_path, name)
            # pandas is handed an open file object rather than the path,
            # exactly as the original code did.
            with open(file_path) as f:
                frame = pd.read_csv(f)
            files[name] = frame.loc[:, 'DE_time'].values
        return files

#### 将测试集窗口化，因为是测试集，故而设置slice_rate为0，即只对数据进行加窗处理，而不进行划分。

In [3]:
def slice_enc(data, slice_rate=0, *, length=1024, enc=True, enc_step=14):
    """Cut every signal in ``data`` into fixed-length windows.

    For each signal, ``len(signal) // enc_step`` windows are produced in
    total. A fraction ``1 - slice_rate`` of them goes to the first returned
    dict and is taken from the leading ``1 - slice_rate`` part of the signal;
    the remaining windows go to the second dict and start in the trailing
    part. With the default ``slice_rate=0`` the data is only windowed, not
    split: the second dict then holds an empty list per signal.

    Window positions are drawn with ``np.random.randint``; seed NumPy's
    global generator for reproducible output.

    :param data: dict mapping a key (file name) to a 1-D signal
    :param slice_rate: fraction of windows (and of the signal) reserved for
        the second, held-out dict
    :param length: number of points per window
    :param enc: if true, each random start position yields a run of up to
        ``length // enc_step`` overlapping windows shifted by ``enc_step``;
        if false, every window gets its own random start
    :param enc_step: shift between consecutive overlapping windows; also the
        divisor that determines how many windows are produced per signal
    :return: ``(Train_Samples, Test_Samples)``, two dicts mapping each key to
        a list of windows
    :raises ValueError: if ``length``/``enc_step`` are invalid or a signal is
        too short to place the requested windows
    """
    if length < 1 or enc_step < 1:
        raise ValueError("length and enc_step must be positive integers")
    if enc and enc_step > length:
        raise ValueError("enc_step must not exceed length when enc is enabled")

    Train_Samples = {}
    Test_Samples = {}
    for key, series in data.items():
        total_len = len(series)
        # Originally a literal 14, i.e. the same value as enc_step.
        number = total_len // enc_step
        end_index = int(total_len * (1 - slice_rate))
        samp_train = int(number * (1 - slice_rate))
        samp_test = number - samp_train
        train_windows = []
        test_windows = []

        if samp_train > 0:
            # With overlap enabled a start position is shifted by up to
            # `length` points before the window is cut, so twice the window
            # length must fit in front of end_index.
            high = end_index - (2 * length if enc else length)
            if high <= 0:
                raise ValueError(
                    "signal %r is too short (%d points) for windows of length %d"
                    % (key, total_len, length)
                )
            if enc:
                shifts = length // enc_step
                while len(train_windows) < samp_train:
                    start = np.random.randint(low=0, high=high)
                    for _ in range(shifts):
                        start += enc_step
                        train_windows.append(series[start:start + length])
                        if len(train_windows) == samp_train:
                            break
            else:
                for _ in range(samp_train):
                    start = np.random.randint(low=0, high=high)
                    train_windows.append(series[start:start + length])

        if samp_test > 0:
            high = total_len - length
            if high <= end_index:
                raise ValueError(
                    "held-out part of signal %r is too short for windows of length %d"
                    % (key, length)
                )
            for _ in range(samp_test):
                start = np.random.randint(low=end_index, high=high)
                test_windows.append(series[start:start + length])

        Train_Samples[key] = train_windows
        Test_Samples[key] = test_windows
    return Train_Samples, Test_Samples

#### 将数据进行标准化（减去均值、除以标准差），加快处理速度，且可以使得ReLU激活函数变得更有效

In [4]:
    def scalar_stand(Train_X, Test_X):
        """Standardize both sets with statistics fitted on the training set.

        :param Train_X: training samples; the scaler is fitted on these
        :param Test_X: test samples, transformed with the same fitted scaler
        :return: tuple ``(Train_X, Test_X)`` of the transformed sets
        """
        fitted = preprocessing.StandardScaler()
        fitted.fit(Train_X)
        # Training set first, then test set, both through the same scaler.
        return fitted.transform(Train_X), fitted.transform(Test_X)

#### 以下代码是在训练模型的时候用来打标签，此处是为了得到数据对应的文件名。

In [5]:
    # Sampling is already done at this point; here each sample gets its tag.
    def add_labels(train_test, names=None):
        """Flatten per-file sample lists and tag every sample with its file name.

        :param train_test: dict mapping a file name to its list of samples
        :param names: optional iterable of keys of ``train_test`` selecting
            which files to include and in which order; defaults to the keys
            of ``train_test`` in iteration order
        :return: ``(X, Y)`` where ``X`` is the concatenation of the sample
            lists and ``Y[k]`` is the file name that ``X[k]`` came from
        """
        # Iterate the mapping's own keys instead of depending on a
        # module-level ``filenames`` variable.
        if names is None:
            names = list(train_test)

        X = []
        Y = []
        # The numeric class id that used to be derived from the file name
        # was never used for the result, so it is no longer computed.
        for name in names:
            samples = train_test[name]
            X += samples
            Y += [name] * len(samples)
        return X, Y

## 下面是需要根据本地环境进行修改的地方

In [6]:
# Full path of the trained model file -- change this for your machine.
params = {'model': r'C:\Users\86132\Desktop\cwru.model'}

# Full path of the folder that holds the 142 test files -- change this for
# your machine.
original_path = r'C:\Users\86132\Desktop\移动网课设ipython\data'


### 进行数据预处理

In [7]:
# The order of ``filenames`` determines the order of the samples and
# therefore of the rows written out later.
filenames = os.listdir(original_path)
data = capture(original_path)

# Window every file. With the default slice_rate=0 all windows end up in
# ``train``; ``test`` only holds empty lists.
train, test = slice_enc(data)

# Flatten the windows. Train_Y holds, for every window in Train_X, the name
# of the file it was cut from.
Train_X, Train_Y = add_labels(train)

# File names without extension, used for the output file.
file_name = [os.path.splitext(name)[0] for name in Train_Y]

# NOTE(review): the scaler is fitted on the data being predicted, not reused
# from training -- confirm this matches how the model was trained.
scalar = preprocessing.StandardScaler().fit(Train_X)
Train_X = scalar.transform(Train_X)

# Append a trailing axis of size 1. The sample count is read from the data
# instead of the hard-coded 35950, which only fitted one specific data set.
Train_X = Train_X.reshape(Train_X.shape[0], 1024, 1)

### 调用模型并得出预测结果

In [8]:
# Restore the trained network from the path configured in ``params``.
model_load = load_model(params['model'])

# Run the network without progress output, then take, per row, the index of
# the largest output as the predicted label.
pre_x = model_load.predict(Train_X, verbose=0)
predict_label = np.argmax(pre_x, 1)

print("预测成功!")



预测成功!


### 存入csv文件中（filename按字符串的字典序排列，故输出文件最后是Test99.csv，Test100及之后的文件排在中间）

In [9]:
# Two columns: the predicted label and, for the same row, the name (without
# extension) of the file the sample came from. No index column is written.
df = pd.DataFrame(
    {'label': predict_label, 'filename': file_name},
    columns=['label', 'filename'],
)
df.to_csv("测试3.csv", index=False)
print("保存成功！")

保存成功！
