# Task3 特征工程

此部分为零基础入门数据挖掘-心跳信号分类预测的 Task3 特征工程部分，带你来了解时间序列特征工程以及分析方法，欢迎大家后续多多交流。

赛题：零基础入门数据挖掘-心跳信号分类预测

项目地址：
比赛地址：

## 3.1 学习目标

* 学习时间序列数据的特征预处理方法
* 学习时间序列特征处理工具 tsfresh（Time Series FeatuRe Extraction on basis of Scalable Hypothesis tests）的使用

## 3.2 内容介绍
* 数据预处理
	* 时间序列数据格式处理
	* 加入时间步特征time
* 特征工程
	* 时间序列特征构造
	* 特征筛选
	* 使用 tsfresh 进行时间序列特征处理

## 3.3 代码示例

### 3.3.1 导入包并读取数据

In [1]:
# 包导入
import pandas as pd
import numpy as np
# import tsfresh as tsf
# from tsfresh import extract_features, select_features
# from tsfresh.utilities.dataframe_functions import impute
# # from tsfresh.examples.robot_execution_failures import download_robot_execution_failures,load_robot_execution_failures
# from tsfresh import extract_features, extract_relevant_features, select_features
# from tsfresh.utilities.dataframe_functions import impute
# from tsfresh.feature_extraction import ComprehensiveFCParameters

In [2]:
# Load the training set and the test-A set from the local data directory,
# then report each frame's (rows, columns) shape, one per line.
data_train = pd.read_csv('./data/train.csv')
data_test_A = pd.read_csv('./data/testA.csv')

print(data_train.shape, data_test_A.shape, sep='\n')

(100000, 3)
(20000, 2)


In [3]:
# Preview the first 10 rows of the raw training data.
data_train.head(10)

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0
5,5,"1.0,0.8675497147252661,0.5128848334259041,0.36...",0.0
6,6,"0.9505940730409403,0.9166910623948625,0.848396...",2.0
7,7,"0.8681707532149519,0.8318642354644805,0.531120...",2.0
8,8,"0.9792414410794537,0.6155508397973931,0.632268...",3.0
9,9,"0.9917559671326545,1.0,0.9740518898684873,0.93...",2.0


In [4]:
# data_train['label']

In [5]:
# data_train = data_train.drop(['label'], axis=1)
# data_train

In [6]:
# Preview the first rows of the test-A data.
data_test_A.head()

Unnamed: 0,id,heartbeat_signals
0,100000,"0.9915713654170097,1.0,0.6318163407681274,0.13..."
1,100001,"0.6075533139615096,0.5417083883163654,0.340694..."
2,100002,"0.9752726292239277,0.6710965234906665,0.686758..."
3,100003,"0.9956348033996116,0.9170249621481004,0.521096..."
4,100004,"1.0,0.8879490481178918,0.745564725322326,0.531..."


In [8]:
### 3.3.2 数据预处理
# Reshape the comma-separated signal strings from wide to long format: one row
# per sample value, with the position of the value inside its signal kept as
# the `time` step and the original row number kept as the index.
train_heartbeat_df = (
    data_train['heartbeat_signals']
    .str.split(",", expand=True)    # one column per sample position
    .stack()                        # Series indexed by (row, position)
    .reset_index()                  # columns: level_0, level_1, 0
    .set_index("level_0")           # original row number becomes the index
    .rename(columns={"level_1": "time", 0: "heartbeat_signals"})
    .astype({"heartbeat_signals": float})  # split() yields strings
)
train_heartbeat_df.index.name = None

train_heartbeat_df.head(10)

Unnamed: 0,time,heartbeat_signals
0,0,0.99123
0,1,0.943533
0,2,0.764677
0,3,0.618571
0,4,0.379632
0,5,0.190822
0,6,0.040237
0,7,0.025995
0,8,0.031709
0,9,0.065524


In [9]:
# Keep the labels aside as their own Series, then replace the raw signal
# strings with the long-format (time, heartbeat_signals) rows. The join aligns
# on the row index, so every sample of a signal is paired with that signal's id.
data_train_label = data_train['label']
data_train = (
    data_train
    .drop(columns=['label', 'heartbeat_signals'])
    .join(train_heartbeat_df)
)

data_train.head(10)

Unnamed: 0,id,time,heartbeat_signals
0,0,0,0.99123
0,0,1,0.943533
0,0,2,0.764677
0,0,3,0.618571
0,0,4,0.379632
0,0,5,0.190822
0,0,6,0.040237
0,0,7,0.025995
0,0,8,0.031709
0,0,9,0.065524


In [10]:
# Show every long-format row that belongs to the signal with id 1.
data_train[data_train["id"]==1]

Unnamed: 0,id,time,heartbeat_signals
1,1,0,0.971482
1,1,1,0.928969
1,1,2,0.572933
1,1,3,0.178457
1,1,4,0.122962
...,...,...,...
1,1,200,0.000000
1,1,201,0.000000
1,1,202,0.000000
1,1,203,0.000000


In [11]:
# Compute per-signal summary features with tsfresh. `id` identifies which rows
# belong to the same signal and `time` is passed as the sort column.
# MinimalFCParameters restricts the extraction to a small set of basic
# statistics (sum, median, mean, length, standard deviation, variance,
# root mean square, maximum, minimum).
from tsfresh.feature_extraction import MinimalFCParameters, extract_features

train_features = extract_features(
    data_train,
    column_id='id',
    column_sort='time',
    default_fc_parameters=MinimalFCParameters(),
)
train_features.head(10)

Feature Extraction: 100%|██████████████████████| 10/10 [01:41<00:00, 10.12s/it]


Unnamed: 0,heartbeat_signals__sum_values,heartbeat_signals__median,heartbeat_signals__mean,heartbeat_signals__length,heartbeat_signals__standard_deviation,heartbeat_signals__variance,heartbeat_signals__root_mean_square,heartbeat_signals__maximum,heartbeat_signals__minimum
0,38.927945,0.125531,0.189892,205.0,0.229783,0.0528,0.298093,1.0,0.0
1,19.445634,0.030481,0.094857,205.0,0.16908,0.028588,0.193871,1.0,0.0
2,21.192974,0.0,0.10338,205.0,0.184119,0.0339,0.211157,1.0,0.0
3,42.113066,0.241397,0.20543,205.0,0.186186,0.034665,0.277248,1.0,0.0
4,69.756786,0.0,0.340277,205.0,0.366213,0.134112,0.499901,0.999908,0.0
5,25.524279,0.041579,0.124509,205.0,0.175176,0.030687,0.214916,1.0,0.0
6,49.344826,0.326956,0.240706,205.0,0.222915,0.049691,0.328071,1.0,0.0
7,52.710158,0.336291,0.257123,205.0,0.239443,0.057333,0.351347,1.0,0.0
8,45.128485,0.267322,0.220139,205.0,0.199813,0.039925,0.297298,1.0,0.0
9,66.477343,0.340271,0.32428,205.0,0.175118,0.030666,0.368543,1.0,0.0


In [12]:
# 2. Feature selection
# NOTE: train_features was extracted with MinimalFCParameters, so it holds only
# the nine basic statistics shown in its output. The figure of 779 common
# time-series features quoted in the original note presumably refers to
# tsfresh's full default settings (see the official documentation for the
# explanation of each feature) -- verify before relying on that number.
# Some features may come out as NaN when the data does not support computing
# them. Per the tsfresh documentation, impute() replaces NaN/inf values
# column-wise and modifies the frame in place:
from tsfresh.utilities.dataframe_functions import impute
impute(train_features)

Unnamed: 0,heartbeat_signals__sum_values,heartbeat_signals__median,heartbeat_signals__mean,heartbeat_signals__length,heartbeat_signals__standard_deviation,heartbeat_signals__variance,heartbeat_signals__root_mean_square,heartbeat_signals__maximum,heartbeat_signals__minimum
0,38.927945,0.125531,0.189892,205.0,0.229783,0.052800,0.298093,1.000000,0.0
1,19.445634,0.030481,0.094857,205.0,0.169080,0.028588,0.193871,1.000000,0.0
2,21.192974,0.000000,0.103380,205.0,0.184119,0.033900,0.211157,1.000000,0.0
3,42.113066,0.241397,0.205430,205.0,0.186186,0.034665,0.277248,1.000000,0.0
4,69.756786,0.000000,0.340277,205.0,0.366213,0.134112,0.499901,0.999908,0.0
...,...,...,...,...,...,...,...,...,...
99995,63.323449,0.388402,0.308895,205.0,0.211636,0.044790,0.374441,1.000000,0.0
99996,69.657534,0.421138,0.339793,205.0,0.199966,0.039986,0.394266,1.000000,0.0
99997,40.897057,0.213306,0.199498,205.0,0.200657,0.040263,0.282954,1.000000,0.0
99998,42.333303,0.264974,0.206504,205.0,0.164380,0.027021,0.263941,1.000000,0.0


In [13]:
# Next, select features by their relevance to the response variable. This takes
# two steps: first the relevance of each feature to the target is tested
# individually, then the Benjamini-Yekutieli procedure decides which features
# are kept.
# NOTE(review): the labels are float-typed class ids (0.0, 2.0, 3.0, ...).
# Confirm against the tsfresh docs that select_features treats this target as a
# classification problem as intended (it accepts an `ml_task` argument).
from tsfresh import select_features

# Select features by their relevance to the data labels
train_features_filtered = select_features(train_features, data_train_label)

train_features_filtered

Unnamed: 0,heartbeat_signals__sum_values,heartbeat_signals__median,heartbeat_signals__mean,heartbeat_signals__standard_deviation,heartbeat_signals__variance,heartbeat_signals__root_mean_square,heartbeat_signals__maximum,heartbeat_signals__minimum
0,38.927945,0.125531,0.189892,0.229783,0.052800,0.298093,1.000000,0.0
1,19.445634,0.030481,0.094857,0.169080,0.028588,0.193871,1.000000,0.0
2,21.192974,0.000000,0.103380,0.184119,0.033900,0.211157,1.000000,0.0
3,42.113066,0.241397,0.205430,0.186186,0.034665,0.277248,1.000000,0.0
4,69.756786,0.000000,0.340277,0.366213,0.134112,0.499901,0.999908,0.0
...,...,...,...,...,...,...,...,...
99995,63.323449,0.388402,0.308895,0.211636,0.044790,0.374441,1.000000,0.0
99996,69.657534,0.421138,0.339793,0.199966,0.039986,0.394266,1.000000,0.0
99997,40.897057,0.213306,0.199498,0.200657,0.040263,0.282954,1.000000,0.0
99998,42.333303,0.264974,0.206504,0.164380,0.027021,0.263941,1.000000,0.0
