In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import featuretools as ft

In [2]:
import sklearn

# Record the library versions this notebook was run with (one per line).
print(sklearn.__version__, ft.__version__, sep='\n')

0.22.2.post1
0.13.4


# 单个数据表

In [3]:
# Load the iris dataset and pull out the pieces used by the following cells.
dataset = load_iris()
X, y = dataset.data, dataset.target           # feature values and target labels
iris_feature_names = dataset.feature_names    # used as DataFrame column names below

In [4]:
# Wrap the raw feature values in a DataFrame labelled with the iris feature names.
df = pd.DataFrame(data=X, columns=iris_feature_names)
df.head()   # preview the first rows

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Create an entity set named 'single_dataframe' and register the iris frame in it.
# featuretools is already imported as `ft` in the first cell, so it is not
# re-imported here.
es = ft.EntitySet(id='single_dataframe')

# Variable types are left to featuretools' inference. To set them explicitly,
# pass for example
#     variable_types={col: ft.variable_types.Numeric for col in df.columns}
# Supported types: https://docs.featuretools.com/en/stable/api_reference.html#variable-types
es.entity_from_dataframe(
    entity_id='iris',    # name of the entity inside the entity set
    dataframe=df,
    index='index',       # name of the index column
    make_index=True,     # df has no 'index' column, so featuretools creates it
)

Entityset: single_dataframe
  Entities:
    iris [Rows: 150, Columns: 5]
  Relationships:
    No relationships

## max_depth等于1

In [6]:
# Transform primitives to apply: pairwise add / subtract / multiply / divide of
# two numeric columns. (ft.list_primitives() lists every available primitive.)
trans_primitives = [
    'add_numeric',
    'subtract_numeric',
    'multiply_numeric',
    'divide_numeric',
]
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='iris',
    trans_primitives=trans_primitives,
    max_depth=1,   # depth 1: primitives are applied to the original columns only
    verbose=1,
)
# Feature-count arithmetic for the 4 iris columns (n = 4). A column is never
# combined with itself (no a + a):
#   * commutative primitive (add, multiply): n*(n-1)/2 = 6 new features each
#   * non-commutative primitive (divide):    n*(n-1)   = 12 new features
# If subtraction were non-commutative as well, the total would be
#   4 + 6 + 12 + 6 + 12 = 40 features.
# featuretools 0.13.4 treats subtract_numeric as commutative by default, so it
# also yields only 6 features and the run reports 4 + 6 + 6 + 6 + 12 = 34.

Built 34 features
Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [7]:
# Display the list of feature definitions returned by ft.dfs above.
feature_names

[<Feature: sepal length (cm)>,
 <Feature: sepal width (cm)>,
 <Feature: petal length (cm)>,
 <Feature: petal width (cm)>,
 <Feature: petal length (cm) + sepal length (cm)>,
 <Feature: sepal length (cm) + sepal width (cm)>,
 <Feature: petal width (cm) + sepal width (cm)>,
 <Feature: petal length (cm) + petal width (cm)>,
 <Feature: petal length (cm) + sepal width (cm)>,
 <Feature: petal width (cm) + sepal length (cm)>,
 <Feature: petal length (cm) - sepal length (cm)>,
 <Feature: sepal length (cm) - sepal width (cm)>,
 <Feature: petal width (cm) - sepal width (cm)>,
 <Feature: petal length (cm) - petal width (cm)>,
 <Feature: petal length (cm) - sepal width (cm)>,
 <Feature: petal width (cm) - sepal length (cm)>,
 <Feature: petal length (cm) * sepal length (cm)>,
 <Feature: sepal length (cm) * sepal width (cm)>,
 <Feature: petal width (cm) * sepal width (cm)>,
 <Feature: petal length (cm) * petal width (cm)>,
 <Feature: petal length (cm) * sepal width (cm)>,
 <Feature: petal width (cm) 

In [8]:
# Same primitives as the previous run, but subtraction is passed as
# SubtractNumeric(commutative=False) so that a - b and b - a are generated
# as separate features.
trans_primitives = [
    'add_numeric',
    ft.primitives.SubtractNumeric(commutative=False),
    'multiply_numeric',
    'divide_numeric',
]
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='iris',
    trans_primitives=trans_primitives,
    max_depth=1,   # depth 1: primitives are applied to the original columns only
    verbose=1,
)

Built 40 features
Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


生成的特征中可能会出现 np.nan 或 np.inf，分别表示空值和无穷大。如果原始特征中没有这样的异常数据，前者可能由“0/0”造成，后者可能由“R/0”造成，其中 R 是非零实数。

In [9]:
# DataFrame.replace returns a new frame rather than modifying feature_matrix,
# so the result has to be kept; otherwise the +/-inf values are still present
# (and not counted) when the missing values are tallied.
feature_matrix_finite = feature_matrix.replace([np.inf, -np.inf], np.nan)  # map +/-inf to NaN
feature_matrix_finite.isnull().sum()   # per-column count of NaN (including former +/-inf) values

sepal length (cm)                        0
sepal width (cm)                         0
petal length (cm)                        0
petal width (cm)                         0
petal length (cm) + sepal length (cm)    0
sepal length (cm) + sepal width (cm)     0
petal width (cm) + sepal width (cm)      0
petal length (cm) + petal width (cm)     0
petal length (cm) + sepal width (cm)     0
petal width (cm) + sepal length (cm)     0
petal length (cm) - sepal length (cm)    0
sepal length (cm) - sepal width (cm)     0
sepal length (cm) - petal width (cm)     0
sepal width (cm) - petal width (cm)      0
petal width (cm) - sepal width (cm)      0
petal length (cm) - sepal width (cm)     0
petal length (cm) - petal width (cm)     0
sepal width (cm) - sepal length (cm)     0
sepal width (cm) - petal length (cm)     0
petal width (cm) - sepal length (cm)     0
petal width (cm) - petal length (cm)     0
sepal length (cm) - petal length (cm)    0
petal length (cm) * sepal length (cm)    0
sepal lengt

## max_depth不为1
注意基元的顺序带来的影响，这里以乘、除两个基元为例说明

In [10]:
# Multiply listed before divide.
feat_matrix, feat_names = ft.dfs(
    entityset=es,
    target_entity='iris',
    trans_primitives=['multiply_numeric', 'divide_numeric'],
    max_depth=2,
    verbose=1,
)
# Multiplication gives 4*3/2 = 6 products, i.e. 10 features with the 4 originals.
# Division is then applied across those 10 features:
# 10*9 ordered pairs + the 10 inputs = 100 features.

Built 100 features
Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [11]:
# Divide listed before multiply.
feat_matrix, feat_names = ft.dfs(
    entityset=es,
    target_entity='iris',
    trans_primitives=['divide_numeric', 'multiply_numeric'],
    max_depth=2,
    verbose=1,
)
# Division gives 4*3 = 12 quotients, i.e. 16 features with the 4 originals.
# Multiplication is then applied across those 16 features:
# 16*15/2 unordered pairs + the 16 inputs = 136 features.

Built 136 features
Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


# 多个数据表

In [12]:
# Two small related tables: df_1 has one row per id, df_2 repeats id 1.
df_1 = pd.DataFrame({
    'id': [0, 1, 2, 3],
    'a': [1, 2, 2, 3],
    'b': [2, 4, 4, 5],
})
df_2 = pd.DataFrame({
    'id': [0, 1, 1, 2, 3],
    'c': [1, 3, 3, 2, 5],
    'd': [5, 6, 7, 9, 8],
})

es = ft.EntitySet(id='double_dataframe')
# df_1 is indexed by its existing 'id' column.
es.entity_from_dataframe(entity_id='df_1', dataframe=df_1, index='id')
# df_2's 'id' values repeat, so a new 'index' column is created to index it.
es.entity_from_dataframe(entity_id='df_2', dataframe=df_2, index='index', make_index=True)

# Link the df_1 and df_2 entities through their 'id' columns.
relation = ft.Relationship(es['df_1']['id'], es['df_2']['id'])
es = es.add_relationship(relation)

In [13]:
# Depth-1 features for df_1: sums of its own columns (add_numeric) plus
# sum / median aggregations over the related df_2 rows.
agg_primitives = ['sum', 'median']
trans_primitives = ['add_numeric']
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='df_1',
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    max_depth=1,
    verbose=1,
)

Built 7 features
Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [14]:
# Display the depth-1 feature matrix computed for df_1.
feature_matrix

Unnamed: 0_level_0,a,b,SUM(df_2.c),SUM(df_2.d),MEDIAN(df_2.c),MEDIAN(df_2.d),a + b
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2,1,5,1,5.0,3
1,2,4,6,13,3,6.5,6
2,2,4,2,9,2,9.0,6
3,3,5,5,8,5,8.0,8


In [15]:
# Same primitives as the depth-1 run, but with max_depth=2 so primitives can be
# stacked on each other's results (e.g. SUM(df_2.c + d), a + SUM(df_2.d)).
agg_primitives = ['sum', 'median']
trans_primitives = ['add_numeric']
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='df_1',
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    max_depth=2,
    verbose=1,
)

Built 23 features
Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [16]:
# Display the depth-2 feature matrix computed for df_1.
feature_matrix

Unnamed: 0_level_0,a,b,SUM(df_2.c),SUM(df_2.d),MEDIAN(df_2.c),MEDIAN(df_2.d),a + b,SUM(df_2.c + d),MEDIAN(df_2.c + d),MEDIAN(df_2.c) + MEDIAN(df_2.d),...,MEDIAN(df_2.d) + SUM(df_2.c),a + SUM(df_2.d),a + SUM(df_2.c),b + MEDIAN(df_2.d),b + SUM(df_2.d),b + MEDIAN(df_2.c),b + SUM(df_2.c),a + MEDIAN(df_2.c),a + MEDIAN(df_2.d),SUM(df_2.c) + SUM(df_2.d)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,2,1,5,1,5.0,3,6,6.0,6.0,...,6.0,6,2,7.0,7,3,3,2,6.0,6
1,2,4,6,13,3,6.5,6,19,9.5,9.5,...,12.5,15,8,10.5,17,7,10,5,8.5,19
2,2,4,2,9,2,9.0,6,11,11.0,11.0,...,11.0,11,4,13.0,13,6,6,4,11.0,11
3,3,5,5,8,5,8.0,8,13,13.0,13.0,...,13.0,11,8,13.0,13,10,10,8,11.0,13
