# 转换器 transformer
实例化是一个转换器类，就是在特征工程中所说的StandardScaler或者MinMaxScaler等，可以对数据进行输入和转换

# 估计器 estimator
实例化是一个估计器类，可以对训练集进行训练，并对新输入实例进行预测。

# 特征工程
## 1. 特征抽取

特征抽取API：
- sklearn.feature_extraction

一些常用语法：
- fit_transform(x) # 载入数据并进行转换
- fit()  # 载入数据但不转换
- transform()  # 将载入的数据进行转换
- inverse_transform(x)
- get_feature_names() 

### 1.1 文本向量

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
Count = CountVectorizer()
res = Count.fit_transform(['life is short, i like python', 'life is too long, i dislike python'])
Count.get_feature_names(), res.toarray()

(['dislike', 'is', 'life', 'like', 'long', 'python', 'short', 'too'],
 array([[0, 1, 1, 1, 0, 1, 1, 0],
        [1, 1, 1, 0, 1, 1, 0, 1]], dtype=int64))

注意，如果是中文，需要用jieba分词

### 1.2 将字典进行特征抽取

In [24]:
# 如果sparse是True
from sklearn.feature_extraction import DictVectorizer
Dict1 = DictVectorizer(sparse = True)
data = [{'city':'beijing','t':100},
        {'city':'shanghai','t':80},
        {'city':'chengdu','t':20}]
res1 = Dict1.fit_transform(data)
print(res1)
print(Dict1.get_feature_names())

  (0, 0)	1.0
  (0, 3)	100.0
  (1, 2)	1.0
  (1, 3)	80.0
  (2, 1)	1.0
  (2, 3)	20.0
['city=beijing', 'city=chengdu', 'city=shanghai', 't']


In [25]:
# 如果sparse是False，则是one-hot编码
Dict2 = DictVectorizer(sparse = False)
res2 = Dict2.fit_transform(data)
print(res2)
print(Dict2.get_feature_names())

[[  1.   0.   0. 100.]
 [  0.   0.   1.  80.]
 [  0.   1.   0.  20.]]
['city=beijing', 'city=chengdu', 'city=shanghai', 't']


### 1.3 tfidf

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
data3 = ['life is short, i love python',
        'life is too long, i dislike python']
res3 = tf.fit_transform(data3)
print(tf.get_feature_names())
print(res[0])
print(res.toarray()[0])

['dislike', 'is', 'life', 'long', 'love', 'python', 'short', 'too']
  (0, 13)	0.4451321885325025
  (0, 8)	0.4451321885325025
  (0, 14)	0.5645937386324167
  (0, 4)	0.4451321885325025
  (0, 6)	0.29462843463227384
[0.         0.         0.         0.         0.44513219 0.
 0.29462843 0.         0.44513219 0.         0.         0.
 0.         0.44513219 0.56459374 0.         0.         0.
 0.         0.         0.        ]


## 2. 特征处理
- 归一化
- 标准化
- 缺失值

特征处理的API：
归一化，标准化：
- sklearn.preprocessing

缺失值处理：
- sklearn.impute

### 2.1 最大最小归一化

In [49]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
data = [[17,23,34,43],[42,54,66,23],[74,87,39,19]]
res = mm.fit_transform(data)
res, res.mean(), res.std()

(array([[0.        , 0.        , 0.        , 1.        ],
        [0.43859649, 0.484375  , 1.        , 0.16666667],
        [1.        , 1.        , 0.15625   , 0.        ]]),
 0.437157346491228,
 0.4268000425575319)

但是最大最小归一化对异常值敏感

### 2.2 标准化

In [43]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
res = st.fit_transform(data)
res

array([[-1.17166771, -1.2117899 , -0.87747721,  1.3970014 ],
       [-0.10002041, -0.02551137,  1.39922042, -0.50800051],
       [ 1.27168813,  1.23730126, -0.52174321, -0.88900089]])

在已有样本足够多的情况下，相比于最大最小归一化而言，比较稳定

### 2.3 缺失值处理

In [61]:
from sklearn.impute import SimpleImputer
import numpy as np
im = SimpleImputer(missing_values = np.nan, strategy = 'mean')
data = [[1, 2],[np.nan, 3], [4, np.nan]]
res = im.fit_transform(data)
res

array([[1. , 2. ],
       [2.5, 3. ],
       [4. , 2.5]])

## 3. 特征降维

特征选择API：
- sklearn.feature_selection

主成分分析API
- sklearn.decomposition

### 3.1 按照方差进行特征选择

In [71]:
from sklearn.feature_selection import VarianceThreshold
var = VarianceThreshold(threshold = 6)
data = [[2,3,4,0],[1,2,6,3],[7,9,3,5]]
res = var.fit_transform(data)
res

array([[2, 3],
       [1, 2],
       [7, 9]])

### 3.2 主成分分析

In [84]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
data = [[2,8,4,5],[6,3,5,7],[8,9,3,5]]
res = pca.fit_transform(data)
res

array([[-1.6983001 ,  3.23250833],
       [ 4.06852232, -0.33733189],
       [-2.37022222, -2.89517644]])

### 3.3 线性判别分析

In [21]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 2)
lda_x = lda.fit_transform(iris.data, iris.target)
lda_x[0]

array([8.06179978, 0.30042062])

## 4. 压缩数据

In [None]:
data = data.query('x > 1 & y < 2')

## 5. 构造新特征
### 5.1 日期时间数据

In [None]:
# 将时间日期数据转换成字典格式
time_value = pd.to_datetime(data['time'], unit = 's')
time_value = pd.DateTimeIndex(time_value)

# 直接提取字典格式中的数据
data['day'] = time_value.day
data['hour'] = time_value.hour
data['year'] = time_value.year

# 6. 数据编码

In [12]:
from sklearn.preprocessing import LabelEncoder
y = ['a','b','a','c','c','d','b']
enc = LabelEncoder()
y_encodered = enc.fit_transform(y)
y_encodered

array([0, 1, 0, 2, 2, 3, 1], dtype=int64)

# 获取数据
获取数据API：
- from sklearn import datasets

数据可查看：
- data 
- target
- DESCR

In [1]:
from sklearn import datasets
li = datasets.load_iris()
li.data, li.target, li.DESCR

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

# 数据划分
数据划分API：
- sklearn.model_selection.train_test_split

In [2]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = \
train_test_split(li.data, li.target, test_size = 0.25)