# 第6课：特征工程

## 学习目标
- 理解特征工程的重要性
- 掌握特征选择方法
- 学会特征变换技术
- 了解特征创建方法

## 1. 特征工程简介

特征工程是将原始数据转换为能更好地表示问题本质的特征，从而提高模型性能的过程。

**"数据和特征决定了机器学习的上限，而模型和算法只是逼近这个上限"**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
# Suppress every warning so the notebook output stays readable. This is a
# blanket filter: warnings raised in later cells are hidden as well.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so the random sample data is reproducible.
np.random.seed(42)

## 2. 数值特征处理

In [None]:
# Build a reproducible 100-row demo table. The three draws share NumPy's
# global random stream, so they must stay in this order: age, income, score.
np.random.seed(42)
data = pd.DataFrame(dict(
    age=np.random.randint(18, 70, 100),        # integers drawn from [18, 70)
    income=np.random.exponential(50000, 100),  # exponential draws, scale 50000
    score=np.random.normal(75, 10, 100),       # normal draws, mean 75, std 10
))

# Summary statistics of the raw columns.
print("原始数据统计:", data.describe(), sep="\n")

In [None]:
# Fit three different scalers on the same table so their output can be compared.

# 2.1 Standardization (z-score)
scaler_standard = StandardScaler()
data_standard = scaler_standard.fit_transform(data)

# 2.2 Normalization (min-max)
scaler_minmax = MinMaxScaler()
data_minmax = scaler_minmax.fit_transform(data)

# 2.3 Robust scaling (insensitive to outliers)
scaler_robust = RobustScaler()
data_robust = scaler_robust.fit_transform(data)

# Visual comparison: one box-plot panel per version of the data, raw first.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
panels = [
    ('原始数据', data.values),
    ('标准化', data_standard),
    ('归一化', data_minmax),
    ('鲁棒缩放', data_robust),
]
for ax, (title, values) in zip(axes, panels):
    ax.boxplot(values)
    ax.set_xticklabels(data.columns)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# 2.4 Log and power transforms (for handling a skewed distribution)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Raw income distribution
axes[0].hist(data['income'], bins=30)
axes[0].set_title('原始 income 分布')

# Log transform; log1p(x) computes log(1 + x), so a zero value stays finite.
data['income_log'] = np.log1p(data['income'])
axes[1].hist(data['income_log'], bins=30)
axes[1].set_title('对数变换后')

# Box-Cox transform. The method has to be 'box-cox' so that the computation
# matches the column name and the plot title; 'yeo-johnson' is a different
# transform. Box-Cox accepts strictly positive input only, which the
# exponentially distributed income column satisfies.
pt = PowerTransformer(method='box-cox')
# fit_transform returns an (n, 1) array; flatten it into a 1-D column.
data['income_boxcox'] = pt.fit_transform(data[['income']]).ravel()
axes[2].hist(data['income_boxcox'], bins=30)
axes[2].set_title('Box-Cox 变换后')

plt.tight_layout()
plt.show()

In [None]:
# 2.5 Binning: turn the numeric age column into categorical buckets.

# Equal-width binning: five intervals of the same width.
equal_width_labels = ['很年轻', '年轻', '中年', '中老年', '老年']
data['age_bin_equal_width'] = pd.cut(data['age'], bins=5, labels=equal_width_labels)

# Equal-frequency binning: five quantile-based buckets.
equal_freq_labels = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
data['age_bin_equal_freq'] = pd.qcut(data['age'], q=5, labels=equal_freq_labels)

# Custom binning: hand-picked bucket edges.
custom_edges = [0, 25, 35, 50, 100]
custom_labels = ['青年', '壮年', '中年', '老年']
data['age_bin_custom'] = pd.cut(data['age'], bins=custom_edges, labels=custom_labels)

# Show the raw age next to the three binned versions.
preview_columns = ['age', 'age_bin_equal_width', 'age_bin_equal_freq', 'age_bin_custom']
print("分箱结果:")
print(data[preview_columns].head(10))

## 3. 分类特征编码

In [None]:
# Small categorical demo table with three string columns; the encoding
# cells that follow operate on it.
cat_data = pd.DataFrame(dict(
    color=['红', '蓝', '绿', '红', '蓝', '绿', '红', '蓝'],
    size=['小', '中', '大', '中', '小', '大', '大', '小'],
    quality=['低', '中', '高', '高', '中', '低', '高', '中'],
))

print("原始分类数据:", cat_data, sep="\n")

In [None]:
# 3.1 Label encoding: replace every color with an integer code.
le = LabelEncoder()
color_codes = le.fit_transform(cat_data['color'])
cat_data['color_label'] = color_codes

print("标签编码:")
print(cat_data[['color', 'color_label']])

# Pair each fitted class with its position in le.classes_.
label_mapping = {label: code for code, label in enumerate(le.classes_)}
print(f"编码映射: {label_mapping}")

In [None]:
# 3.2 One-hot encoding: one indicator column per distinct color.
# dtype=int makes the indicators 0/1 integers; with the default dtype, recent
# pandas versions return booleans (True/False) instead.
one_hot = pd.get_dummies(cat_data['color'], prefix='color', dtype=int)
print("独热编码:")
print(pd.concat([cat_data['color'], one_hot], axis=1))

In [None]:
# 3.3 Ordinal encoding
# Suited to categorical variables whose levels have a natural order.
size_order = ['小', '中', '大']
quality_order = ['低', '中', '高']

# Variant A: OrdinalEncoder with the level order passed in explicitly.
oe = OrdinalEncoder(categories=[size_order])
cat_data['size_ordinal'] = oe.fit_transform(cat_data[['size']])

# Variant B: manual mapping; each level's code is its position in quality_order.
quality_map = {level: rank for rank, level in enumerate(quality_order)}
cat_data['quality_ordinal'] = cat_data['quality'].map(quality_map)

print("有序编码:")
ordinal_columns = ['size', 'size_ordinal', 'quality', 'quality_ordinal']
print(cat_data[ordinal_columns])

In [None]:
# 3.4 Target encoding
# Replace each category with the mean of the target variable over the rows
# that share that category.
target_data = pd.DataFrame({
    'city': ['北京', '上海', '北京', '深圳', '上海', '北京', '深圳', '上海'],
    'target': [1, 0, 1, 0, 1, 1, 0, 1]
})

# Per-city mean of the target (also used for the summary printed below).
city_target_mean = target_data.groupby('city')['target'].mean()

# Broadcast every group's mean back onto the rows of that group.
target_data['city_target_encoded'] = (
    target_data.groupby('city')['target'].transform('mean')
)

print("目标编码:", target_data, sep="\n")
print(f"\n城市目标均值: {city_target_mean.to_dict()}")

## 4. 特征创建

In [None]:
# 4.1 Polynomial features
# Expand a tiny two-column matrix with degree-2 terms, without the bias column.
X_simple = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
])
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_simple)

print("原始特征:", X_simple, sep="\n")
print("\n多项式特征（degree=2）:")
print(X_poly)

# Name the generated columns in terms of the inputs x1 and x2.
poly_feature_names = poly.get_feature_names_out(['x1', 'x2'])
print(f"\n特征名称: {poly_feature_names}")

In [None]:
# 4.2 Interaction features
df = pd.DataFrame({
    'price': [100, 200, 150, 300],
    'quantity': [10, 5, 8, 3]
})

# Derive three new columns by combining price and quantity.
df = df.assign(
    total_value=lambda t: t['price'].mul(t['quantity']),
    price_per_unit=lambda t: t['price'].div(t['quantity']),
    # The +1 in the denominator keeps the ratio finite for a zero quantity.
    price_quantity_ratio=lambda t: t['price'].div(t['quantity'].add(1)),
)

print("交互特征:", df, sep="\n")

In [None]:
# 4.3 Date feature extraction
# Ten consecutive days starting on 2024-01-01.
date_df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=10, freq='D')
})

# Calendar components read straight from the datetime accessor.
for part in ('year', 'month', 'day', 'dayofweek'):
    date_df[part] = getattr(date_df['date'].dt, part)

# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
date_df['is_weekend'] = (date_df['dayofweek'] >= 5).astype(int)

for part in ('quarter', 'dayofyear'):
    date_df[part] = getattr(date_df['date'].dt, part)

print("日期特征:", date_df, sep="\n")

In [None]:
# 4.4 Text features
text_df = pd.DataFrame({
    'text': [
        'Hello World',
        'Machine Learning is awesome',
        'Python',
        'Data Science with Python and Machine Learning'
    ]
})

# Simple length-based features.
text_df['char_count'] = text_df['text'].map(len)
text_df['word_count'] = text_df['text'].str.split().map(len)
text_df['avg_word_length'] = text_df['char_count'].div(text_df['word_count'])

# Keyword flag: 1 when the text mentions "Python" in any letter case.
mentions_python = text_df['text'].str.contains('Python', case=False)
text_df['has_python'] = mentions_python.astype(int)

print("文本特征:", text_df, sep="\n")

## 5. 特征选择

In [None]:
# Load the data: scikit-learn's bundled breast cancer dataset.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

# Feature table with named columns, plus the target vector.
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

n_samples, n_features = X.shape
print(f"特征数量: {n_features}")
print(f"样本数量: {n_samples}")

In [None]:
# 5.1 Statistics-based feature selection
# F-test: keep the 10 features with the highest F score.
selector_f = SelectKBest(score_func=f_classif, k=10)
X_f = selector_f.fit_transform(X, y)

# One row per feature: its score and whether the selector kept it.
f_scores = pd.DataFrame({
    'feature': X.columns,
    'f_score': selector_f.scores_,
    'selected': selector_f.get_support()
})
f_scores = f_scores.sort_values('f_score', ascending=False)

print("F-检验特征排名（前15）:")
print(f_scores.head(15))

In [None]:
# 5.2 Feature selection based on mutual information
from functools import partial

# The mutual information estimator is randomized. Pinning random_state keeps
# the scores identical every time this cell runs, independent of how much of
# the global random stream earlier cells have consumed.
mi_score_func = partial(mutual_info_classif, random_state=42)
selector_mi = SelectKBest(mi_score_func, k=10)
X_mi = selector_mi.fit_transform(X, y)

# One row per feature with its mutual information score, best first.
mi_scores = pd.DataFrame({
    'feature': X.columns,
    'mi_score': selector_mi.scores_
}).sort_values('mi_score', ascending=False)

print("互信息特征排名（前15）:")
print(mi_scores.head(15))

In [None]:
# 5.3 Recursive feature elimination (RFE)
# RFE repeatedly drops the feature with the smallest coefficient magnitude.
# Coefficients are only comparable when all features share a scale, so the
# elimination is run on standardized features; on the raw columns the ranking
# would mostly reflect each feature's unit of measurement.
X_scaled = StandardScaler().fit_transform(X)

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=10)
rfe.fit(X_scaled, y)

# Keep X_rfe in the original units: take the selected columns from X itself.
X_rfe = X.loc[:, rfe.support_].to_numpy()

# One row per feature: elimination rank (1 = kept) and the selection mask.
rfe_ranking = pd.DataFrame({
    'feature': X.columns,
    'ranking': rfe.ranking_,
    'selected': rfe.support_
}).sort_values('ranking')

print("RFE 特征排名:")
print(rfe_ranking.head(15))

In [None]:
# 5.4 Model-based feature selection
# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Feature importances, most important first.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
})
importance_df = importance_df.sort_values('importance', ascending=False)

top_features = importance_df.head(15)
print("随机森林特征重要性（前15）:")
print(top_features)

# Visualization: horizontal bars with the most important feature at the top.
plt.figure(figsize=(12, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('重要性')
plt.title('随机森林特征重要性 Top 15')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# 5.5 Correlation-based feature selection
# Absolute pairwise correlations between all features.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle so each pair appears once and the
# diagonal is excluded; everything else becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Collect the highly correlated pairs, walking the matrix column by column.
# NaN entries never pass the > 0.9 comparison.
high_corr = []
for col in upper.columns:
    for idx in upper.index:
        pair_corr = upper.loc[idx, col]
        if pair_corr > 0.9:
            high_corr.append((col, idx, pair_corr))

print("高度相关的特征对（相关系数 > 0.9）:")
strongest_pairs = sorted(high_corr, key=lambda pair: pair[2], reverse=True)[:10]
for f1, f2, corr in strongest_pairs:
    print(f"  {f1} <-> {f2}: {corr:.3f}")

## 6. 特征工程流水线

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Mixed-type demo table: two numeric columns that each contain one missing
# value, two string columns, and a 0/1 target column.
mixed_data = pd.DataFrame(dict(
    age=[25, 30, np.nan, 45, 50],
    income=[50000, 60000, 70000, np.nan, 90000],
    gender=['M', 'F', 'M', 'F', 'M'],
    city=['北京', '上海', '北京', '深圳', '上海'],
    target=[0, 1, 1, 0, 1],
))

print("混合数据:", mixed_data, sep="\n")

In [None]:
# Column groups: which features are numeric and which are categorical.
numeric_features = ['age', 'income']
categorical_features = ['gender', 'city']

# Numeric branch: fill missing values with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a dense array, ignoring levels unseen during fitting.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Combined processor: route each column group to its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Apply the preprocessing to everything except the target column.
X_processed = preprocessor.fit_transform(mixed_data.drop(columns='target'))

# Recover output column names: numeric names are unchanged, categorical
# names come from the fitted one-hot encoder.
fitted_onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = fitted_onehot.get_feature_names_out(categorical_features)
feature_names = [*numeric_features, *cat_feature_names]

X_processed_df = pd.DataFrame(X_processed, columns=feature_names)
print("处理后的数据:")
print(X_processed_df)

In [None]:
# Complete machine learning pipeline: preprocessing followed by a classifier.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Train the model on the demo table (features = every column but the target).
X_train = mixed_data.drop(columns='target')
y_train = mixed_data['target']
full_pipeline.fit(X_train, y_train)

# Accuracy measured on the same rows that were used for training.
train_accuracy = full_pipeline.score(X_train, y_train)
print(f"训练准确率: {train_accuracy:.3f}")

## 7. 练习题

### 练习：对 Titanic 数据进行特征工程

In [None]:
# Load the Titanic dataset through seaborn and preview its first rows.
titanic = sns.load_dataset('titanic')
print(titanic.head())

# Write your code here
# 1. Handle missing values
# 2. Encode the categorical variables
# 3. Create new features (family size, title, etc.)
#    NOTE(review): a title feature needs a passenger-name column; check
#    whether this version of the dataset provides one — TODO confirm.
# 4. Feature selection
# 5. Build a preprocessing pipeline


## 8. 本课小结

1. **数值特征**：标准化、归一化、鲁棒缩放、对数变换、幂变换、分箱
2. **分类特征**：标签编码、独热编码、有序编码、目标编码
3. **特征创建**：多项式特征、交互特征、日期特征、文本特征
4. **特征选择**：统计方法、RFE、基于模型、相关性
5. **Pipeline**：构建可复用的特征工程流水线