# 1.数据预处理

## 1.1缺失值检测

In [1]:
import pandas as pd
import numpy as np

# 构造示例数据
data = {
    "Age": [25, 30, np.nan, 40, 35],
    "Salary": [5000, np.nan, 7000, 8000, 7500],
    "Department": ["IT", "HR", "Finance", None, "IT"]
}
df = pd.DataFrame(data)

# 检查缺失情况
print(df.isnull())
print(df.isnull().sum())

     Age  Salary  Department
0  False   False       False
1  False    True       False
2   True   False       False
3  False   False        True
4  False   False       False
Age           1
Salary        1
Department    1
dtype: int64


### 删除法

In [5]:
# 删除含有缺失值的行
df_drop_row = df.dropna()

# 删除含有缺失值的列
df_drop_col = df.dropna(axis=1)
# 检查缺失情况
print(df_drop_row)
print("--------------------")
print(df_drop_col)

    Age  Salary Department
0  25.0  5000.0         IT
4  35.0  7500.0         IT
--------------------
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


### 简单填充法

In [8]:
# 用均值填充 Age
df['Age'].fillna(df['Age'].mean(), inplace=True)

# 用中位数填充 Salary
df['Salary'].fillna(df['Salary'].median(), inplace=True)

# 用众数填充 Department
df['Department'].fillna(df['Department'].mode()[0], inplace=True)
print(df)

    Age  Salary Department
0  25.0  5000.0         IT
1  30.0  7250.0         HR
2  32.5  7000.0    Finance
3  40.0  8000.0         IT
4  35.0  7500.0         IT


### 前向/后向填充

In [9]:
# 前向填充
df_ffill = df.fillna(method='ffill')

# 后向填充
df_bfill = df.fillna(method='bfill')
print(df)

    Age  Salary Department
0  25.0  5000.0         IT
1  30.0  7250.0         HR
2  32.5  7000.0    Finance
3  40.0  8000.0         IT
4  35.0  7500.0         IT


## 1.2异常值检测

### Z-Score 方法

In [11]:
import numpy as np
import pandas as pd

# 构造数据
data = [10, 12, 11, 13, 12, 200, 11, 10, 12]
df = pd.DataFrame(data, columns=['Value'])

# Z-Score 方法
mean, std = df['Value'].mean(), df['Value'].std()
df['Z-Score'] = (df['Value'] - mean) / std
outliers = df[np.abs(df['Z-Score']) > 2]
print(outliers)

   Value   Z-Score
5    200  2.666335


### IQR 方法（四分位距法）

In [12]:
Q1 = df['Value'].quantile(0.25)
Q3 = df['Value'].quantile(0.75)
IQR = Q3 - Q1

outliers_iqr = df[(df['Value'] < Q1 - 1.5*IQR) | (df['Value'] > Q3 + 1.5*IQR)]
print(outliers_iqr)

   Value   Z-Score
5    200  2.666335


### 基于距离的方法

In [13]:
from sklearn.neighbors import NearestNeighbors

X = df[['Value']].values
nbrs = NearestNeighbors(n_neighbors=2).fit(X)
distances, indices = nbrs.kneighbors(X)
df['KNN_Dist'] = distances[:, 1]
print(df.sort_values(by='KNN_Dist', ascending=False))

   Value   Z-Score  KNN_Dist
5    200  2.666335     187.0
3     13 -0.307450       1.0
0     10 -0.355158       0.0
1     12 -0.323353       0.0
2     11 -0.339255       0.0
4     12 -0.323353       0.0
6     11 -0.339255       0.0
7     10 -0.355158       0.0
8     12 -0.323353       0.0


### 基于密度的方法

In [14]:
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=2)
y_pred = lof.fit_predict(X)
df['LOF'] = y_pred
print(df)

   Value   Z-Score  KNN_Dist  LOF
0     10 -0.355158       0.0    1
1     12 -0.323353       0.0    1
2     11 -0.339255       0.0   -1
3     13 -0.307450       1.0   -1
4     12 -0.323353       0.0    1
5    200  2.666335     187.0   -1
6     11 -0.339255       0.0   -1
7     10 -0.355158       0.0    1
8     12 -0.323353       0.0    1


### 基于模型的方法

In [15]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.1, random_state=42)
y_pred = iso.fit_predict(X)
df['IForest'] = y_pred
print(df)

   Value   Z-Score  KNN_Dist  LOF  IForest
0     10 -0.355158       0.0    1        1
1     12 -0.323353       0.0    1        1
2     11 -0.339255       0.0   -1        1
3     13 -0.307450       1.0   -1        1
4     12 -0.323353       0.0    1        1
5    200  2.666335     187.0   -1       -1
6     11 -0.339255       0.0   -1        1
7     10 -0.355158       0.0    1        1
8     12 -0.323353       0.0    1        1


## 1.3数据标准化与归一化

### 数据标准化

In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 构造数据
data = pd.DataFrame({
    'Height': [160, 170, 180, 190],
    'Weight': [50, 65, 80, 95]
})

scaler = StandardScaler()
standardized = scaler.fit_transform(data)

df_std = pd.DataFrame(standardized, columns=['Height', 'Weight'])
print(df_std)

     Height    Weight
0 -1.341641 -1.341641
1 -0.447214 -0.447214
2  0.447214  0.447214
3  1.341641  1.341641


### 数据归一化

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized = scaler.fit_transform(data)

df_norm = pd.DataFrame(normalized, columns=['Height', 'Weight'])
print(df_norm)

     Height    Weight
0  0.000000  0.000000
1  0.333333  0.333333
2  0.666667  0.666667
3  1.000000  1.000000


###  Robust Scaler（稳健标准化）

In [19]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
robust_scaled = scaler.fit_transform(data)
print(robust_scaled)

[[-1.         -1.        ]
 [-0.33333333 -0.33333333]
 [ 0.33333333  0.33333333]
 [ 1.          1.        ]]


### MaxAbs Scaler

In [20]:
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
scaled = scaler.fit_transform(data)
print(scaled)

[[0.84210526 0.52631579]
 [0.89473684 0.68421053]
 [0.94736842 0.84210526]
 [1.         1.        ]]


# 2.特征工程

## 2.1 特征选择

### 过滤法

In [24]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# 加载数据
iris = load_iris()
X, y = iris.data, iris.target

# 卡方检验选择前2个特征
chi2_selector = SelectKBest(score_func=chi2, k=2)
X_chi2 = chi2_selector.fit_transform(X, y)
print(X_chi2)

# 方差分析选择前2个特征
f_selector = SelectKBest(score_func=f_classif, k=2)
X_f = f_selector.fit_transform(X, y)
# print(X_f)

[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]
 [1.5 0.2]
 [1.6 0.2]
 [1.4 0.1]
 [1.1 0.1]
 [1.2 0.2]
 [1.5 0.4]
 [1.3 0.4]
 [1.4 0.3]
 [1.7 0.3]
 [1.5 0.3]
 [1.7 0.2]
 [1.5 0.4]
 [1.  0.2]
 [1.7 0.5]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.4]
 [1.5 0.2]
 [1.4 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.5 0.4]
 [1.5 0.1]
 [1.4 0.2]
 [1.5 0.2]
 [1.2 0.2]
 [1.3 0.2]
 [1.4 0.1]
 [1.3 0.2]
 [1.5 0.2]
 [1.3 0.3]
 [1.3 0.3]
 [1.3 0.2]
 [1.6 0.6]
 [1.9 0.4]
 [1.4 0.3]
 [1.6 0.2]
 [1.4 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [4.7 1.4]
 [4.5 1.5]
 [4.9 1.5]
 [4.  1.3]
 [4.6 1.5]
 [4.5 1.3]
 [4.7 1.6]
 [3.3 1. ]
 [4.6 1.3]
 [3.9 1.4]
 [3.5 1. ]
 [4.2 1.5]
 [4.  1. ]
 [4.7 1.4]
 [3.6 1.3]
 [4.4 1.4]
 [4.5 1.5]
 [4.1 1. ]
 [4.5 1.5]
 [3.9 1.1]
 [4.8 1.8]
 [4.  1.3]
 [4.9 1.5]
 [4.7 1.2]
 [4.3 1.3]
 [4.4 1.4]
 [4.8 1.4]
 [5.  1.7]
 [4.5 1.5]
 [3.5 1. ]
 [3.8 1.1]
 [3.7 1. ]
 [3.9 1.2]
 [5.1 1.6]
 [4.5 1.5]
 [4.5 1.6]
 [4.7 1.5]
 [4.4 1.3]
 [4.1 1.3]
 [4.  1.3]
 [4.4 1.2]

### 包裹法（Wrapper Method）

In [25]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=200)
rfe = RFE(model, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)
print("选出的特征索引:", rfe.support_)

选出的特征索引: [False False  True  True]


### 嵌入法（Embedded Method）

In [26]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
print("特征系数:", lasso.coef_)

特征系数: [ 0.         -0.          0.40811896  0.        ]


## 2.2特征构造

### 数值特征变换

In [27]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# 构造数据
data = pd.DataFrame({'x1': [1,2,3], 'x2': [4,5,6]})

poly = PolynomialFeatures(degree=2, include_bias=False)
data_poly = poly.fit_transform(data)
print(data_poly)

[[ 1.  4.  1.  4. 16.]
 [ 2.  5.  4. 10. 25.]
 [ 3.  6.  9. 18. 36.]]


### 类别特征编码与构造

In [28]:
import pandas as pd

data = pd.DataFrame({'City': ['Beijing', 'Shanghai', 'Beijing', 'Shenzhen'],
                     'Sales': [200, 150, 300, 100]})

# 目标编码
target_mean = data.groupby('City')['Sales'].mean()
data['City_encoded'] = data['City'].map(target_mean)
print(data)

       City  Sales  City_encoded
0   Beijing    200         250.0
1  Shanghai    150         150.0
2   Beijing    300         250.0
3  Shenzhen    100         100.0


### 时间序列特征构造

In [29]:
import pandas as pd

data = pd.DataFrame({'date': pd.date_range('2025-01-01', periods=4),
                     'sales':[100,150,200,130]})
data['day_of_week'] = data['date'].dt.dayofweek
data['month'] = data['date'].dt.month
print(data)

        date  sales  day_of_week  month
0 2025-01-01    100            2      1
1 2025-01-02    150            3      1
2 2025-01-03    200            4      1
3 2025-01-04    130            5      1


### 文本特征构造

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

texts = ["I love ML", "ML is fun"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts).toarray()
print(X)

[[0 0 1 1]
 [1 1 0 1]]


## 2.3特征降维

### 主成分分析

In [31]:
import numpy as np
from sklearn.decomposition import PCA

# 构造数据
X = np.array([[2.5, 2.4],
              [0.5, 0.7],
              [2.2, 2.9],
              [1.9, 2.2]])

# PCA 降到 1 维
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X)
print(X_pca)

[[-0.75001714]
 [ 1.85685726]
 [-0.91187236]
 [-0.19496777]]


### 线性判别分析

In [33]:
# 用的是 iris 数据集
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

data = load_iris()
X = data.data   # shape (150, 4)
y = data.target # shape (150,)

lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(X, y)
print(X_lda.shape)


(150, 1)


### 独立成分分析

In [34]:
from sklearn.decomposition import FastICA

ica = FastICA(n_components=2)
X_ica = ica.fit_transform(X)
print(X_ica)

[[ 0.11208659  0.04115307]
 [ 0.10428391 -0.04103825]
 [ 0.11178914 -0.0364984 ]
 [ 0.10296023 -0.06453198]
 [ 0.11398582  0.04217582]
 [ 0.10375838  0.11265398]
 [ 0.11008323 -0.02702847]
 [ 0.10696444  0.01561383]
 [ 0.10383623 -0.10812746]
 [ 0.10379013 -0.03040642]
 [ 0.11096866  0.09575963]
 [ 0.10374152 -0.00890266]
 [ 0.10608152 -0.050957  ]
 [ 0.11841144 -0.09852939]
 [ 0.12609317  0.18338843]
 [ 0.11873266  0.21084504]
 [ 0.11858544  0.12263131]
 [ 0.11053091  0.04006074]
 [ 0.10293098  0.13474571]
 [ 0.11179606  0.07366791]
 [ 0.09858321  0.05466947]
 [ 0.10858307  0.06054175]
 [ 0.12978058  0.00810884]
 [ 0.09298465  0.00632544]
 [ 0.09262123 -0.01638566]
 [ 0.09662846 -0.03501583]
 [ 0.10014632  0.01093485]
 [ 0.1081379   0.04966981]
 [ 0.11018736  0.04013031]
 [ 0.10066884 -0.0439814 ]
 [ 0.09876961 -0.04500416]
 [ 0.10288539  0.05747349]
 [ 0.1196374   0.12296513]
 [ 0.12272002  0.1694342 ]
 [ 0.10223445 -0.03149875]
 [ 0.11477013 -0.00097084]
 [ 0.11482566  0.08769171]
 

### 非线性降维方法

In [36]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
print(X_tsne)



[[-10.356388  -20.781525 ]
 [-12.905783  -20.975285 ]
 [-13.004583  -19.894701 ]
 [-13.305361  -20.06607  ]
 [-10.352813  -21.153378 ]
 [ -8.3866005 -21.317993 ]
 [-12.7004595 -19.312492 ]
 [-11.03638   -20.409721 ]
 [-14.076265  -20.1664   ]
 [-12.465     -20.72987  ]
 [ -8.748954  -20.629807 ]
 [-11.705105  -19.823538 ]
 [-13.225353  -20.727112 ]
 [-14.280882  -19.610226 ]
 [ -7.426455  -20.801826 ]
 [ -7.3525176 -21.377144 ]
 [ -8.184059  -21.0027   ]
 [-10.261443  -20.585182 ]
 [ -7.911658  -20.495127 ]
 [ -9.343588  -21.341665 ]
 [ -9.235463  -19.740711 ]
 [ -9.567434  -21.093042 ]
 [-12.894333  -18.521933 ]
 [-10.5527935 -19.341864 ]
 [-11.591852  -19.181208 ]
 [-12.571847  -21.159492 ]
 [-10.777901  -19.772024 ]
 [ -9.835854  -20.423218 ]
 [-10.1718445 -20.34272  ]
 [-12.496177  -19.89243  ]
 [-12.543187  -20.295843 ]
 [ -9.318307  -19.81155  ]
 [ -8.313686  -21.780094 ]
 [ -7.683668  -21.407215 ]
 [-12.389074  -20.597292 ]
 [-11.909472  -21.184679 ]
 [ -8.7242565 -20.001366 ]
 