# 读取数据

In [1]:
import pandas as pd

# Load the source CSV into `df`; the bare name on the last line renders the frame.
csv_path = 'data/pure_MS2_pearson.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,CR,Lms,SSA,Dap,Rct,PW,CD,Cs
0,1.39,2.56,64.8,7.5,0.5,0.5,1.0,431.82
1,1.39,2.56,64.8,7.5,0.5,0.5,2.0,398.88
2,1.39,2.56,64.8,7.5,0.5,0.5,3.0,312.18
3,1.39,2.56,64.8,7.5,0.5,0.5,5.0,272.00
4,1.39,2.56,64.8,7.5,0.5,0.5,7.0,230.70
...,...,...,...,...,...,...,...,...
204,1.54,2.41,21.1,,3.1,1.0,1.0,129.20
205,1.54,2.41,21.1,,3.1,1.0,1.5,115.10
206,1.54,2.41,21.1,,3.1,1.0,3.0,102.30
207,1.54,2.41,21.1,,3.1,1.0,5.0,92.00


# 检查缺失值

In [2]:
# Count the missing values in each column (`isna` is the canonical name of `isnull`).
df.isna().sum()

CR       0
Lms      0
SSA      5
Dap    132
Rct     54
PW       0
CD       0
Cs       0
dtype: int64

Dap 列在 208 行中有 132 行缺失（约 63%），缺失比例过高，难以可靠插补，因此删除该列。

In [3]:
# Remove the 'Dap' column, which has far more missing values than any other column.
# `df` is rebound from itself here, so a plain drop would raise KeyError when this
# cell is re-run (the column is already gone). errors='ignore' keeps the cell idempotent.
df = df.drop(columns='Dap', errors='ignore')
df

Unnamed: 0,CR,Lms,SSA,Rct,PW,CD,Cs
0,1.39,2.56,64.8,0.5,0.5,1.0,431.82
1,1.39,2.56,64.8,0.5,0.5,2.0,398.88
2,1.39,2.56,64.8,0.5,0.5,3.0,312.18
3,1.39,2.56,64.8,0.5,0.5,5.0,272.00
4,1.39,2.56,64.8,0.5,0.5,7.0,230.70
...,...,...,...,...,...,...,...
204,1.54,2.41,21.1,3.1,1.0,1.0,129.20
205,1.54,2.41,21.1,3.1,1.0,1.5,115.10
206,1.54,2.41,21.1,3.1,1.0,3.0,102.30
207,1.54,2.41,21.1,3.1,1.0,5.0,92.00


# 使用特殊值填充缺失值

In [4]:
# Sentinel strategy: every remaining NaN becomes -1 in a new frame; `df` is not modified.
df_fill_f1 = df.fillna(value=-1)
df_fill_f1

Unnamed: 0,CR,Lms,SSA,Rct,PW,CD,Cs
0,1.39,2.56,64.8,0.5,0.5,1.0,431.82
1,1.39,2.56,64.8,0.5,0.5,2.0,398.88
2,1.39,2.56,64.8,0.5,0.5,3.0,312.18
3,1.39,2.56,64.8,0.5,0.5,5.0,272.00
4,1.39,2.56,64.8,0.5,0.5,7.0,230.70
...,...,...,...,...,...,...,...
204,1.54,2.41,21.1,3.1,1.0,1.0,129.20
205,1.54,2.41,21.1,3.1,1.0,1.5,115.10
206,1.54,2.41,21.1,3.1,1.0,3.0,102.30
207,1.54,2.41,21.1,3.1,1.0,5.0,92.00


# 使用KNN填充缺失值

In [5]:
from sklearn.impute import KNNImputer

# Split the frame into features (every column except 'Cs') and the target 'Cs',
# so that the target is never used or altered by the imputer.
X = df.drop(columns='Cs')
y = df['Cs']

# Fill each missing feature value from the 2 nearest rows.
# NOTE(review): the features are passed in unscaled, so columns with large
# magnitudes may dominate the neighbour distance — consider scaling first.
imputer = KNNImputer(n_neighbors=2)
X_imputed_knn = imputer.fit_transform(X)

# fit_transform returns a bare array; rebuild the DataFrame with the original
# column names AND the original index. Reusing X.index matters because the
# `['Cs'] = y` assignment aligns on index labels: with a fresh RangeIndex the
# target would be misaligned (or become NaN) whenever `df` has a non-default index.
df_fill_knn = pd.DataFrame(X_imputed_knn, columns=X.columns, index=X.index)
df_fill_knn['Cs'] = y
df_fill_knn

Unnamed: 0,CR,Lms,SSA,Rct,PW,CD,Cs
0,1.39,2.56,64.8,0.5,0.5,1.0,431.82
1,1.39,2.56,64.8,0.5,0.5,2.0,398.88
2,1.39,2.56,64.8,0.5,0.5,3.0,312.18
3,1.39,2.56,64.8,0.5,0.5,5.0,272.00
4,1.39,2.56,64.8,0.5,0.5,7.0,230.70
...,...,...,...,...,...,...,...
204,1.54,2.41,21.1,3.1,1.0,1.0,129.20
205,1.54,2.41,21.1,3.1,1.0,1.5,115.10
206,1.54,2.41,21.1,3.1,1.0,3.0,102.30
207,1.54,2.41,21.1,3.1,1.0,5.0,92.00


# 使用迭代插补（IterativeImputer）填充缺失值

In [6]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Iterative imputation of the feature matrix `X` (built in the KNN cell).
# The estimator is created without `sample_posterior`, so this yields one
# deterministic imputed dataset rather than several draws; random_state pins it.
iterative_imputer = IterativeImputer(max_iter=10, random_state=21)
X_imputed_iterative = iterative_imputer.fit_transform(X)

# Rebuild the DataFrame with the original column names and index. Keeping
# X.index guarantees that the label-aligned `['Cs'] = y` assignment attaches
# each target value to its own row even if `df` has a non-default index.
df_fill_iterative = pd.DataFrame(
    X_imputed_iterative, columns=X.columns, index=X.index
)
df_fill_iterative['Cs'] = y
df_fill_iterative

Unnamed: 0,CR,Lms,SSA,Rct,PW,CD,Cs
0,1.39,2.56,64.8,0.5,0.5,1.0,431.82
1,1.39,2.56,64.8,0.5,0.5,2.0,398.88
2,1.39,2.56,64.8,0.5,0.5,3.0,312.18
3,1.39,2.56,64.8,0.5,0.5,5.0,272.00
4,1.39,2.56,64.8,0.5,0.5,7.0,230.70
...,...,...,...,...,...,...,...
204,1.54,2.41,21.1,3.1,1.0,1.0,129.20
205,1.54,2.41,21.1,3.1,1.0,1.5,115.10
206,1.54,2.41,21.1,3.1,1.0,3.0,102.30
207,1.54,2.41,21.1,3.1,1.0,5.0,92.00


In [7]:
# No-imputation variant: discard the 'Rct' column, then discard every row
# that still has a missing value in any remaining column.
df_nofill = df.drop(columns='Rct').dropna()
df_nofill

Unnamed: 0,CR,Lms,SSA,PW,CD,Cs
0,1.39,2.56,64.8,0.5,1.0,431.82
1,1.39,2.56,64.8,0.5,2.0,398.88
2,1.39,2.56,64.8,0.5,3.0,312.18
3,1.39,2.56,64.8,0.5,5.0,272.00
4,1.39,2.56,64.8,0.5,7.0,230.70
...,...,...,...,...,...,...
204,1.54,2.41,21.1,1.0,1.0,129.20
205,1.54,2.41,21.1,1.0,1.5,115.10
206,1.54,2.41,21.1,1.0,3.0,102.30
207,1.54,2.41,21.1,1.0,5.0,92.00


In [8]:
# Write each prepared variant to its own CSV file, omitting the index column.
for frame, out_path in (
    (df_fill_f1, 'data/fill_f1.csv'),
    (df_fill_knn, 'data/fill_knn.csv'),
    (df_fill_iterative, 'data/fill_iterative.csv'),
    (df_nofill, 'data/nofill.csv'),
):
    frame.to_csv(out_path, index=False)