In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
import os
import missingno as msno #缺失值可视化 https://github.com/ResidentMario/missingno

# Fix the random seed so that results are reproducible.
# RANDOM_STATE seeds NumPy's global RNG here and is also passed as
# `random_state` to the IterativeImputer in the imputation cell.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# 1.数据查看

In [None]:
# Load the raw dataset.
data = pd.read_csv("D:/vscode_work/mechine learning/IHC_data.csv")

# Frequency distribution of each categorical variable.
# dropna=False makes missing values show up as their own level.
categorical_features = ['outcome', 'sex', 'nation', 'infection', 'infected_coition', 'biopsy',
                        'syringe', 'disease', 'previous_treatment', 'treatment_history',
                        'treatment', 'nuc', 'IFN', 'A', 'FL', 'FL_grading', 'A_24w', 'FL_24w',
                        'FL_grade_24w']
print("\n分类变量的频数分布:")
for var in categorical_features:
    print(f"\n{var} 的频数分布:")
    print(data[var].value_counts(dropna=False))

# Optional: box plots of the continuous variables (disabled).
# The plt.figure(...) call and plt.savefig(...) must be run together.
# continuous_variables = [col for col in data.columns if col not in categorical_features]
# print(len(continuous_variables))  # number of continuous variables
# plt.figure(figsize=(110, 200))
# for i, var in enumerate(continuous_variables, 1):
#     plt.subplot(11, 10, i)
#     sns.boxplot(y=data[var].dropna())
#     plt.title(var)
# plt.tight_layout()
# plt.savefig('D:/vscode_work/mechine learning/连续变量箱线图.png', dpi=300, bbox_inches='tight')

# Percentage of missing values per column.
print("\n查看数据缺失情况")
missing_percentage = (data.isnull().sum() / len(data)) * 100
# to_string() must be *called*; passing the bound method would only print
# its repr. Concatenating (instead of a second print argument) keeps the
# first row aligned with the rest.
print("\n特征缺失百分比\n" + missing_percentage.to_string())

# Missing-value matrix (missingno draws on its own figure).
msno.matrix(data, labels=True, label_rotation=90)
plt.show()

# Heat map of the boolean missing-value mask.
plt.figure(figsize=(15, 10))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing-value heatmap', fontsize=16)
plt.show()

# 2.处理异常值

In [None]:
# Categorical columns; every other column is treated as continuous.
# 'infected_person' is listed here (as in the Tukey-clipping cell) so that
# this categorical code is not run through the IQR outlier filter.
categorical_features = ['outcome', 'sex', 'nation', 'infection', 'infected_coition', 'infected_person',
                        'biopsy', 'syringe', 'disease', 'previous_treatment', 'treatment_history',
                        'treatment', 'nuc', 'IFN', 'A', 'FL', 'FL_grading', 'A_24w', 'FL_24w',
                        'FL_grade_24w']
continuous_variables = [col for col in data.columns if col not in categorical_features]

# Quantiles are only defined for numeric columns; computed once, outside the loop.
numeric_columns = set(data.select_dtypes(include=[np.number]).columns)

# Report outliers per variable (Tukey fences: 1.5 * IQR beyond Q1 / Q3)
# and turn them into missing values.
for var in continuous_variables:
    if var not in numeric_columns:
        continue  # skip text/object columns instead of raising in quantile()

    q1 = data[var].quantile(0.25)
    q3 = data[var].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    is_outlier = (data[var] < lower_bound) | (data[var] > upper_bound)
    # Show only the current column to keep the output short.
    outliers = data.loc[is_outlier, [var]]

    print(f"\n{var} 的异常值数：{outliers.shape[0]}")
    if not outliers.empty:
        print(outliers)

    # Replace the outliers with NaN so they are handled as missing data later.
    data.loc[is_outlier, var] = np.nan

# Summary of the data after the outliers were blanked out.
print("\n删除异常值后的数据摘要：")
print(data.describe())
data.info()  # info() prints by itself and returns None, so it is not wrapped in print()

# Missing-value percentage per column after outlier removal.
print("\n删除异常值后的变量缺失情况：")
missing_percentage = (data.isnull().sum() / len(data)) * 100
print(missing_percentage.to_string())

# Drop columns whose missing rate exceeds the threshold (in percent).
missing_threshold_pct = 30
columns_to_drop = missing_percentage[missing_percentage > missing_threshold_pct].index
print(f"\n将要删除的列：{list(columns_to_drop)}")

data = data.drop(columns=columns_to_drop)

print(f"\n删除后剩余的列数：{data.shape[1]}")
data.to_csv("D:/vscode_work/mechine learning/IHC.csv", index=False)

In [18]:
# Reload the raw data so this cell does not depend on earlier in-place edits.
data = pd.read_csv("D:/vscode_work/mechine learning/IHC_data.csv")

# --- Step 1: split columns into categorical and continuous ---
# Categorical columns (adjust to the actual column names).
categorical_features = [
    'outcome', 'sex', 'nation', 'infection', 'infected_coition', 'infected_person', 'biopsy',
    'syringe', 'disease', 'previous_treatment', 'treatment_history',
    'treatment', 'nuc', 'IFN', 'A', 'FL', 'FL_grading', 'A_24w', 'FL_24w',
    'FL_grade_24w'
]

# Continuous variables = all columns minus the categorical ones.
continuous_variables = [col for col in data.columns if col not in categorical_features]

# Numeric columns are determined once; clipping never changes a column's
# numeric-ness, so the set stays valid for the whole loop.
numeric_columns = set(data.select_dtypes(include=[np.number]).columns)

# --- Step 2: detect outliers with Tukey's fences and clip them ---
for var in continuous_variables:
    # Skip non-numeric columns; quantiles are undefined for them.
    if var not in numeric_columns:
        continue

    # Quartiles and inter-quartile range.
    q1 = data[var].quantile(0.25)
    q3 = data[var].quantile(0.75)
    iqr = q3 - q1

    # Tukey fences (1.5 * IQR beyond the quartiles).
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Boolean mask of values outside the fences; NaN compares False on both sides.
    is_outlier = (data[var] < lower_bound) | (data[var] > upper_bound)
    num_outliers = is_outlier.sum()

    # Only the count is printed to keep the output short.
    print(f"{var}: 异常值数量 = {num_outliers}")

    # Clip outliers to the nearest fence rather than turning them into NaN.
    data[var] = data[var].clip(lower=lower_bound, upper=upper_bound)

# Snapshot after clipping, before any column is dropped.
data.to_csv("D:/vscode_work/mechine learning/IHC_Tukey.csv", index=False)

# --- Step 3: drop columns with too many missing values ---
# Missing rate per column, in percent.
missing_percentage = (data.isnull().sum() / len(data)) * 100

# A single threshold drives both the filter and the printed message so the
# two can no longer disagree.
missing_threshold_pct = 35
columns_to_drop = missing_percentage[missing_percentage > missing_threshold_pct].index.tolist()

if columns_to_drop:
    print(f"\n删除缺失率超过{missing_threshold_pct}%的列: {columns_to_drop}")
    data = data.drop(columns=columns_to_drop)
else:
    print("\n没有列需要删除")

# --- Step 4: report and save the cleaned data ---
print("\n处理后数据摘要:")
data.info()  # info() prints by itself and returns None, so it is not wrapped in print()

# Written to a new file so the raw data is never overwritten.
output_path = "D:/vscode_work/mechine learning/IHC_cleaned.csv"
data.to_csv(output_path, index=False)
print(f"\n数据已保存至: {output_path}")

age: 异常值数量 = 0
height: 异常值数量 = 0
weight: 异常值数量 = 4
SBP: 异常值数量 = 33
DBP: 异常值数量 = 20
medical_history: 异常值数量 = 14
D: 异常值数量 = 119
HBsAg: 异常值数量 = 8
HBsAb: 异常值数量 = 87
HBeAg: 异常值数量 = 7
HBeAb: 异常值数量 = 138
HBcAb: 异常值数量 = 152
ALT: 异常值数量 = 0
AST: 异常值数量 = 23
TBIL: 异常值数量 = 31
TP: 异常值数量 = 12
ALB: 异常值数量 = 21
BUN: 异常值数量 = 12
Cr: 异常值数量 = 6
GLU: 异常值数量 = 21
DBIL: 异常值数量 = 30
HB: 异常值数量 = 21
WBC: 异常值数量 = 18
ANC: 异常值数量 = 21
PLT: 异常值数量 = 13
ANC_per: 异常值数量 = 43
T3: 异常值数量 = 2
T4: 异常值数量 = 0
FT3: 异常值数量 = 30
FT4: 异常值数量 = 47
TSH: 异常值数量 = 17
TPOAb: 异常值数量 = 88
TGAb: 异常值数量 = 74
TRAb: 异常值数量 = 18
AFP: 异常值数量 = 32
E: 异常值数量 = 9
CAP: 异常值数量 = 7
D_12w: 异常值数量 = 94
HBsAg_12w: 异常值数量 = 69
HBsAb_12w: 异常值数量 = 85
HBeAg_12w: 异常值数量 = 14
HBeAb_12w: 异常值数量 = 91
HBcAb_12w: 异常值数量 = 102
ALT_12w: 异常值数量 = 37
AST_12w: 异常值数量 = 32
TBIL_12w: 异常值数量 = 23
TP_12w: 异常值数量 = 10
ALB_12w: 异常值数量 = 11
BUN_12w: 异常值数量 = 4
Cr_12w: 异常值数量 = 2
GLU_12w: 异常值数量 = 18
DBIL_12w: 异常值数量 = 20
HB_12w: 异常值数量 = 4
WBC_12w: 异常值数量 = 35
ANC_12w: 异常值数量 = 33
PLT_12w: 异常值数量 = 22
AN

# 3.正态性检验

In [None]:
from scipy import stats

# Normality tests run on the file written by the outliers-to-NaN cell.
data = pd.read_csv("D:/vscode_work/mechine learning/IHC.csv")

# Categorical columns, redefined for this file; all remaining columns are
# tested as continuous variables.
categorical_features = ['outcome', 'sex', 'nation', 'infection', 'infected_person', 'biopsy',
                        'syringe', 'disease', 'treatment', 'IFN']
continuous_variables = [name for name in data.columns if name not in categorical_features]

# One record per continuous column: p-values from the Shapiro-Wilk and
# D'Agostino tests plus a Yes/No verdict at the 0.05 level.
results = []
for col in continuous_variables:
    observed = data[col].dropna()  # both tests are fed the non-missing values only

    _, shapiro_p = stats.shapiro(observed)
    _, dagostino_p = stats.normaltest(observed)

    record = {"Column": col}
    record["Shapiro-Wilk p"] = round(shapiro_p, 4)
    record["D'Agostino p"] = round(dagostino_p, 4)
    record["Normality (Shapiro)"] = "Yes" if shapiro_p > 0.05 else "No"
    record["Normality (D'Agostino)"] = "Yes" if dagostino_p > 0.05 else "No"
    results.append(record)

results_df = pd.DataFrame(results)
print(results_df)

# 4.缺失值插补

In [19]:
# Impute missing values in the cleaned (Tukey-clipped) dataset.
data = pd.read_csv("D:/vscode_work/mechine learning/IHC_cleaned.csv")

# Categorical columns, redefined for this file; all remaining columns are
# imputed as continuous variables.
categorical_features = ['outcome', 'sex', 'nation', 'infection', 'infected_person',
                        'treatment', 'IFN']
continuous_variables = [col for col in data.columns if col not in categorical_features]

# Continuous variables: iterative (chained-equation) imputation.
# min_value=0 keeps imputed values non-negative; sample_posterior=True draws
# from the predictive posterior, so RANDOM_STATE is needed for reproducibility.
continuous_imputer = IterativeImputer(random_state=RANDOM_STATE, max_iter=10, min_value=0, sample_posterior=True)
continuous_data_imputed = pd.DataFrame(
    continuous_imputer.fit_transform(data[continuous_variables]),
    columns=continuous_variables,
    index=data.index,  # keep the original row labels so the concat below aligns
)

# Categorical variables: fill with the most frequent level.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_data_imputed = pd.DataFrame(
    categorical_imputer.fit_transform(data[categorical_features]),
    columns=categorical_features,
    index=data.index,
)

# Recombine (continuous columns first, then categorical).
completed_data = pd.concat([continuous_data_imputed, categorical_data_imputed], axis=1)

# Check for remaining missing values. A bare expression in the middle of a
# cell is not displayed, so the counts are printed explicitly.
print(completed_data.isnull().sum())

# Integer-valued variables must stay integer-valued after imputation.
integer_columns = ['outcome', 'sex', 'nation', 'height', 'weight', 'infection', 'infected_person', 'treatment', 'IFN', 'HB', 'PLT',
                   'HB_12w', 'PLT_12w', 'HB_24w', 'PLT_24w']
# Only round columns that are still present; some may have been dropped
# upstream for having too many missing values.
integer_columns = [col for col in integer_columns if col in completed_data.columns]

completed_data[integer_columns] = completed_data[integer_columns].round()

# Save the imputed dataset.
completed_data.to_csv("D:/vscode_work/mechine learning/IHC_im.csv", index=False)

In [None]:
# Reload the imputed dataset and confirm that no missing values remain.
data = pd.read_csv("D:/vscode_work/mechine learning/IHC_im.csv")
# A bare expression that is not the last line of a cell is not displayed,
# so the per-column missing counts are printed explicitly.
print(data.isnull().sum())
# Heat map of the missing-value mask; a uniform colour means nothing is missing.
sns.heatmap(data.isna(), cbar=False, cmap='viridis')
plt.show()