In [2]:
# 导入必要的库
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.svm import SVC

# Load the labelled table from disk
df = pd.read_csv('result_resolve.csv')

# Split rows by label: ISPOTIENTIAL == 1 feeds the positive set (data_P),
# ISPOTIENTIAL == -1 feeds the unlabeled set (data_U).
# known_labels_ratio is the fraction of label-1 rows kept (1 keeps all of them,
# in shuffled order because .sample() is used).
known_labels_ratio = 1
label_1_data = df.loc[df['ISPOTIENTIAL'] == 1].sample(frac=known_labels_ratio, random_state=42)
label_minus_1_data = df.loc[df['ISPOTIENTIAL'] == -1]

# Columns removed before the rows are turned into feature matrices
non_feature_columns = ['ID', 'ISPOTIENTIAL', 'DATADATE', 'FZHZM', 'DZHZM']
data_P = label_1_data.drop(columns=non_feature_columns).to_numpy()
data_U = label_minus_1_data.drop(columns=non_feature_columns).to_numpy()

# Row counts of the two matrices
NP, NU = len(data_P), len(data_U)
print(NP, NU)


17176 1031399


In [None]:
# Bagging PU-learning: in every round a classifier is trained on all positive
# rows versus a bootstrap draw of unlabeled rows, and the unlabeled rows that
# were NOT drawn (out-of-bag, OOB) are scored with it.
T = 20   # number of bagging rounds
K = NP   # unlabeled rows drawn per round (same count as the positive rows)

# Fixed seed so the bootstrap draws and the SVC probability fit are repeatable
# after a kernel restart (the data-loading cell already uses random_state=42).
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# Per unlabeled row: number of rounds it was OOB, and the sum of its class-1 scores
n_oob = np.zeros(shape=(NU,))
f_oob = np.zeros(shape=(NU,))

for i in range(T):
    print(f"第{i}次迭代")
    # Bootstrap draw (with replacement) of K unlabeled row indices
    bootstrap_sample = rng.choice(np.arange(NU), replace=True, size=K)
    # Training matrix: positive rows first, then the drawn unlabeled rows
    data_bootstrap = np.concatenate((data_P, data_U[bootstrap_sample, :]), axis=0)

    # Training labels: 1 for the first NP (positive) rows, 0 for the drawn rows
    train_label_bootstrap = np.zeros(shape=(K + NP,))
    train_label_bootstrap[:NP] = 1.0

    # Train an SVM with probability outputs (this is an SVC, not a random forest)
    model = SVC(probability=True, class_weight='balanced', gamma='auto',
                random_state=RANDOM_SEED)
    model.fit(data_bootstrap, train_label_bootstrap)

    # OOB rows = unlabeled rows absent from this draw. A boolean mask yields the
    # same ascending indices as a set difference without building Python sets
    # over all NU integers in every round.
    oob_mask = np.ones(NU, dtype=bool)
    oob_mask[bootstrap_sample] = False
    idx_oob = np.flatnonzero(oob_mask)

    # Accumulate the class-1 probability of each OOB row
    f_oob[idx_oob] += model.predict_proba(data_U[idx_oob])[:, 1]
    n_oob[idx_oob] += 1

# Average score per unlabeled row. Rows that were never OOB have no score and
# are set to NaN explicitly instead of relying on a 0/0 division.
predict_proba = np.divide(f_oob, n_oob, out=np.full_like(f_oob, np.nan), where=n_oob > 0)

In [None]:
# Scatter the averaged OOB score of every unlabeled row against its ID.
fig = plt.figure(figsize=(6, 4))
ax1 = fig.add_subplot(1, 1, 1)
# `c=predict_proba` is required: without colour data matplotlib ignores `cmap`,
# and the colour bar below would not correspond to the plotted points.
sp = ax1.scatter(df.loc[df['ISPOTIENTIAL'] == -1, 'ID'], predict_proba,
                 c=predict_proba,
                 linewidth=0, s=5, alpha=0.5, cmap=plt.cm.plasma, label='unlabeled')
ax1.set_xlabel('ID')
ax1.set_ylabel('Class probability')
ax1.grid()
fig.colorbar(sp, ax=ax1, label='Class probability on Unlabeled set')

plt.show()

In [None]:
# Attach the scores to the full table as a new 'Predict_Proba' column.
# This cell only changes df in memory; nothing is written to a file here.
is_positive = df['ISPOTIENTIAL'] == 1
is_unlabeled = df['ISPOTIENTIAL'] == -1

df['Predict_Proba'] = np.nan                            # rows matching neither label stay NaN
df.loc[is_positive, 'Predict_Proba'] = 1.0              # label-1 rows get a fixed 1.0
df.loc[is_unlabeled, 'Predict_Proba'] = predict_proba   # label -1 rows get their averaged OOB score

In [None]:
# Summarise the distribution of Predict_Proba over the whole table.
# A separate name is used on purpose: rebinding `predict_proba` to this
# full-table Series would overwrite the per-unlabeled-row score array that the
# plotting and column-assignment cells read, breaking them on re-run.
proba_all = df['Predict_Proba']

# Bin edges -0.1, 0.0, 0.1, ..., 1.1 (12 bins of width 0.1). They are built as
# integers / 10 because np.arange with a 0.1 float step accumulates rounding
# error, so some edges would not sit exactly on the intended decimals.
# NaN scores are dropped explicitly; they belong to no bin.
hist, bin_edges = np.histogram(proba_all.dropna(), bins=np.arange(-1, 12) / 10.0)

# Number of scores exactly equal to 0
count_below_threshold_zero = np.sum(proba_all == 0)
# Number of scores below 0.1, excluding exact zeros
count_below_threshold_nonzero = np.sum((proba_all < 0.1) & (proba_all != 0))
# Number of scores strictly above 0.1 (a score of exactly 0.1 is in neither count)
count_below_threshold_non = np.sum(proba_all > 0.1)

# Report the three counts
print(f"等于0的数量：{count_below_threshold_zero}")
print(f"小于0.1并且不等于0的数量：{count_below_threshold_nonzero}")
print(f"大于0.1的数量：{count_below_threshold_non}")
# Report the count of every histogram bin
for lower, upper, count in zip(bin_edges[:-1], bin_edges[1:], hist):
    print(f"区间 {lower:.1f} 到 {upper:.1f} 的数量：{count}")
