Utilizing the Spy Technique in PU Learning to Obtain Reliable Negative Samples

In [None]:
import pandas as pd

# Positive examples (P).
df_pos = pd.read_csv('./data/pos_data.csv')
# Unlabeled examples (U). They are labelled 0 below only to train the spy
# classifier; they are not confirmed negatives.
df_neg = pd.read_csv('./data/U-data.gz')
# Sample 15% of the positive examples as the Spy (S) set.
df_spy = df_pos.sample(int(len(df_pos)*0.15), random_state=42)
df_pos_ = df_pos.drop(df_spy.index)
print(len(df_spy), len(df_pos_))


def _fasttext_line(label, text):
    """Return one fastText training line: ``__label__<label> <lower-cased text>``.

    fastText reads one example per line, so embedded newlines are replaced
    with spaces; otherwise the tail of a multi-line text would end up on a
    line of its own without any label.
    """
    return f'__label__{label} {text.lower()}'.replace('\n', ' ')


# Use P-S as the positive class and U+S as the negative class to construct the
# training data for fastText.
data = [_fasttext_line(1, text) for text in df_pos_['content']]
data += [_fasttext_line(0, text) for text in df_spy['content']]
data += [_fasttext_line(0, text) for text in df_neg['content']]
with open(r'./data/ftt-data.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(data))

Training a classification model using fastText

In [29]:
import fasttext
# Hyperparameters for the spy classifier, forwarded unchanged to fastText.
train_options = dict(
    label='__label__',
    wordNgrams=2,
    minCount=3,
    epoch=10,
    dim=300,
    thread=30,
)
# Train on the labelled text file, then persist the model to disk.
model = fasttext.train_supervised(r'./data/ftt-data.txt', **train_options)
model.save_model("./data/ftt-clf.model")

In [None]:
from fasttext import load_model

def _positive_probs(prediction):
    """Convert a fastText ``predict`` result into P(positive) per sample.

    ``prediction`` is the ``(labels, probabilities)`` pair returned by
    ``model.predict`` for a list of texts. Only the top label of each sample
    is used: when it is ``__label__0`` the positive probability is taken as
    ``1 - p``, otherwise it is ``p`` itself.
    """
    return [
        1 - score[0] if label[0] == '__label__0' else score[0]
        for label, score in zip(prediction[0], prediction[1])
    ]


# Load the trained fastText model and score the Spy set. The 1st percentile of
# the spies' positive probabilities becomes the threshold.
# NOTE: df_spy and df_neg are expected to exist already (they are not defined
# in this cell).
model = load_model("./data/ftt-clf.model")
contents = [text.lower() for text in df_spy['content']]
# fastText's predict() raises ValueError for any text containing '\n', so
# newlines are flattened for the call only; `contents` keeps the original text.
pred = model.predict([text.replace('\n', ' ') for text in contents])
probs = _positive_probs(pred)
n = int(len(contents)*0.01)
t = sorted(probs)[n]

# Samples from U whose predicted positive probability is below the threshold
# are considered reliable negative samples.
contents_U = [text.lower() for text in df_neg['content']]
pred_U = model.predict([text.replace('\n', ' ') for text in contents_U])
lis = [[c, p] for c, p in zip(contents_U, _positive_probs(pred_U)) if p < t]
print(len(lis))
# Reload the complete positive set from disk and lower-case its text.
df_pos = pd.read_csv('./data/pos_data.csv')
pos = [text.lower() for text in df_pos['content']]

# Order the candidate negatives by ascending positive probability and keep at
# most as many as there are positives.
lis = sorted(lis, key=lambda pair: pair[1])
neg = [text for text, _score in lis[:len(pos)]]
neg_df = pd.DataFrame({'content': neg})
print(len(neg))
neg_df.to_csv('./data/neg_data.csv', index=False)

# Build the classification dataset: positives are tagged 1, selected negatives 0.
contents = [*pos, *neg]
ids = list(range(len(contents)))
tags = [1] * len(df_pos) + [0] * len(neg)
df = pd.DataFrame({'id': ids, 'tag': tags, 'content': contents})

# Validation and test each take 10% of the full dataset; the rest is training.
n_valid = n_test = int(len(df) * 0.1)
valid_df = df.sample(n=n_valid, random_state=42)
df = df.drop(index=valid_df.index)
test_df = df.sample(n=n_test, random_state=42)
train_df = df.drop(index=test_df.index)

# Report the (positive, negative) counts for valid, test and train, in that order.
for split in (valid_df, test_df, train_df):
    print(int((split['tag'] == 1).sum()), int((split['tag'] == 0).sum()))

# Stack the splits and record which split every row belongs to.
parts = {'train': train_df, 'valid': valid_df, 'test': test_df}
df_ = pd.concat(list(parts.values()))
df_['type'] = [name for name, part in parts.items() for _ in range(len(part))]
df_.to_csv('./data/clf-data.csv', index=False)