In [3]:
from datasets import load_dataset
import numpy as np
import pandas as pd
# Hub identifier of the PII-masking corpus used throughout this notebook.
DATASET_NAME = "ai4privacy/pii-masking-300k"

# Load the dataset once. Without `split=`, load_dataset returns a mapping of
# split name -> Dataset, so the train/validation splits are taken from it
# instead of calling load_dataset again for each split.
dataset = load_dataset(DATASET_NAME)


def _is_english(example):
    """Return True when the example's `language` field is exactly "English"."""
    return example["language"] == "English"


def _to_labelled_frame(split):
    """Convert one dataset split to a DataFrame with an `is_sensitive` column.

    `is_sensitive` is 1 when the row's `privacy_mask` is non-empty and 0
    otherwise.
    """
    frame = pd.DataFrame(split)
    frame["is_sensitive"] = frame["privacy_mask"].map(len).gt(0).astype("int64")
    return frame


# Keep only English examples; the validation split serves as the test set here.
train = dataset["train"].filter(_is_english)
test = dataset["validation"].filter(_is_english)

train_df = _to_labelled_frame(train)
test_df = _to_labelled_frame(test)

In [20]:
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# prepare the data: raw text is the model input, the binary flag is the target
X_train_texts = train_df["source_text"]
y_train = train_df["is_sensitive"]

X_test_texts = test_df["source_text"]
y_test = test_df["is_sensitive"]

# vectorize text with TF-IDF (unigrams + bigrams, vocabulary capped at 20k)
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words="english"
)

# Fit the vocabulary on the training texts only; the evaluation texts are
# projected onto that same vocabulary so no statistics leak from them.
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

# train xgboost classifier
# NOTE: `use_label_encoder` is deliberately not passed. XGBoost reports it as
# an unused parameter ("Parameters: { "use_label_encoder" } are not used."),
# so it only produced a warning without affecting training.
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=None,  # None defers to the library's default tree depth
    learning_rate=0.15,
    n_estimators=500,
    subsample=0.8,  # sample a random 80% of rows per tree to curb overfitting
    colsample_bytree=0.8,  # sample a random 80% of columns per tree to reduce feature correlation
    random_state=42
)

xgb_clf.fit(X_train, y_train)

# evaluate
y_pred = xgb_clf.predict(X_test)

# NOTE(review): in the recorded run the classes are imbalanced (908 negatives
# vs 7038 positives), so accuracy is dominated by class 1; read the per-class
# rows of the report rather than the headline accuracy.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=3))


# Rank vocabulary entries by the fitted model's feature importances.
# The vocabulary contains bigrams as well as single words (ngram_range above).
TOP_K = 20  # number of features to list; also used in the heading below
feature_names = vectorizer.get_feature_names_out()
importances = xgb_clf.feature_importances_
indices = np.argsort(importances)[::-1][:TOP_K]

print(f"\nTop {TOP_K} Important Words for Sensitive Detection:")
for i in indices:
    print(f"{feature_names[i]}: {importances[i]:.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9181978353888749

Classification Report:
              precision    recall  f1-score   support

           0      0.687     0.522     0.593       908
           1      0.940     0.969     0.955      7038

    accuracy                          0.918      7946
   macro avg      0.814     0.746     0.774      7946
weighted avg      0.911     0.918     0.913      7946


Top 20 Important Words for Sensitive Detection:
com: 0.0154
city: 0.0130
number: 0.0098
passport: 0.0086
road: 0.0075
username: 0.0073
country: 0.0071
ip: 0.0068
eng: 0.0066
street: 0.0061
sex: 0.0054
11: 0.0051
license: 0.0044
field: 0.0041
hesitate: 0.0039
clock: 0.0039
id: 0.0037
07: 0.0037
postcode: 0.0036
employees: 0.0033
