In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd

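# # NOTE: everything commented out below is NOT executed in this notebook; the
# # DataFrames are read from pickle files at the end of this cell instead. The
# # block appears to be kept as a record of how the English-only subset and the
# # `is_sensitive` labels were derived from the raw dataset.
# # load dataset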
# # dataset = load_dataset("ai4privacy/pii-masking-300k")
# train = load_dataset("ai4privacy/pii-masking-300k", split = "train")
# test = load_dataset("ai4privacy/pii-masking-300k", split = "validation")

# # only English
# train = train.filter(lambda x: x["language"] == "English")
# test = test.filter(lambda x: x["language"] == "English")

# # convert to pandas dataframe
# train_df = pd.DataFrame(train)
# test_df = pd.DataFrame(test)

# # create binary labels where 1 = sensitive data and 0 = not sensitive
# train_df["is_sensitive"] = train_df["privacy_mask"].apply(lambda x: 1 if len(x) > 0 else 0)
# test_df["is_sensitive"] = test_df["privacy_mask"].apply(lambda x: 1 if len(x) > 0 else 0)

from google.colab import drive
drive.mount('/content/drive')

# Both cached DataFrames live in the same Drive folder; keep that folder in a
# single constant so the location only has to be edited in one place.
DATA_DIR = "/content/drive/My Drive/CMPE 257/CMPE 257 Colab/257 Sensitive Data Input Guardrail"

# SECURITY: unpickling can execute arbitrary code embedded in the file. Only
# load pickles you created yourself; never point this at an untrusted file.
train_df = pd.read_pickle(DATA_DIR + "/train_df_embedding.pkl")
test_df = pd.read_pickle(DATA_DIR + "/test_df_embedding.pkl")

# Fail right here, with a readable message, if a cached frame lacks a column
# that the modelling cells below index (source_text, embedding, is_sensitive).
for _split_name, _split_df in (("train_df", train_df), ("test_df", test_df)):
    _missing = {"source_text", "embedding", "is_sensitive"} - set(_split_df.columns)
    assert not _missing, f"{_split_name} is missing expected columns: {sorted(_missing)}"

Mounted at /content/drive


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline: TF-IDF features + class-weighted logistic regression.

# Targets are the binary is_sensitive column of each split.
y_train = train_df["is_sensitive"]
y_test = test_df["is_sensitive"]

# TF-IDF over unigrams and bigrams of source_text, English stop words dropped,
# vocabulary capped at 2300 terms. The vocabulary is fitted on the training
# split only and then re-used unchanged to encode the test split.
# NOTE(review): 2300 features and 20 iterations are recorded as having
# "produced best accuracy" - if that was measured on test_df, the scores
# printed below are optimistic. Confirm which split was used for tuning.
vectorizer = TfidfVectorizer(
    stop_words = "english",
    ngram_range = (1, 2),
    max_features = 2300,
)
X_train = vectorizer.fit_transform(train_df["source_text"])
X_test = vectorizer.transform(test_df["source_text"])

# class_weight="balanced" re-weights the two classes by inverse frequency;
# the optimiser is capped at 20 iterations.
clf = LogisticRegression(n_jobs = -1, class_weight = "balanced", max_iter = 20)
clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred), sep = "")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8504908129876667

Classification Report:

              precision    recall  f1-score   support

           0       0.43      0.91      0.58       908
           1       0.99      0.84      0.91      7038

    accuracy                           0.85      7946
   macro avg       0.71      0.88      0.75      7946
weighted avg       0.92      0.85      0.87      7946



In [23]:
# Embeddings and PCA with 80% variance

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Build dense feature matrices by stacking the per-row vectors stored in the
# "embedding" column.
X_train = np.vstack(train_df["embedding"].values)  # shape - (n_train, embedding_dim)
X_test  = np.vstack(test_df["embedding"].values)   # shape - (n_test, embedding_dim)

y_train = train_df["is_sensitive"].values
y_test  = test_df["is_sensitive"].values


# Standardise every embedding dimension. The scaler statistics come from the
# training split only and are then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)


# Project onto 114 principal components (recorded as capturing 80% of the
# variance; pca.explained_variance_ratio_.sum() can be used to re-check that).
# random_state is pinned because the default svd_solver="auto" may choose the
# randomized SVD solver, which would otherwise make the projection - and the
# metrics printed below - vary from run to run.
pca = PCA(n_components = 114, random_state = 42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


# Logistic regression on the 114 PCA components, with inverse-frequency class
# weights and a 30-iteration cap (the TF-IDF baseline uses 20).
# NOTE(review): with such a low cap the optimiser may stop before converging;
# check whether scikit-learn reports a ConvergenceWarning for this fit.
clf = LogisticRegression(max_iter = 30, class_weight = "balanced", n_jobs = -1)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8672287943619431

Classification Report:

              precision    recall  f1-score   support

           0       0.46      0.92      0.61       908
           1       0.99      0.86      0.92      7038

    accuracy                           0.87      7946
   macro avg       0.72      0.89      0.77      7946
weighted avg       0.93      0.87      0.88      7946

