<a href="https://colab.research.google.com/github/animesh-rai/x23194545_Sensitive_data_detection/blob/main/sensitive_data_inference_using_all_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting google drive to load finetuned models for senstive data detection

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Inference with DistilBERT

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load fine-tuned model and tokenizer
model_path = "/content/drive/My Drive/PII_models/distilbert_sensitive_data_detection_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Input text
text = "Sensitive data such as credit card numbers should be protected."

# Tokenize input
tokens = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
    is_split_into_words=False
)

# Perform inference
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map predictions to labels
id2label = model.config.id2label  # Mapping from label IDs to label names
predicted_labels = [id2label[label] for label in predictions[0].tolist()]
tokens_decoded = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

# Post-process to clean and merge subwords
special_tokens = tokenizer.all_special_tokens
cleaned_results = []
current_word = ""
current_label = None

for token, label in zip(tokens_decoded, predicted_labels):
    if token in special_tokens:
        continue  # Skip special tokens
    if token.startswith("##"):  # Subword continuation
        current_word += token[2:]  # Append without ##
    else:  # New word
        if current_word:  # Add the previous word and label
            cleaned_results.append((current_word, current_label))
        current_word = token  # Start a new word
        current_label = label

# Append the last word
if current_word:
    cleaned_results.append((current_word, current_label))

# Display results
print("\nCleaned Predictions:")
for word, label in cleaned_results:
    print(f"{word}: {label}")


Cleaned Predictions:
sensitive: O
data: O
such: O
as: O
credit: O
card: O
numbers: O
should: O
be: O
protected: O
.: O


### Inference with DeBERTa

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the saved model and tokenizer
save_directory = "/content/drive/My Drive/PII_models/deberta_b_sensitive_data_detection_model"
model = AutoModelForTokenClassification.from_pretrained(
    save_directory,
    ignore_mismatched_sizes=True  # Ignore size mismatches if any
)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Input text for inference
text = "The@ is the mastermind in a criminal.com case loudge on student portal. "

# Tokenize the input
tokens = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
    is_split_into_words=False
)

# Move the model and input tokens to the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokens = {key: val.to(device) for key, val in tokens.items()}

# Perform inference
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map predictions to labels
id2label = model.config.id2label  # Ensure this is correctly set in your config
predicted_labels = [id2label[label] for label in predictions[0].tolist()]
tokens_decoded = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])


# Display result in nice format# Filter out special tokens and merge subwords

special_tokens = tokenizer.all_special_tokens
word_labels = []
current_word = ""
current_label = None

for token, label in zip(tokens_decoded, predicted_labels):
    if token in special_tokens:
        continue  # Skip special tokens

    if token.startswith("▁"):  # New word
        if current_word:  # Append the last word and label
            word_labels.append((current_word, current_label))
        current_word = token.lstrip("▁")  # Remove prefix
        current_label = label
    else:  # Continuation of the previous word
        current_word += token

# Append the last word
if current_word:
    word_labels.append((current_word, current_label))

# Display results
print("Cleaned Output:")
for word, label in word_labels:
    print(f"{word}: {label}")

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at /content/drive/My Drive/PII_models/deberta_b_sensitive_data_detection_model and are newly initialized: ['deberta.encoder.LayerNorm.bias', 'deberta.encoder.LayerNorm.weight', 'deberta.encoder.layer.0.attention.self.key_proj.bias', 'deberta.encoder.layer.0.attention.self.key_proj.weight', 'deberta.encoder.layer.0.attention.self.query_proj.bias', 'deberta.encoder.layer.0.attention.self.query_proj.weight', 'deberta.encoder.layer.0.attention.self.value_proj.bias', 'deberta.encoder.layer.0.attention.self.value_proj.weight', 'deberta.encoder.layer.1.attention.self.key_proj.bias', 'deberta.encoder.layer.1.attention.self.key_proj.weight', 'deberta.encoder.layer.1.attention.self.query_proj.bias', 'deberta.encoder.layer.1.attention.self.query_proj.weight', 'deberta.encoder.layer.1.attention.self.value_proj.bias', 'deberta.encoder.layer.1.attention.self.value_proj.weight', 'deberta.encoder.layer.10.at

Cleaned Output:
The@: B-EMAIL
is: B-ID_NUM
the: B-EMAIL
mastermind: B-ID_NUM
in: B-ID_NUM
a: I-URL_PERSONAL
criminal.com: B-PHONE_NUM
case: I-PHONE_NUM
loudge: I-URL_PERSONAL
on: B-NAME_STUDENT
student: I-URL_PERSONAL
portal.: B-PHONE_NUM


### Inference with RoBERTa

In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the saved model and tokenizer
save_directory = "/content/drive/My Drive/PII_models/roberta_sensitive_data_detection_model"
model = AutoModelForTokenClassification.from_pretrained(
    save_directory,
    ignore_mismatched_sizes=True  # Ignore size mismatches if any
)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Input text for inference
text = "john.doe@gmail.com should be protected 86868547475454"

# Tokenize the input
tokens = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
    is_split_into_words=False
)

# Move the model and input tokens to the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokens = {key: val.to(device) for key, val in tokens.items()}

# Perform inference
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map predictions to labels
id2label = model.config.id2label  # Ensure this is correctly set in your config
predicted_labels = [id2label[label] for label in predictions[0].tolist()]
tokens_decoded = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

# Post-process to merge subwords and remove special tokens
special_tokens = tokenizer.all_special_tokens
cleaned_results = []
current_word = ""
current_label = None

for token, label in zip(tokens_decoded, predicted_labels):
    if token in special_tokens:
        continue  # Skip special tokens like <s>, </s>
    if token.startswith("Ġ"):  # Start of a new word
        if current_word:  # Append the previous word and label
            cleaned_results.append((current_word, current_label))
        current_word = token.lstrip("Ġ")  # Remove Ġ prefix
        current_label = label
    else:  # Continuation of the previous word
        current_word += token

# Append the last word
if current_word:
    cleaned_results.append((current_word, current_label))

# Display the cleaned results
print("\nCleaned Predictions:")
for word, label in cleaned_results:
    print(f"{word}: {label}")


Cleaned Predictions:
john.doe@gmail.com: B-NAME_STUDENT
should: O
be: O
protected: O
86868547475454: O


### Inference with Longformer

In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Specify the saved directory
save_directory = "/content/drive/My Drive/PII_models/longformer_sensitive_data_detection_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForTokenClassification.from_pretrained(save_directory)

# Move model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model and tokenizer loaded successfully!")

# Input text for inference
text = "Sensitive data like credit card numbers, addresses, or personal emails such as john.doe@gmail.com should be protected."

# Tokenize the input
tokens = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,  # Adjust based on your model's maximum length
    is_split_into_words=False
)

# Move tokens to the same device as the model
tokens = {key: val.to(device) for key, val in tokens.items()}

# Perform inference
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map predictions to labels
id2label = model.config.id2label  # Ensure this mapping exists in the model configuration
tokens_decoded = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
predicted_labels = [id2label[label] for label in predictions[0].tolist()]

# Post-process to clean and merge subwords
special_tokens = tokenizer.all_special_tokens
results = []
current_word = ""
current_label = None

for token, label in zip(tokens_decoded, predicted_labels):
    if token in special_tokens:
        continue  # Skip special tokens
    if token.startswith("▁") or token.startswith("Ġ") or not current_word:
        if current_word:  # Append the last word and label
            results.append((current_word, current_label))
        current_word = token.lstrip("▁Ġ")  # Remove prefix
        current_label = label
    else:  # Continuation of the previous word
        current_word += token

# Append the last word
if current_word:
    results.append((current_word, current_label))

# Display results
print("\nCleaned Predictions:")
for word, label in results:
    print(f"{word}: {label}")


Model and tokenizer loaded successfully!

Cleaned Predictions:
Sensitive: O
data: O
like: O
credit: O
card: O
numbers,: O
addresses,: O
or: O
personal: O
emails: O
such: O
as: O
john.doe@gmail.com: B-EMAIL
should: O
be: O
protected.: O
