In [1]:
# Directory containing the dataset files (a path under /kaggle/input);
# the test file path is built from it in the next cell.
BASE_PATH = "/kaggle/input/pii-detection-removal-from-educational-data"

In [2]:
# Full path of the JSON file holding the test documents.
file_test = f"{BASE_PATH}/test.json"

In [3]:
import sys, os
import json
import numpy as np
import pandas as pd
from functools import partial
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch

2024-04-22 17:52:05.295946: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 17:52:05.296052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 17:52:05.400754: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Load the test documents into a DataFrame; later cells read its
# `tokens` and `document` columns.
df_test = pd.read_json(file_test)

In [5]:
# Sanity check: number of rows loaded from the test file.
len(df_test)

10

In [6]:
# Local directory of the pretrained checkpoint; used to load the tokenizer.
MODEL = '/kaggle/input/roberta-base-g7/transformers/roberta-g7/1'

In [7]:
# Passed as `max_length` to the tokenizer: every document is padded or
# truncated to this many positions.
MAXLEN = 512

In [8]:
# Tokenizer loaded from the checkpoint directory in MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [9]:
# Load the token-classification model from the same checkpoint directory as
# the tokenizer. MODEL is reused (instead of repeating the path literal) so
# the tokenizer and model locations cannot drift apart.
model = AutoModelForTokenClassification.from_pretrained(MODEL)

In [10]:

def tokenize_test_data(example):
    """Encode one test document for the model.

    Args:
        example: Mapping-like row (e.g. a DataFrame row) whose "tokens"
            entry is the document already split into words.

    Returns:
        The tokenizer's output for that document, as PyTorch tensors,
        padded/truncated to MAXLEN positions.
    """
    encoding_options = dict(
        padding='max_length',       # always pad up to max_length
        truncation=True,            # cut documents longer than max_length
        is_split_into_words=True,   # input is a list of words, not a string
        max_length=MAXLEN,
        return_tensors="pt",        # PyTorch tensors
    )
    return tokenizer(example["tokens"], **encoding_options)

# Encode every row of df_test, in row order, into its own model input.
tokenized_test = [
    tokenize_test_data(df_test.iloc[row_pos])
    for row_pos in range(len(df_test))
]

# Pick the compute device: GPU when CUDA is available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model onto that device and switch it to evaluation mode.
model.to(device).eval()

# Label vocabulary: 'O' first, then a B-/I- pair for each entity type.
# A label's position in this list is used as the class id when decoding the
# model's argmax output.
# NOTE(review): presumably this order matches the label ids the model was
# trained with -- confirm against model.config.id2label.
label_list = ['O'] + [
    f'{prefix}-{entity}'
    for entity in (
        'NAME_STUDENT',
        'EMAIL',
        'USERNAME',
        'ID_NUM',
        'PHONE_NUM',
        'URL_PERSONAL',
        'STREET_ADDRESS',
    )
    for prefix in ('B', 'I')
]

# Run the model on each encoded document and keep the arg-max label id for
# every position of its sequence.
predictions = []

for encoded in tokenized_test:
    # Tensors must live on the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**batch).logits

    # Highest-scoring class per position, brought back to host memory.
    predictions.append(logits.argmax(dim=-1).cpu().numpy())

# Join the per-document arrays along the first axis into one array.
predictions = np.concatenate(predictions)

# Post-processing: turn per-position predictions into word-level submission rows.
#
# `predictions[i]` holds one label id per position of the padded sub-word
# sequence of document i (special tokens, word pieces, padding), so a
# position index is NOT an index into the document's original `tokens` list.
# The tokenizer's word_ids() gives, for every position, the index of the
# source word (None for special/padding positions); that word index is what
# is written to the `token` column.
#
# Each word is labelled by the prediction of its first sub-word piece; the
# remaining pieces of the same word are ignored.
#
# Requires a fast tokenizer: word_ids() raises ValueError otherwise.
# NOTE(review): words cut off by truncation at MAXLEN never receive a
# prediction here; long documents would need windowed inference.
submission_rows = []

for i, example_labels in enumerate(predictions):
    # Each encoding holds a single document, hence batch_index=0.
    word_ids = tokenized_test[i].word_ids(batch_index=0)
    # Positional lookup, so it does not depend on df_test's index labels.
    document_id = df_test['document'].iloc[i]

    previous_word_id = None
    for label_id, word_id in zip(example_labels, word_ids):
        is_first_piece = word_id is not None and word_id != previous_word_id
        previous_word_id = word_id
        if not is_first_piece:
            continue

        # Only include positive PII label values (excluding O).
        label = label_list[label_id]
        if label != 'O':
            submission_rows.append(
                [len(submission_rows), document_id, word_id, label]
            )

# Build the frame once instead of growing it row by row (which is quadratic).
predicted_df = pd.DataFrame(
    submission_rows, columns=['row_id', 'document', 'token', 'label']
)
# Kept for compatibility: the original left the next free row id in `row_id`.
row_id = len(submission_rows)

# Save predictions to file
predicted_df.to_csv('submission.csv', index=False)