In [17]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForTokenClassification

In [18]:
# Step 1: Load and Prepare Data
data = pd.read_csv('businessCard.csv', sep=',', names=['id', 'text', 'label'], encoding='latin-1')
data

Unnamed: 0,id,text,label
0,id,text,tag
1,000.jpeg,,O
2,000.jpeg,.,O
3,000.jpeg,040-4852,B-PHONE
4,000.jpeg,8881,I-PHONE
...,...,...,...
10441,290.jpeg,,O
10442,290.jpeg,Richard,B-NAME
10443,290.jpeg,Pretorius,I-NAME
10444,290.jpeg,,O


In [19]:
data['text'] = data['text'].fillna('')

In [41]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each text sample in your DataFrame and store the results in a list
tokenized_texts = [tokenizer.tokenize(text) for text in data['text']]

# Convert tokenized texts back to space-separated strings
tokenized_texts = [' '.join(tokens) for tokens in tokenized_texts]

# Tokenize the text data using the tokenizer
tokens = tokenizer(tokenized_texts, is_split_into_words=True, padding=True, truncation=True, return_tensors='pt')

# Prepare Labels
# Replace 'label' with your actual label column name
labels = [label if label != 'O' else 'O' for label in data['label'].tolist()]  # Replace empty labels with 'O'
label_set = set(labels)  # Unique labels in your dataset
label_to_id = {label: i for i, label in enumerate(label_set)}  # Map labels to numeric IDs
label_ids = [label_to_id[label] for label in labels]  # Convert labels to numeric IDs
labels = torch.tensor(label_ids)

# Model Setup
num_labels = len(label_set)  # Number of unique NER labels in your dataset
# Initialize and load the pre-trained BERT-based model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Define the desired sequence length (e.g., 128)
desired_seq_length = 128

# Pad or truncate the input sequences to the desired sequence length
padded_input_ids = [ids[:desired_seq_length] + [tokenizer.pad_token_id] * (desired_seq_length - len(ids)) if len(ids) < desired_seq_length else ids[:desired_seq_length] for ids in tokens['input_ids']]
padded_attention_mask = [mask[:desired_seq_length] + [0] * (desired_seq_length - len(mask)) if len(mask) < desired_seq_length else mask[:desired_seq_length] for mask in tokens['attention_mask']]

# Convert the lists of input sequences and attention masks to NumPy arrays
padded_input_ids = np.array(padded_input_ids)
padded_attention_mask = np.array(padded_attention_mask)

# Pad or truncate the labels to the desired sequence length
padded_labels = [label_ids[:desired_seq_length] + [label_to_id['O']] * max(0, desired_seq_length - len(label_ids)) for label_ids in label_ids]
padded_labels = [label[:desired_seq_length] for label in padded_labels]

# Convert padded_labels to a list of PyTorch tensors
padded_labels = torch.tensor(padded_labels)

# Convert NumPy arrays to PyTorch tensors
padded_input_ids = torch.tensor(padded_input_ids)
padded_attention_mask = torch.tensor(padded_attention_mask)

# Create a DataLoader for training
dataset = TensorDataset(padded_input_ids, padded_attention_mask, padded_labels)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

ValueError: only one element tensors can be converted to Python scalars