Data Problem using BERT model

In [None]:
!pip install torch transformers pandas nltk #installing necessary libraries

import pandas as pd
import numpy as np
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.preprocessing import LabelEncoder


# Fetch the NLTK resources the text preprocessing below relies on:
# tokenizer models (punkt / punkt_tab), the stop-word list and WordNet.
nltk.download(['punkt', 'stopwords', 'wordnet', 'punkt_tab'])


# Ask for the dataset through the Colab upload widget and load the first
# uploaded file as a DataFrame.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing came back (e.g. the dialog was dismissed); fail with a clear
    # message instead of an IndexError on the first-key lookup.
    raise ValueError("No file was uploaded; upload the dataset CSV and re-run this cell.")
file_name = next(iter(uploaded))
# NOTE(review): reading by name assumes Colab has written the upload into the
# current working directory under this key — confirm against the Colab docs.
data = pd.read_csv(file_name)

# Preprocessing the text
# Lazily-filled cache for the stop-word set and lemmatizer. Building them on
# every call (i.e. once per DataFrame row) re-read the stop-word corpus each
# time; they never change between calls, so they are created once on first use.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first call."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failure leaves the cache empty
        # and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw text value into a space-separated string of lemmas.

    The text is lower-cased, tokenised with ``word_tokenize``, filtered down to
    purely alphabetic tokens that are not English stop words, and each
    remaining token is lemmatised with ``WordNetLemmatizer``.

    Args:
        text: The value to clean. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers, ...) is treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_preprocess_resources()

    tokens = word_tokenize(text.lower())
    # Drop punctuation/numbers (non-alphabetic tokens) and stop words.
    kept = [token for token in tokens if token.isalpha() and token not in stop_words]

    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

# Build the model input: title and description joined into one cleaned string.
# Missing values are replaced with '' so the concatenation never produces NaN.
print("Preprocessing text...")
data['processed_text'] = (data['title'].fillna('') + ' ' + data['description'].fillna('')).apply(preprocess_text)

X = data['processed_text'].values

# Encode the SOC2 labels as contiguous ids 0..num_classes-1 BEFORE splitting and
# before any tensors are built. The classification loss expects class ids in
# [0, num_labels), so the tensors fed to the model must hold encoded ids, not
# the raw SOC2 codes. The fitted encoder is kept to map predictions back.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['SOC2'].values)

num_classes = len(label_encoder.classes_)
print(f"Number of classes: {num_classes}")

unique_labels, counts = np.unique(y, return_counts=True)
print("Unique labels:", unique_labels)
print("Label counts:", counts)

# Single train/test split; everything downstream uses these arrays.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Training cost: each epoch iterates over ceil(n_train / batch_size) batches
# (e.g. 1000 training rows -> 32 batches of up to 32 rows), so the training
# step runs epochs * batches times in total. This is the slow part.
MAX_LEN = 128
batch_size = 32

# Identical tokenizer settings for both splits: pad/truncate every text to
# MAX_LEN tokens and return PyTorch tensors.
tokenizer_kwargs = dict(
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
train_encodings = tokenizer(X_train.tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(X_test.tolist(), **tokenizer_kwargs)

# Each dataset item is (input_ids, attention_mask, encoded_label).
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(y_train, dtype=torch.long),
)

test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(y_test, dtype=torch.long),
)

# Shuffle training batches; keep the test set in order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import BertForSequenceClassification
import torch
from torch.optim import AdamW
from sklearn.metrics import classification_report

# BERT with a classification head sized to the number of distinct labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
    output_attentions=False,
    output_hidden_states=False
)

# Training setup: use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 1

# Training loop
print("Starting training...")
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Labels are already valid class ids, so no clamping is needed (and
        # clamping would silently relabel any bad value instead of exposing it).
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Average loss: {avg_train_loss:.4f}")

# Evaluation
print("\nEvaluating model...")
model.eval()
predictions, true_labels = [], []

# No gradients are needed for inference; labels are only collected for the
# report, not passed to the model, so no loss is computed here.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits.detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(labels.cpu().numpy().flatten())

# Map encoded ids back to the original SOC2 values for reporting.
predictions = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(true_labels)


print("\nClassification Report:")
print(classification_report(true_labels, predictions))

# Fraction of test rows whose predicted label equals the true label.
accuracy = np.mean(np.array(predictions) == np.array(true_labels))
print(f"Overall Accuracy: {accuracy:.4f}")