<a href="https://colab.research.google.com/github/akiyomov/llm_experiments/blob/main/text_classification_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip archive.zip

Archive:  archive.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW,DataCollatorWithPadding
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
# Load training and testing datasets from separate CSV files
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_data.sample(5)

Unnamed: 0,Class Index,Title,Description
16321,4,Site Tracks Political Zeitgeist,With 24-hour-a-day talk on all things politica...
38445,3,"Freddie, Fannie shares feel fresh pressure",WASHINGTON (CBS.MW) -- Shares of Freddie Mac f...
68873,4,Software patents raise hackles in Britain,A U.K. foundation gathers testimony from Briti...
96667,4,Bland Mountain salvage talks set,The public is invited to an informal open hous...
99658,3,McDonald #39;s CEO leaves to fight cancer,"The chief executive of McDonald #39;s, Charlie..."


In [3]:
train_texts, train_labels = train_data['Description'], train_data['Class Index']
test_texts, test_labels = test_data['Description'], test_data['Class Index']

In [4]:
train_texts

0         Reuters - Short-sellers, Wall Street's dwindli...
1         Reuters - Private investment firm Carlyle Grou...
2         Reuters - Soaring crude prices plus worries\ab...
3         Reuters - Authorities have halted oil export\f...
4         AFP - Tearaway world oil prices, toppling reco...
                                ...                        
119995     KARACHI (Reuters) - Pakistani President Perve...
119996    Red Sox general manager Theo Epstein acknowled...
119997    The Miami Dolphins will put their courtship of...
119998    PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...
119999    INDIANAPOLIS -- All-Star Vince Carter was trad...
Name: Description, Length: 120000, dtype: object

In [5]:
# Bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(data, max_length=512):
    # Initialize the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize each text in the data using tqdm for progress display
    tokenized_data = []
    for text in tqdm(data, desc="Tokenizing"):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'  # Return PyTorch tensors
        )
        tokenized_data.append(encoded)

    return tokenized_data

In [6]:
train_encodings = tokenize_data(train_texts)
test_encodings = tokenize_data(test_texts)

Tokenizing: 100%|██████████| 120000/120000 [02:45<00:00, 726.23it/s]
Tokenizing: 100%|██████████| 7600/7600 [00:10<00:00, 748.93it/s]


In [7]:
from torch.utils.data import DataLoader

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    train_encodings, shuffle=True, batch_size=16, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    test_encodings, batch_size=16, collate_fn=data_collator
)

In [8]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([16, 1, 512]),
 'attention_mask': torch.Size([16, 1, 512])}

In [9]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [11]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

22500


In [12]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [None]:
writer = SummaryWriter()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    tqdm_dataloader = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for step, batch in enumerate(tqdm_dataloader):
        input_ids = batch['input_ids'].squeeze(dim=1).to(device)
        attention_mask = batch['attention_mask'].squeeze(dim=1).to(device)
        targets = torch.randint(0, 2, (input_ids.size(0),)).to(device)  # Change this to your actual targets

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        tqdm_dataloader.set_postfix({'loss': total_loss / (step + 1)}, refresh=True)

        # Log the loss to TensorBoard
        writer.add_scalar('Training Loss', loss.item(), epoch * len(train_dataloader) + step + 1)

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

    # Save the model after each epoch
    torch.save(model.state_dict(), f'bert_model_epoch_{epoch + 1}.pt')

# Close the TensorBoard writer
writer.close()

Epoch 1/3:   4%|▍         | 303/7500 [07:16<2:54:17,  1.45s/it, loss=0.704]

In [None]:
# Evaluation loop
model.eval()
all_predictions = []
all_targets = []

with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].squeeze(dim=1).to(device)
        attention_mask = batch['attention_mask'].squeeze(dim=1).to(device)
        targets = torch.randint(0, 2, (input_ids.size(0),)).to(device)  # Change this to your actual targets

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(all_targets, all_predictions)

print(f"Accuracy: {accuracy:.4f}")