In [5]:
!pip install peft



In [6]:
from transformers import BertTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification
import torch
from torch.optim import AdamW
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast, GradScaler

In [7]:
# Mount Google Drive under /content/drive; the dataset CSVs read later live there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Walk the shared data folder recursively and print the full path of every file found.
import os
for dirname, _, filenames in os.walk('/content/drive/MyDrive/data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/content/drive/MyDrive/data/.DS_Store
/content/drive/MyDrive/data/tweet-hatred-speech/test.csv
/content/drive/MyDrive/data/tweet-hatred-speech/train.csv


In [9]:
# Read the training CSV (its 'tweet' and 'label' columns are used below) and the test CSV.
# NOTE(review): `test` is not referenced again in the visible cells.
df = pd.read_csv('/content/drive/MyDrive/data/tweet-hatred-speech/train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/tweet-hatred-speech/test.csv')

In [12]:
# Load the BERT tokenizer for 'bert-base-uncased', the same checkpoint name the
# classification model is loaded from later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [47]:
def processed_tweets(tweet, tokenizer, max_length=128):
    """Tokenize one tweet into fixed-length PyTorch tensors.

    Args:
        tweet: Text of a single tweet.
        tokenizer: Hugging Face tokenizer to call (the notebook passes a
            ``BertTokenizer``).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's encoding for ``tweet``. Callers in this notebook read
        its ``input_ids`` and ``attention_mask`` entries.
    """
    # Calling the tokenizer object is the documented entry point; ``encode_plus``
    # is the deprecated spelling of the same single-text call.
    return tokenizer(
        tweet,
        add_special_tokens=True,     # Add [CLS] and [SEP]
        max_length=max_length,       # Pad & truncate all sentences
        padding='max_length',        # Pad to max_length
        truncation=True,             # Truncate to max_length
        return_attention_mask=True,  # Construct attn. masks
        return_tensors='pt',         # Return pytorch tensors
    )


In [48]:
# Hold out 10% of the rows for validation; the seed keeps the split repeatable.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Upsample class 1 in the training set only, so the validation rows stay untouched.
# The sampler wants a 2-D feature array, hence the single-column frame.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    train_df[['tweet']].to_numpy(),
    train_df['label'],
)

# Back to a two-column frame with the tweet text as a flat column again.
train_resampled_df = pd.DataFrame(
    {
        'tweet': X_train_resampled.ravel(),
        'label': y_train_resampled,
    }
)

In [49]:
# Sanity check: per-class row counts after oversampling.
y_train_resampled.value_counts()

label
0    26736
1    26736
Name: count, dtype: int64

In [50]:
class TweetsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one tweet per item, on demand.

    Args:
        df: DataFrame with a 'tweet' column (text) and a 'label' column.
        tokenizer: Tokenizer forwarded to ``processed_tweets``.
        max_length: Sequence length forwarded to ``processed_tweets``.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized tweet and label at positional index ``idx``.

        Returns:
            dict with 'input_ids', 'attention_mask' (the tokenizer output with
            its leading dimension removed) and 'labels' (a ``torch.long``
            scalar tensor).
        """
        # Pick the column first, then the row: this reads a single cell instead
        # of materialising the whole row as a Series for every lookup.
        tweet = self.df['tweet'].iloc[idx]
        label = self.df['label'].iloc[idx]

        encoded_tweet = processed_tweets(tweet, self.tokenizer, self.max_length)

        return {
            # squeeze(0) drops only the leading dimension; a bare squeeze()
            # would also collapse the sequence axis when max_length == 1.
            'input_ids': encoded_tweet['input_ids'].squeeze(0),
            'attention_mask': encoded_tweet['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(label), dtype=torch.long),
        }


In [51]:
# Wrap the oversampled training frame and the held-out frame; tokenization
# happens per item inside TweetsDataset.__getitem__.
train_dataset = TweetsDataset(df=train_resampled_df, tokenizer=tokenizer)
val_dataset = TweetsDataset(df=val_df, tokenizer=tokenizer)

# Batch both splits; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128)


In [52]:
# Bare expression: displays the object's default repr, since TweetsDataset defines no __repr__.
train_dataset

<__main__.TweetsDataset at 0x7ced48e0f880>

In [53]:
# Load pre-trained BERT model for sequence classification.
# num_labels=2 requests a two-way classification head for the 0/1 labels.

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
def apply_lora_with_peft(model, rank=32, alpha=32, dropout=0.05):
    """Wrap ``model`` with LoRA adapters through PEFT.

    Args:
        model: Model handed to ``get_peft_model``.
        rank: Passed to ``LoraConfig`` as ``r``.
        alpha: Passed to ``LoraConfig`` as ``lora_alpha``.
        dropout: Passed to ``LoraConfig`` as ``lora_dropout``.

    Returns:
        The result of ``get_peft_model`` for ``model`` and the built config.
    """
    adapter_settings = dict(
        r=rank,
        lora_alpha=alpha,
        # Adapters are attached to modules matched by these names.
        target_modules=["query", "value"],
        lora_dropout=dropout,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
    peft_config = LoraConfig(**adapter_settings)
    wrapped_model = get_peft_model(model, peft_config)
    return wrapped_model

In [55]:
def fine_tune_bert(train_loader, val_loader, model, epochs, tokenizer):
    """Fine-tune ``model`` with LoRA adapters and print per-epoch metrics.

    Args:
        train_loader: Sized iterable of training batches. Each batch is a dict
            with 'input_ids', 'attention_mask' and 'labels' tensors.
        val_loader: Iterable of validation batches with the same keys.
        model: Model passed through ``apply_lora_with_peft`` before training.
        epochs: Number of passes over ``train_loader``.
        tokenizer: Not used by the training loop; kept so existing calls keep
            working.

    Returns:
        None. Loss and accuracy are reported with ``print`` after every epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The torch.cuda.amp helpers are only meaningful on a CUDA device, so both
    # autocast and the gradient scaler are switched off on the CPU fallback.
    use_amp = device.type == "cuda"
    model = apply_lora_with_peft(model).to(device)

    # Define optimizer and scheduler.
    # Only parameters that require gradients are given to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    # The bare name AdamW is imported twice at the top of this file (from
    # torch.optim and from transformers); spell out the torch one so the
    # optimizer does not depend on import order.
    # NOTE(review): weight_decay=0.0 is set explicitly so torch's non-zero
    # default does not add regularisation the earlier runs may not have had --
    # confirm the intended value.
    optimizer = torch.optim.AdamW(trainable_params, lr=2e-5, weight_decay=0.0)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Define mixed-precision training scaler
    scaler = GradScaler(enabled=use_amp)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1} has started')
        total_loss = 0.0
        total_predictions = []
        total_labels = []
        model.train()

        # Training loop
        for batch in train_loader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Forward pass with mixed precision
            with autocast(enabled=use_amp):
                outputs = model(inputs, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Backward pass with mixed precision
            scaler.scale(loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # The scaler lowers its scale when it skips an optimizer step
            # (inf/NaN gradients); only advance the LR schedule when the
            # optimizer actually stepped.
            if scaler.get_scale() >= scale_before:
                scheduler.step()

            # Collect predictions for the training accuracy
            total_predictions.extend(outputs.logits.detach().argmax(dim=1).tolist())
            total_labels.extend(labels.tolist())

            total_loss += loss.item()

        # Calculate training accuracy
        train_accuracy = accuracy_score(total_labels, total_predictions)
        # max(..., 1) avoids a ZeroDivisionError when the loader has no batches.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f'Epoch {epoch + 1} has ended. Average Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}')

        # Validation phase
        model.eval()
        val_predictions = []
        val_labels = []
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                with autocast(enabled=use_amp):
                    # Labels are only needed for the loss, which the validation
                    # phase does not report, so they are not passed in.
                    outputs = model(inputs, attention_mask=attention_mask)

                val_predictions.extend(outputs.logits.argmax(dim=1).tolist())
                val_labels.extend(batch['labels'].tolist())

        val_accuracy = accuracy_score(val_labels, val_predictions)
        print(f'Validation Accuracy: {val_accuracy:.4f}')

In [56]:
# Report whether PyTorch can see a CUDA device (torch is already imported at the top of the notebook).
import torch
print(torch.cuda.is_available())

True


In [58]:
# Number of full passes over the training loader.
epochs = 7

# Fine-tune BERT (the function prints its metrics and returns nothing).
fine_tune_bert(train_loader, val_loader, model, epochs, tokenizer)



Epoch 1 has started
Epoch 1 has ended. Average Loss: 0.5220, Training Accuracy: 0.7216
Validation Accuracy: 0.8646
Epoch 2 has started
Epoch 2 has ended. Average Loss: 0.3162, Training Accuracy: 0.8679
Validation Accuracy: 0.8708
Epoch 3 has started
Epoch 3 has ended. Average Loss: 0.2832, Training Accuracy: 0.8829
Validation Accuracy: 0.8905
Epoch 4 has started
Epoch 4 has ended. Average Loss: 0.2650, Training Accuracy: 0.8930
Validation Accuracy: 0.8899
Epoch 5 has started
Epoch 5 has ended. Average Loss: 0.2512, Training Accuracy: 0.8963
Validation Accuracy: 0.8908
Epoch 6 has started
Epoch 6 has ended. Average Loss: 0.2410, Training Accuracy: 0.9023
Validation Accuracy: 0.8865
Epoch 7 has started
Epoch 7 has ended. Average Loss: 0.2396, Training Accuracy: 0.9036
Validation Accuracy: 0.8908
