# Checking to see if we can build classifiers

In [16]:
import pandas as pd

In [17]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from torch.optim import AdamW

def _select_device(device=None):
    """Resolve the torch device to run on.

    An explicitly supplied `device` always wins. Otherwise prefer Apple MPS
    (the original hard-coded choice), then CUDA, then fall back to CPU so the
    notebook also runs on machines without an accelerator.
    """
    if device is not None:
        return torch.device(device)
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def _encode_split(tokenizer, texts, labels):
    """Tokenize one split and bundle input ids, attention masks and labels into a TensorDataset."""
    encodings = tokenizer(texts.to_list(), truncation=True, padding=True, return_tensors="pt")
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels.values))


def train_bert(df, epochs=3, batch_size=32, lr=2e-5, test_size=0.2, random_state=None, device=None):
    """Fine-tune `bert-base-uncased` as a sequence classifier and score it on a held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column and a 'label' column. Labels are handed to
        the model unchanged and the number of classes is taken as the number of
        distinct label values, so they are expected to be integer class ids
        0..k-1.
    epochs : int, default 3
        Number of passes over the training split.
    batch_size : int, default 32
        Batch size for both the training and the validation loader.
    lr : float, default 2e-5
        AdamW learning rate.
    test_size : float, default 0.2
        Fraction of `df` held out for validation.
    random_state : int or None, default None
        Seed forwarded to `train_test_split`; None keeps the split random.
    device : str, torch.device or None, default None
        Device to train on. None picks MPS, then CUDA, then CPU.

    Returns
    -------
    (model, metrics)
        The fine-tuned model (left in eval mode) and a dict with the keys
        'accuracy', 'f1', 'precision' and 'recall' computed on the validation
        split. Precision/recall/F1 use binary averaging for two classes and
        macro averaging otherwise.
    """
    # Split the data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'], df['label'], test_size=test_size, random_state=random_state
    )

    # Tokenization
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = _encode_split(tokenizer, train_texts, train_labels)
    val_dataset = _encode_split(tokenizer, val_texts, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    num_labels = len(set(df['label']))
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    device = _select_device(device)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # Re-assert training mode every epoch so dropout stays active for
        # every batch; eval mode is only entered once training has finished.
        model.train()
        for batch in tqdm(train_loader):
            # Unpack the batch and move it to the selected device
            inputs, masks, labels = [b.to(device) for b in batch]

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass (the model computes the loss itself when labels are passed)
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation: switch off dropout only now, after all training steps.
    model.eval()

    predictions, true_labels = [], []
    for batch in val_loader:
        inputs, masks, labels = [b.to(device) for b in batch]

        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)

        # Keep only the arg-max class id per example.
        predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

    flat_predictions = np.array(predictions).flatten()
    flat_true_labels = np.array(true_labels).flatten()

    # Compute metrics; 'binary' averaging is rejected by scikit-learn for more
    # than two classes, so fall back to macro averaging in that case.
    averaging = 'binary' if num_labels == 2 else 'macro'
    accuracy = accuracy_score(flat_true_labels, flat_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        flat_true_labels, flat_predictions, average=averaging
    )
    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

    return model, metrics


### Training on textfooler dataset

In [18]:
# Load the TextFooler dataset and rename its columns to the 'text' / 'label'
# names that train_bert reads.
# NOTE(review): absolute, user-specific path — it will not resolve on another
# machine; consider a configurable data directory.
df_textfooler = pd.read_csv('/Users/ananthmuppidi/IIIT_H/semester_6/RSAI/project/data/attack_clean/textfooler-dataset.csv') 
df_textfooler.columns = ['text', 'label'] 

In [19]:
# Fine-tune BERT on the TextFooler frame; train_bert returns (model, metrics).
model, metrics = train_bert(df_textfooler)
# Bare last expression so the notebook displays the validation metrics dict.
metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 23/23 [00:22<00:00,  1.01it/s]
100%|██████████| 23/23 [00:22<00:00,  1.04it/s]
100%|██████████| 23/23 [00:21<00:00,  1.05it/s]


{'accuracy': 0.6888888888888889,
 'f1': 0.6626506024096386,
 'precision': 0.7534246575342466,
 'recall': 0.5913978494623656}

### Training on hotflip dataset

In [20]:
df_textfooler = pd.read_csv('/Users/ananthmuppidi/IIIT_H/semester_6/RSAI/project/data/attack_clean/hotflip-dataset.csv') 
df_textfooler.columns = ['text', 'label'] 

In [21]:
model, metrics = train_bert(df_textfooler)
metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5/5 [00:11<00:00,  2.31s/it]
100%|██████████| 5/5 [00:03<00:00,  1.33it/s]
100%|██████████| 5/5 [00:03<00:00,  1.35it/s]


{'accuracy': 0.29411764705882354,
 'f1': 0.25,
 'precision': 0.3333333333333333,
 'recall': 0.2}