In [1]:
import csv
import os
import pickle
import re

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

### Check Device

In [2]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load Model

In [17]:
# Local snapshot of the pretrained roberta-base checkpoint (no hub download needed).
roberta_model_dir = "./model/roberta-base/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b"

tokenizer = RobertaTokenizer.from_pretrained(roberta_model_dir)

# Two output labels -> binary classifier; `.to()` returns the module itself,
# so the model is moved to the selected device in the same expression.
model = RobertaForSequenceClassification.from_pretrained(roberta_model_dir, num_labels=2).to(device)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./model/roberta-base/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### Load Dataset

In [18]:
# Read the cleaned train/test CSVs; the training frame's `target` column is
# copied into a `label` column before it is wrapped in a Dataset.
# NOTE(review): `dataset` and `test_data` are both reassigned in later cells.
train_data = pd.read_csv("./data/clean_train.csv").assign(label=lambda frame: frame['target'])
test_data = pd.read_csv("./data/clean_test.csv")
dataset = Dataset.from_pandas(train_data)

In [19]:
# Load the final cleaned dataset and make sure the label column is int64.
data = pd.read_csv("./data/clean_final_dataset.csv").astype({'label': 'int64'})

In [20]:
# Wrap the final cleaned DataFrame (`data`) in a Dataset; this replaces the
# `dataset` that an earlier cell built from clean_train.csv.
dataset = Dataset.from_pandas(data)
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 200 tokens so that all
    examples share one fixed length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text(s).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    tokenizer_options = {'truncation': True, 'padding': 'max_length', 'max_length': 200}
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize the whole dataset in batches; the on-disk cache is bypassed so the
# mapping is always recomputed with the current tokenizer settings.
encoded_dataset = dataset.map(preprocess_function, batched=True, load_from_cache_file=False)

# Expose only the model inputs and the label, as PyTorch tensors.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Hold out 20% of the examples for evaluation. The seed is fixed so that the
# split (and therefore the reported metrics) is the same after every
# Restart Kernel -> Run All instead of changing from run to run.
SPLIT_SEED = 42
train_test_split = encoded_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Map:   0%|          | 0/6718 [00:00<?, ? examples/s]

Train dataset size: 5374
Eval dataset size: 1344


### Define Training Arguments

In [21]:
# Training configuration for the Trainer.
training_args = TrainingArguments(
    output_dir='./RoBERTa_results',
    save_total_limit=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    #gradient_accumulation_steps=2,
    per_device_eval_batch_size=4,
    #num_train_epochs=2,
    max_steps=200,  # Hard cap on training steps; takes precedence over num_train_epochs
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    # Mixed precision needs a CUDA device. The notebook falls back to CPU when
    # no GPU is present, so only enable fp16 when CUDA is actually available.
    fp16=torch.cuda.is_available(),
)



### Define compute metrics function

In [22]:
def compute_metrics(pred):
    """Turn a Trainer prediction object into binary-classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(axis=-1)

    # The fourth element of the returned tuple is not used here.
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='binary')

    metrics = dict(accuracy=accuracy_score(gold, predicted))
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f1'] = f1
    return metrics

### Trainer

In [23]:
# Wire the model, its hyper-parameters, both data splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [24]:
# Release unused cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [25]:
# Fine-tune the model; the run length is bounded by `max_steps` in training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.339,0.387184,0.864583,0.808429,0.837302,0.822612


TrainOutput(global_step=200, training_loss=0.45067530155181884, metrics={'train_runtime': 16.981, 'train_samples_per_second': 94.223, 'train_steps_per_second': 11.778, 'total_flos': 164444409600000.0, 'train_loss': 0.45067530155181884, 'epoch': 0.2976190476190476})

### Evaluation 

In [26]:
# Score the model on the held-out eval split (the Trainer's `eval_dataset`)
# and show the resulting metric dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.3871840834617615, 'eval_accuracy': 0.8645833333333334, 'eval_precision': 0.8084291187739464, 'eval_recall': 0.8373015873015873, 'eval_f1': 0.8226120857699805, 'eval_runtime': 3.5261, 'eval_samples_per_second': 381.154, 'eval_steps_per_second': 95.288, 'epoch': 0.2976190476190476}


### RoBERTa Result
##### Self-Extracted Dataset:
- train_batch_size: 8
- steps: 200 (`max_steps`; ≈0.30 epoch, since `max_steps` overrides `num_train_epochs`)
- acc: 0.8646
- precision: 0.8084
- recall: 0.8373
- f1: 0.8226

### Test Dataset

In [27]:
# External tweet test set: load it, force the label column to int64 and wrap
# the frame in a Dataset for tokenization.
test_data = pd.read_csv("./data/tweets_testset.csv").astype({'label': 'int64'})
test_dataset = Dataset.from_pandas(test_data)

In [28]:
# Tokenize the test set with the same preprocessing used for training, then
# expose only the model inputs and the label as PyTorch tensors.
encoded_new_test_dataset = test_dataset.map(preprocess_function, batched=True, load_from_cache_file=False)
encoded_new_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

In [29]:
# Run the fine-tuned model over the tokenized test set; the result carries the
# raw predictions, the gold label ids and the computed metrics.
predictions = trainer.predict(encoded_new_test_dataset)

In [30]:
# Metrics for the test set (keys are prefixed with `test_`).
print(predictions.metrics) 

{'test_loss': 0.5888678431510925, 'test_accuracy': 0.856687898089172, 'test_precision': 0.5925925925925926, 'test_recall': 0.9846153846153847, 'test_f1': 0.7398843930635838, 'test_runtime': 0.8843, 'test_samples_per_second': 355.083, 'test_steps_per_second': 89.336}


- acc: 0.8567
- precision: 0.5926
- recall: 0.9846
- f1: 0.7399

### BadCase Output - Test Dataset

In [31]:
# Gold labels and arg-max class predictions for the test set, aligned by position.
predicted_labels = predictions.predictions.argmax(axis=-1)
true_labels = predictions.label_ids

In [32]:
# Row positions where the predicted class disagrees with the gold label.
bad_case_indices = [
    i
    for i, (true, pred) in enumerate(zip(true_labels, predicted_labels))
    if true != pred
]

# These positions index the dataset that was passed to `trainer.predict`
# (`encoded_new_test_dataset`). Looking them up in `eval_dataset` would pair
# the labels with unrelated texts from the validation split.
bad_cases = [encoded_new_test_dataset[i] for i in bad_case_indices]

In [33]:
# Number of misclassified examples.
len(bad_case_indices)

45

In [34]:
# Decode each misclassified example back to readable text (special tokens such
# as padding are dropped) and collect its gold / predicted label.
bad_list = [
    tokenizer.decode(case['input_ids'], skip_special_tokens=True)
    for case in bad_cases
]
true_label = [true_labels[position] for position in bad_case_indices]
predicted_label = [predicted_labels[position] for position in bad_case_indices]

In [35]:
# Export the misclassified examples for manual inspection.
bad_case_path = './bad_case/RoBERTa_bad_case.csv'

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs(os.path.dirname(bad_case_path), exist_ok=True)

df = pd.DataFrame({
    'Text': bad_list,
    'True_Label': true_label,
    'Predicted_Label': predicted_label
})
df.to_csv(bad_case_path, index=False)

### Save Model

In [None]:
# Destination folder shared by the fine-tuned model and its tokenizer.
output_dir = './RoBERTa_downloads/result'

# Persist the model, or report that there is nothing to save.
if trainer.model is None:
    print("Trainer model Load ERROR")
else:
    trainer.model.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

# Persist the tokenizer next to the model so both can be reloaded together.
if tokenizer is None:
    print("Trainer tokenizer Load ERROR")
else:
    tokenizer.save_pretrained(output_dir)
    print(f"Tokenizer saved to {output_dir}")

#### To Use Saved Model 

In [None]:
# Replace with your model's save directory
saved_model_dir = './RoBERTa_downloads/result'

# Reload both artifacts from disk; a failure is reported rather than raised,
# in which case `model` / `tokenizer` keep whatever they held before.
try:
    model = RobertaForSequenceClassification.from_pretrained(saved_model_dir)
    tokenizer = RobertaTokenizer.from_pretrained(saved_model_dir)
except Exception as reload_error:
    print(f"Error reloading model or tokenizer: {reload_error}")
else:
    print("Model and tokenizer reloaded successfully!")

In [None]:
# Pick the GPU when available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to that device and switch it to evaluation mode; both calls
# return the module, so they can be chained.
model.to(device).eval()

In [None]:
# Batch size used only for this manual inference pass.
INFERENCE_BATCH_SIZE = 32

# Batch over the *tokenized* test set. `test_data` is the raw pandas DataFrame,
# and iterating a DataFrame yields its column names (strings), not batches of
# tensors, so it cannot be fed to the model.
test_loader = torch.utils.data.DataLoader(
    encoded_new_test_dataset, batch_size=INFERENCE_BATCH_SIZE
)

predictions = []
with torch.no_grad():  # Inference only: no gradients needed
    for batch in test_loader:
        # Send just the model inputs to the model's device (the label is not an input)
        inputs = {key: batch[key].to(device) for key in ['input_ids', 'attention_mask']}
        outputs = model(**inputs)  # Forward pass
        logits = outputs.logits  # Per-class scores
        batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Predicted class indices
        predictions.extend(batch_predictions)  # Accumulate across batches