In [1]:
import re
import torch
import pickle
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DebertaTokenizer, DebertaForSequenceClassification, TrainingArguments, Trainer

### Check Device

In [2]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load Model

In [3]:
# Snapshot directory of the DeBERTa base checkpoint (path is relative to the notebook).
deberta_model_dir = "./model/deberta-base/models--microsoft--deberta-base/snapshots/0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195"

# Tokenizer and model weights are both read from the same snapshot directory.
tokenizer = DebertaTokenizer.from_pretrained(deberta_model_dir)

# num_labels=2 requests a two-class classification head on top of the encoder.
model = DebertaForSequenceClassification.from_pretrained(
    deberta_model_dir,
    num_labels=2,
)

# Last expression of the cell: moves the model to `device` and displays its architecture.
model.to(device)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at ./model/deberta-base/models--microsoft--deberta-base/snapshots/0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

### Load Dataset

In [4]:
# NOTE(review): `dataset` is rebound by the later Dataset.from_pandas(data) cell and
# `test_data` by the tweets_testset.csv cell; `train_data` is not referenced again in
# the visible notebook -- confirm this cell is still needed.
train_data = pd.read_csv("./data/clean_train.csv")
test_data = pd.read_csv("./data/clean_test.csv")

# Expose the `target` column under the name `label` as well.
train_data = train_data.assign(label=train_data['target'])
dataset = Dataset.from_pandas(train_data)

In [5]:
# Read the cleaned dataset and cast the `label` column to int64 in a single expression.
data = pd.read_csv("./data/clean_final_dataset.csv").astype({'label': 'int64'})

In [6]:
# Wrap the `data` DataFrame in a Hugging Face Dataset; this rebinds `dataset`,
# replacing the object built from `train_data` in an earlier cell.
dataset = Dataset.from_pandas(data)
# Define the preprocessing function
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry, as passed by ``Dataset.map``
            when ``batched=True``.

    Returns:
        The output of the module-level ``tokenizer`` called with truncation
        enabled and ``padding='max_length'`` at ``max_length=200``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length', max_length=200)

# Preprocess the dataset
# load_from_cache_file=False forces a fresh tokenization pass on every run.
encoded_dataset = dataset.map(preprocess_function, batched=True, load_from_cache_file=False)

# Set the dataset format for PyTorch
# Only the listed columns are exposed (as tensors) when rows are accessed.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Split the dataset into training and evaluation sets
# A fixed seed keeps the 80/20 split identical across kernel restarts, so the
# metrics reported below are comparable from one run to the next.
train_test_split = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Map:   0%|          | 0/6718 [00:00<?, ? examples/s]

Train dataset size: 5374
Eval dataset size: 1344


### Define Training Arguments

In [7]:
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir='./DeBERTa_results',
    logging_dir='./logs.log',
    logging_steps=10,
    # Training length and batch sizes. The Trainer log notes that max_steps
    # overrides any num_train_epochs value.
    max_steps=150,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    fp16=True,  # Enable mixed precision training
    # Evaluation and checkpointing share the same schedule, and the best
    # checkpoint is reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)




### Define compute metrics function

In [8]:
def compute_metrics(pred):
    """Compute classification metrics for a Trainer evaluation/prediction pass.

    Args:
        pred: Object exposing ``label_ids`` (ground-truth labels) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class).

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision,
        recall=recall,
        f1=f1,
    )

### Create Trainer instance

In [9]:
# Bundle the model, the training configuration, both dataset splits and the
# metric callback into a single Trainer. The same `trainer` object is used
# below for train(), evaluate() and predict().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


### Train the model

In [10]:
# Ask PyTorch to release its cached CUDA memory before training starts.
torch.cuda.empty_cache()

In [11]:
# Run fine-tuning. As the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.349,0.301403,0.87872,0.844697,0.8463,0.845498


TrainOutput(global_step=150, training_loss=0.3543388223648071, metrics={'train_runtime': 47.9168, 'train_samples_per_second': 100.174, 'train_steps_per_second': 3.13, 'total_flos': 574870498560000.0, 'train_loss': 0.3543388223648071, 'epoch': 0.8928571428571429})

### Evaluation

In [12]:
# Evaluate on the `eval_dataset` given to the Trainer; the returned dict holds
# the loss plus the compute_metrics values under an `eval_` prefix.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.3014028072357178, 'eval_accuracy': 0.8787202380952381, 'eval_precision': 0.8446969696969697, 'eval_recall': 0.8462998102466793, 'eval_f1': 0.8454976303317535, 'eval_runtime': 7.2713, 'eval_samples_per_second': 184.836, 'eval_steps_per_second': 46.209, 'epoch': 0.8928571428571429}


### Test Dataset

In [13]:
# Load the tweet test set. This rebinds `test_data`, which an earlier cell
# had loaded from clean_test.csv.
test_data = pd.read_csv("./data/tweets_testset.csv")

In [14]:
# Preview the first rows to check the columns (text, label, location, disaster).
test_data.head(5)

Unnamed: 0,text,label,location,disaster
0,Calling all #developers! \n\nInnovate with our...,0,,
1,"Switzerland BANS face coverings in public, inc...",0,Switzerland,
2,Touchdown Mason meets Touchdown Jesus. \n\n#No...,0,,
3,Have you ever heard about population density o...,0,,
4,This is not divided.,0,,


In [15]:
# Cast the labels to int64 (same dtype as the training labels), then wrap the
# DataFrame in a Hugging Face Dataset.
test_data = test_data.astype({'label': 'int64'})
test_dataset = Dataset.from_pandas(test_data)

In [16]:
# Tokenize the test set with the same preprocess_function used for training,
# then expose only the model inputs and the label as PyTorch tensors.
encoded_new_test_dataset = test_dataset.map(preprocess_function, batched=True, load_from_cache_file=False)
encoded_new_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

In [17]:
# Run the fine-tuned model over the tokenized test set. The result exposes
# `.predictions` (raw scores) and `.metrics`, both used in later cells.
predictions = trainer.predict(encoded_new_test_dataset)

In [19]:
# Test-set loss plus the compute_metrics values (binary-averaged precision/recall/F1).
print(predictions.metrics) 

{'test_loss': 0.5035289525985718, 'test_accuracy': 0.8535031847133758, 'test_precision': 0.5871559633027523, 'test_recall': 0.9846153846153847, 'test_f1': 0.735632183908046, 'test_runtime': 1.7921, 'test_samples_per_second': 175.213, 'test_steps_per_second': 44.082}


### DeBERTa Result
- train_batch_size: 32
- max_steps: 150
- acc: 0.8535
- precision: 0.587
- recall: 0.9846
- f1: 0.7356

In [27]:
# Raw per-class scores returned by trainer.predict.
logits = predictions.predictions

# Normalise the scores into probabilities along the last (class) axis.
probabilities = torch.softmax(torch.tensor(logits), dim=-1)

# Most probable class per row, converted to a NumPy array for use with pandas.
predicted_labels = probabilities.argmax(dim=1).numpy()

In [28]:
# Attach the predictions to the test DataFrame (mutates `test_data` in place),
# so each tweet sits next to its true label.
test_data['predicted_label'] = predicted_labels

In [29]:
# Spot-check the first rows with the new `predicted_label` column.
test_data.head(10)

Unnamed: 0,text,label,location,disaster,predicted_label
0,Calling all #developers! \n\nInnovate with our...,0,,,0
1,"Switzerland BANS face coverings in public, inc...",0,Switzerland,,0
2,Touchdown Mason meets Touchdown Jesus. \n\n#No...,0,,,0
3,Have you ever heard about population density o...,0,,,0
4,This is not divided.,0,,,0
5,Love this,0,,,0
6,Very revealing.\n\n$10.4m vs $582m. \n\nThe di...,0,,,0
7,Donald Trump has offered to help pay off the $...,0,,,0
8,MAGA: “The world loves Donald Trump.”\n\nScotl...,0,,,0
9,They love each other so much Oprah charged Kam...,0,,,0


In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground truth and model output, both read back from the annotated test DataFrame.
true_labels = test_data['label'].values
predicted_labels = test_data['predicted_label'].values

accuracy = accuracy_score(true_labels, predicted_labels)

# These scores use average='weighted', whereas compute_metrics uses
# average='binary', so precision/recall/F1 here are not expected to match the
# values in `predictions.metrics`.
precision, recall, f1 = (
    scorer(true_labels, predicted_labels, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8535
Precision: 0.9107
Recall: 0.8535
F1 Score: 0.8649


In [43]:
# Keep only the rows where the prediction disagrees with the true label and
# renumber them from 0; the cell then displays how many there are.
incorrect_rows = (
    test_data
    .loc[test_data['label'] != test_data['predicted_label']]
    .reset_index(drop=True)
)
len(incorrect_rows)

46

In [49]:
# Inspect the first misclassified tweets.
incorrect_rows.head(10)

Unnamed: 0,text,label,location,disaster,predicted_label
0,BREAKING: The FEMA supervisor who instructed d...,0,,,1
1,BREAKING: Florida Governor Ron DeSantis has d...,0,Florida,hurricane,1
2,no one’s saying that you just want an excuse t...,1,,kill people,0
3,Remember when conservatives said Superstorm Sa...,0,,,1
4,From a Canadian firefighter who knows what’s g...,0,Canada,wildfire,1
5,"During the 2018 wildfires, this man captured h...",0,,wildfire,1
6,4. Morning drive to work during Wildfires,0,,,1
7,This is a photo I saved from 2012 of a bobcat ...,0,,wildfire,1
8,"Almost a year ago, Oprah and Johnson faced bac...",0,,wildfire,1
9,"""Please make no mistake. Climate change is the...",0,,,1


In [54]:
# Narrow the bad-case table to the text and the two label columns before export.
# This rebinds `incorrect_rows`; the other columns are dropped from it.
incorrect_rows = incorrect_rows[['text','label', 'predicted_label']]

In [55]:
import csv
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing the misclassified rows.
os.makedirs("./bad_case", exist_ok=True)
incorrect_rows.to_csv("./bad_case/DeBERTa_bad_case2.csv", index=False)

- acc: 0.8598
- precision: 0.6
- recall: 0.9692
- f1: 0.7412

### Bad Case Output - Test Dataset

In [24]:
# Positions at which the true label and the predicted label disagree.
bad_case_indices = [i for i, (true, pred) in enumerate(zip(true_labels, predicted_labels)) if true != pred]

# `true_labels` / `predicted_labels` are read from the test DataFrame, so the
# positions must be looked up in the test Dataset. Indexing `eval_dataset`
# (the validation split carved out of the training data) would return
# unrelated rows.
bad_cases = [test_dataset[i] for i in bad_case_indices]

### Save Model

In [None]:
# Model and tokenizer are written to the same directory.
output_dir = './DeBERTa_downloads/result'

# Save the model held by the Trainer, or report that it is missing.
if trainer.model is None:
    print("Trainer model Load ERROR")
else:
    trainer.model.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

# Save the tokenizer alongside the model, or report that it is missing.
if tokenizer is None:
    print("Trainer tokenizer Load ERROR")
else:
    tokenizer.save_pretrained(output_dir)
    print(f"Tokenizer saved to {output_dir}")

#### To Use Saved Model 

In [None]:
saved_model_dir = './DeBERTa_downloads/result' # Replace with your model's save directory
try:
    # The checkpoint was saved from DebertaForSequenceClassification /
    # DebertaTokenizer, so reload it with the same classes. The Roberta*
    # classes used previously are never imported in this notebook, which
    # raised a NameError that the handler below swallowed.
    model = DebertaForSequenceClassification.from_pretrained(saved_model_dir)
    tokenizer = DebertaTokenizer.from_pretrained(saved_model_dir)
    print("Model and tokenizer reloaded successfully!")
except Exception as e:
    # NOTE(review): on failure `model` / `tokenizer` keep whatever they were
    # bound to before this cell, and later cells will silently use those.
    print(f"Error reloading model or tokenizer: {e}")

In [None]:
# Pick the device again so this cell does not depend on the earlier device cell.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to the device, then switch it to evaluation mode. Both calls
# return the module, so the chained expression still displays the model.
model.to(device).eval()

In [None]:
# Batched inference over the tokenized test set.
# The loop iterates the tokenized Dataset, not the `test_data` DataFrame:
# iterating a DataFrame yields its column names (strings), so `batch[key]`
# could never return tensors. `encoded_new_test_dataset` is already in torch
# format with fixed-length `input_ids` / `attention_mask`, so the default
# DataLoader collation stacks rows into batch tensors.
test_loader = torch.utils.data.DataLoader(encoded_new_test_dataset, batch_size=32)

predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in test_loader:
        inputs = {key: batch[key].to(device) for key in ['input_ids', 'attention_mask']}
        outputs = model(**inputs)  # Forward pass
        logits = outputs.logits  # Logits output
        batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted class indices
        predictions.extend(batch_predictions)  # Store predictions