[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bertabenet/mapping_emergencyrelevant_social_media/blob/main/trainer/deBERTa.ipynb)

In [None]:
# install libraries for using pretrained models
!pip install simpletransformers
# show which simpletransformers version pip actually installed
!pip freeze | grep simpletransformers
# pin tokenizers to 0.10.1
# NOTE(review): presumably required for compatibility with the installed
# simpletransformers/transformers versions -- confirm before changing the pin
!pip install tokenizers==0.10.1

In [None]:
# imports
import pandas as pd
from google.colab import drive

from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from scipy.special import softmax

# mount Google Drive at /content/drive so files stored there can be read by path
drive.mount('/content/drive')

# Read train and test data
Train and test data must already be split into two CSV files, each with at least a `text` column and a `crisis_label` column. Rows whose `crisis_label` is neither 0 nor 1 are dropped when the files are loaded. Example:

| id | text | crisis_label |
| :---: | :---: | :---: |
| 123456789 | This is an example text | 0 |
| 987654321 | This text is crisis-related | 1 |
| ... | ... | ... |

In [None]:
def _load_binary_split(csv_path):
    """Read one CSV split and return a DataFrame with `text` and `label` columns.

    The CSV must contain a `text` column and a `crisis_label` column.
    Only rows whose `crisis_label` equals 0 or 1 are kept; every other row
    is silently dropped. Texts are converted with `str()` and labels are
    stored as plain ints.

    Args:
        csv_path: path to a comma-separated file.

    Returns:
        A new DataFrame with columns ['text', 'label'] and a fresh 0..n-1 index.
    """
    # Passing the path (instead of an `open(...)` handle) lets pandas open and
    # close the file itself, so no file descriptor is leaked.
    raw = pd.read_csv(csv_path, sep=',')

    label_col = raw['crisis_label']
    # Same test as a per-row `x == 0` / `x == 1`, evaluated column-wise.
    keep = (label_col == 0) | (label_col == 1)

    texts = [str(text) for text in raw.loc[keep, 'text']]
    labels = [int(value) for value in label_col[keep]]
    return pd.DataFrame({'text': texts, 'label': labels})


# open train file
train_path = ""                           # add your path
train = _load_binary_split(train_path)    # read and filter train file

# open test file
test_path = ""                            # add your path
test = _load_binary_split(test_path)      # read and filter test file

print('train shape: ', train.shape)
print('test shape: ', test.shape)

In [None]:
# check that the labels contain only the two expected classes (0 and 1)
def datacheck(data):
    """Print every value of ``data['label']`` that is neither 0 nor 1.

    Nothing is printed when all labels are 0 or 1. Returns None.
    """
    unexpected = [
        value for value in data['label']
        if not (value == 0 or value == 1)
    ]
    for value in unexpected:
        print(value)

# run the label check on both splits; any value printed is a label other than 0/1
datacheck(train)
datacheck(test)

# Define and train model

In [None]:
# simpletransformers itself imports torch, so this adds no new dependency;
# it is used here only to detect whether a GPU is available.
import torch

# define hyperparameters
output_path = "outputs/"    # add path where result will be stored
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "fp16": False,
    "num_train_epochs": 5,
    "output_dir": output_path,
}

# create a ClassificationModel from the pretrained DeBERTa base checkpoint,
# configured for binary classification (num_labels=2)
model = ClassificationModel(
    "deberta",
    "microsoft/deberta-base",
    num_labels=2,
    args=train_args,
    # Use the GPU when one is present; fall back to CPU instead of failing
    # when the runtime has no CUDA device.
    use_cuda=torch.cuda.is_available(),
)

In [None]:
# train model
# fine-tune `model` on the `train` DataFrame (columns: text, label)
model.train_model(train)

# Evaluate model

In [None]:
# evaluate model
def auc(labels, preds):
    """Return the ROC AUC of `preds` against `labels` via sklearn's roc_auc_score."""
    return roc_auc_score(labels, preds)


# NOTE(review): simpletransformers calls extra keyword metrics with the hard
# class predictions, so "roc_auc" here is the AUC of 0/1 predictions rather
# than of scores -- confirm against the installed simpletransformers version.
result, model_outputs, wrong_predictions = model.eval_model(test, roc_auc=auc, acc=accuracy_score)

# Threshold-independent ROC AUC: turn the raw per-class outputs into
# probabilities and score the positive class (column 1).
# Assumes model_outputs has one row per row of `test`, in the same order.
positive_probs = softmax(model_outputs, axis=1)[:, 1]
result["roc_auc_prob"] = auc(test["label"], positive_probs)

print(result)