# Classification using HuggingFace's recommended setup

### 1) Configuration

In [None]:
import torch
import numpy as np

# Seed used for every random number generator in this notebook.
RANDOM_SEED = 42

# Training hyper-parameters.
BATCH_SIZE = 16
NUM_EPOCHS = 2
N_LABELS = 3

# Pretrained checkpoint to fine-tune. A single assignment is kept so the active
# checkpoint is unambiguous; 'neuralmind/bert-base-portuguese-cased' is the
# alternative that used to be assigned (and immediately overwritten) here.
CHECKPOINT = 'bert-base-multilingual-cased'

# Sub-folder of ./datasets holding the CSV splits.
DATASET_FOLDER = 'DS1'
# Sub-folder of ./outputs for the saved model and test results. It is derived
# from the two settings above so that it cannot drift out of sync with them.
# Note that a checkpoint name containing '/' produces a nested folder.
OUTPUTS_FOLDER = f'{DATASET_FOLDER}/{CHECKPOINT}'

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding

# Load the pretrained checkpoint with a sequence-classification head sized for
# N_LABELS classes.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=N_LABELS)
# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# Collator handed to the DataLoaders as collate_fn; presumably pads every batch
# to the length of its longest sequence - TODO confirm against the
# transformers documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(instance):
    '''
    Tokenizes the 'tweet' field of `instance` with the module-level tokenizer,
    with truncation enabled, and returns the tokenizer's output.
    '''
    tweets = instance['tweet']
    return tokenizer(tweets, truncation=True)

### 2) Dataset loading

In [None]:
from datasets import load_dataset

# Load the three CSV splits of the selected dataset folder into one dataset
# object keyed by split name ('train', 'val', 'test').
dataset = load_dataset(
    'csv',
    data_files={
        'train': f'./datasets/{DATASET_FOLDER}/train_dataset.csv',
        'val': f'./datasets/{DATASET_FOLDER}/val_dataset.csv',
        'test': f'./datasets/{DATASET_FOLDER}/test_dataset.csv',
    }
)
dataset

### 3) Tokenization and preprocessing

In [None]:
# Run tokenize_function over the loaded dataset. batched=True is passed, so the
# function presumably receives groups of rows rather than one row at a time -
# TODO confirm against the datasets documentation.
tokenized_data = dataset.map(tokenize_function, batched=True)
tokenized_data

In [None]:
# Keep a handle on the test split while it still carries the raw 'tweet' text
# and the 'label' column; the test cell reads both back when it builds the
# results table.
original_test_data = tokenized_data['test']

# Drop the raw text, rename 'label' to 'labels' and switch the output format
# to 'torch'.
tokenized_data = tokenized_data.remove_columns(['tweet']).rename_column('label', 'labels')
tokenized_data.set_format('torch')
tokenized_data['train'].column_names

In [None]:
from torch.utils.data import DataLoader

# Every split shares the same batch size and collate function; only the
# training split is shuffled.
_shared_loader_args = {'batch_size': BATCH_SIZE, 'collate_fn': data_collator}

train_dataloader = DataLoader(tokenized_data['train'], shuffle=True, **_shared_loader_args)
val_dataloader = DataLoader(tokenized_data['val'], **_shared_loader_args)
test_dataloader = DataLoader(tokenized_data['test'], **_shared_loader_args)

In [None]:
# Pull a single batch from the training loader and show the shape of every
# tensor in it. next(iter(...)) fetches just the first batch and raises
# StopIteration straight away if the loader is empty, instead of leaving
# `batch` undefined (or stale from an earlier run of the cell).
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Renders a duration given in seconds as an h:mm:ss string.

    The value is first rounded to a whole number of seconds; the text is then
    whatever str() of the matching datetime.timedelta produces.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


def compute_metrics(metrics):
    '''
    Prints the value of every metric and returns the computed scores.

    `metrics` maps a metric name to an object whose compute() returns a
    dictionary containing that name. 'accuracy' is computed without extra
    arguments; every other metric is computed with average='weighted'.

    Returns a dictionary mapping each metric name to its score, so callers can
    store or compare the values instead of only reading them from the output.
    '''
    scores = {}
    for name, metric in metrics.items():
        # Accuracy is called without an averaging mode; every other metric is
        # asked for the weighted average.
        compute_args = {} if name == 'accuracy' else {'average': 'weighted'}
        score = metric.compute(**compute_args)[name]
        scores[name] = score

        # Upper-case only the first character; slicing (rather than indexing)
        # keeps an empty name from raising IndexError.
        label = name[:1].upper() + name[1:]
        print(f'  {label}: {score:.4f}')
    print('')
    return scores

In [None]:
from transformers import AdamW, get_scheduler
from datasets import load_metric

# Set the seed value all over the place to make this reproducible.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# Move the model to the target device before building the optimizer, so the
# optimizer is created from the parameters in their final location (the order
# PyTorch's optimizer guidance recommends).
model.to(device)

# NOTE(review): transformers.AdamW and datasets.load_metric appear to be
# deprecated in recent library releases - confirm against the installed
# versions before upgrading.
optimizer = AdamW(model.parameters(), lr=3e-5)

# Linear learning-rate schedule without warm-up, stepped once per batch.
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Measure the total training time for the whole run.
total_t0 = time.time()

# NOTE(review): the same metric objects are reused for every validation pass
# (and by the test cell); this assumes compute() clears the batches that were
# accumulated with add_batch() - confirm.
metrics = {
    'accuracy': load_metric('accuracy'),
    'precision': load_metric('precision'),
    'recall': load_metric('recall'),
    'f1': load_metric('f1'),
}

for epoch_i in range(NUM_EPOCHS):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, NUM_EPOCHS))

    # ========================================
    #               Training
    # ========================================
    print('')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Running sum of the per-batch losses, reported as an average below.
    total_train_loss = 0.0

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress update every 20 batches.
        if step % 20 == 0 and step != 0:
            # Elapsed time since the start of the epoch, formatted as h:mm:ss.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Average over the number of batches; max(..., 1) guards an empty loader.
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)

    # Measure how long this training epoch took.
    training_time = format_time(time.time() - t0)
    print('')
    print("  Average training loss: {0:.4f}".format(avg_train_loss))
    print("  Training took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print('')
    print('Validation...')

    t0 = time.time()

    eval_metrics = metrics

    model.eval()

    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)

        for metric in eval_metrics.values():
            metric.add_batch(predictions=predictions, references=batch['labels'])

    compute_metrics(eval_metrics)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print("  Validation took: {:}".format(validation_time))

print('Training complete!')
print('Total training took {:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))

In [None]:
# Write the fine-tuned model below the run's output folder.
model_output_dir = f'./outputs/{OUTPUTS_FOLDER}/model'
model.save_pretrained(save_directory=model_output_dir)

In [None]:
print('Testing...')

t0 = time.time()

eval_metrics = metrics

model.eval()

# Read the raw columns once, up front. Indexing original_test_data['tweet']
# inside the loop would look the whole column up again for every example.
source_tweets = list(original_test_data['tweet'])
source_labels = list(original_test_data['label'])

# One row per test example: the tweet text, the predicted class and the
# expected class.
test_results = {
    'tweet': [],
    'got': [],
    'expected': [],
}

# Position of the current batch's first example within the test split. A
# running offset stays correct whatever the size of each batch is; it relies
# only on test_dataloader iterating the split in order (it is built without
# shuffle).
offset = 0

for batch in test_dataloader:

    batch = {k: v.to(device) for k, v in batch.items()}

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # The predicted class is the index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)

    # Convert each tensor to plain Python numbers once per batch.
    got_labels = predictions.tolist()
    expected_labels = batch['labels'].tolist()

    for j, (got, expected) in enumerate(zip(got_labels, expected_labels)):
        index = offset + j
        test_results['tweet'].append(source_tweets[index])
        test_results['got'].append(got)
        test_results['expected'].append(expected)

        # This is just for testing: the label seen by the model must match the
        # label stored at the same position of the untouched test split.
        if expected != source_labels[index]:
            print('FALSE')

    offset += len(expected_labels)

    for metric in eval_metrics.values():
        metric.add_batch(predictions=predictions, references=batch['labels'])

compute_metrics(eval_metrics)

# Measure how long the testing run took.
testing_time = format_time(time.time() - t0)
print("  Testing took: {:}".format(testing_time))

In [None]:
import pandas as pd

import os

# Collect the per-example test results in a table and write it next to the
# saved model.
test_results_df = pd.DataFrame(test_results)

# Create the output folder if needed, so this cell also works when the
# model-saving cell has not been run.
results_dir = f'./outputs/{OUTPUTS_FOLDER}'
os.makedirs(results_dir, exist_ok=True)

# index=False leaves the row index out of the CSV.
test_results_df.to_csv(f'{results_dir}/test_results.csv', index=False)
test_results_df