# **Imports**

In [None]:
import pandas as pd
import json

import datasets
from datasets import load_metric, load_dataset, DatasetDict, Dataset

from sklearn.model_selection import train_test_split

from huggingface_hub import notebook_login

import transformers
from transformers.utils import send_example_telemetry 
from transformers import XLMRobertaForSequenceClassification, DataCollatorWithPadding, \
AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import random

from tqdm import tqdm

import matplotlib.pyplot as plt


import torch
from torch.utils.data import DataLoader
import evaluate
import torch


import os
import warnings
import gc 

# **Setting the environment**

In [2]:
# Point the Hugging Face caches at the project-local ../cache tree.
# NOTE(review): these are assigned after `datasets` / `transformers` were already
# imported; the libraries may have resolved their cache locations by then —
# confirm the variables take effect, or set them before those imports.
os.environ.update({
    'HF_HOME': '../cache/misc',
    'HF_DATASETS_CACHE': '../cache/datasets',
    'TRANSFORMERS_CACHE': '../cache/models',
})

---

---

---

# Model and Metric

**Model**

In [3]:
# Local checkpoint directory to start from (relative to the notebook's working directory).
model_path = "../models/second run - xlm-r - wiki + measuring_hs + ToLD + online_hs_recog"
# Load the sequence-classification model stored at that path.
model = AutoModelForSequenceClassification.from_pretrained(model_path)

**Metric**

In [None]:
# `datasets.load_metric` is deprecated in favour of the `evaluate` library, which
# this notebook already imports; `evaluate.load` provides the F1 metric used by
# `compute_metrics` through the same `.compute(predictions=..., references=...)` call.
metric = evaluate.load('f1')

In [5]:
def compute_metrics(pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        pred: Prediction object; only its ``predictions`` and ``label_ids``
            attributes are read.

    Returns:
        The value returned by ``metric.compute`` for the arg-max class ids
        (taken over the last axis of ``pred.predictions``) against
        ``pred.label_ids``.
    """
    class_ids = pred.predictions.argmax(axis=-1)
    return metric.compute(predictions=class_ids, references=pred.label_ids)

---

# **Loading data**

**Load the data and create a dataset instance**

In [6]:
# Name of the corpus; it selects the CSV under ../datasets and is reused later
# to name the tokenization cache directory.
dataset_name = 'GermEval-2018'
file_path = f'../datasets/{dataset_name}.csv'

# Read the whole CSV into a DataFrame.
data_raw = pd.read_csv(file_path)

**Replacing [SEP] with a model-specific separator token**

In [None]:
# Register `progress_apply` on pandas objects so the rewrite below shows a progress bar.
tqdm.pandas()

# Text that stands in for the generic '[SEP]' marker.
# NOTE(review): XLM-R's own sentence-pair template joins segments with '</s></s>';
# presumably '</s><s>' matches how the earlier training runs were prepared — confirm.
seprator = '</s><s>'


def _swap_sep(text):
    """Return `text` with every literal '[SEP]' replaced by `seprator`."""
    return text.replace('[SEP]', seprator)


# Overwrites the 'text' column of `data_raw` in place.
data_raw['text'] = data_raw['text'].progress_apply(_swap_sep)

**Splitting data, converting to dataset**

In [8]:
# Hold out 20 % of the rows for evaluation; the fixed random_state pins the split.
train, test = train_test_split(data_raw, test_size=0.2, random_state=42)

# Convert each frame to a Dataset. The DataFrame index is named 'index' first;
# an 'index' column is what `preprocess_data` later removes.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.rename_axis(index='index'))
    for split_name, frame in (('train', train), ('test', test))
})

# Keep the per-split handles available under their own names.
train_dataset = dataset['train']
test_dataset = dataset['test']

---

# Tokenizer and Data Collator

In [9]:
# Tokenizer is loaded by hub id ("xlm-roberta-base"), not from `model_path`;
# downloaded files are cached under ../cache/models/.
tokenizer = AutoTokenizer.from_pretrained(
    "xlm-roberta-base",
    cache_dir='../cache/models/',
    use_fast=True,
)

**Data Collator**

In [10]:
# Batch collator built around `tokenizer`; it is handed to the Trainer below.
# NOTE(review): `preprocess_data` already pads every example to max_length=512,
# so presumably this collator has no extra padding left to do — confirm.
data_collator = DataCollatorWithPadding(tokenizer)

**Define the tokenizing function**

In [11]:
def preprocess_data(dataset, tokenizer, cache_dir=None):
    """Tokenize ``dataset`` for training, optionally caching the result on disk.

    Args:
        dataset: Object exposing ``map``, ``remove_columns``, ``rename_column``
            and ``with_format``; expected to carry ``index``, ``text`` and
            ``label`` columns.
        tokenizer: Callable applied to each batch of ``text`` values with
            ``padding="max_length"``, ``truncation=True`` and ``max_length=512``.
        cache_dir: Directory holding the on-disk copy (stored under
            ``<cache_dir>/cached_features``). ``None`` disables both reading
            and writing the cache.

    Returns:
        The tokenized data with ``index``/``text`` removed, ``label`` renamed to
        ``labels`` and the output format set to ``"torch"`` — or the previously
        saved copy when one exists at the cache path.

    Note:
        An existing cache is returned as-is; it is not checked against the
        ``dataset`` or ``tokenizer`` passed in, so delete the directory after
        changing either.
    """
    use_cache = cache_dir is not None
    cache_path = os.path.join(cache_dir, "cached_features") if use_cache else None

    # Fast path: reuse what an earlier run stored on disk.
    if use_cache and os.path.exists(cache_path):
        return datasets.load_from_disk(cache_path)

    def encode_batch(batch):
        return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512)

    # Tokenize, then reshape the columns into what the Trainer consumes.
    encoded = (
        dataset
        .map(encode_batch, batched=True)
        .remove_columns(["index", "text"])
        .rename_column("label", "labels")
        .with_format("torch")
    )

    # Persist for the next run when caching is enabled.
    if use_cache:
        encoded.save_to_disk(cache_path)

    return encoded

**Tokenize!**

In [12]:
# Per-corpus cache directory, created relative to the current working directory.
cache_dir = f'{dataset_name}_cache'

# Tokenize both splits (or load the copy saved by an earlier run).
tokenized_data = preprocess_data(dataset, tokenizer, cache_dir=cache_dir)

Map:   0%|          | 0/6808 [00:00<?, ? examples/s]

Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6808 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1703 [00:00<?, ? examples/s]

---

# **Trainer**

**Args**

In [13]:
batch_size = 6  # per-device batch size, used for both training and evaluation
steps = 500     # shared interval for evaluation and checkpoint saving

training_args = TrainingArguments(
    output_dir='../checkpoints-xlmr-wiki',
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint on the same step schedule.
    evaluation_strategy='steps',
    eval_steps=steps,
    save_strategy='steps',
    save_steps=steps,
)

In [14]:
# Ask PyTorch to release cached, currently unused CUDA memory, then run a full
# garbage collection. The cell's displayed value is gc.collect()'s return value.
torch.cuda.empty_cache()
gc.collect()

1331

In [15]:
# Second cleanup pass: same two calls as the cell directly above
# (CUDA cache release followed by a garbage collection).
torch.cuda.empty_cache()
gc.collect()

0

**Trainer instance**

In [16]:
# Wire the model, arguments, data and metric function into one Trainer.
# The 'test' split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
)

# Training time!

In [17]:
# Free cached CUDA memory and collect garbage once more before training starts.
torch.cuda.empty_cache()
gc.collect()

0

In [18]:
# Repeat of the cleanup cell directly above (CUDA cache release, then garbage collection).
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
# Run fine-tuning with the settings in `training_args`; checkpoints go to its output_dir.
trainer.train()

# Saving the model

In [21]:
# Destination directory for the fine-tuned model.
# NOTE(review): this rebinds `model_path`, which earlier in the notebook names the
# checkpoint that was loaded — re-running the loading cell after this one would
# pick up this path instead.
# NOTE(review): the Trainer was built without a tokenizer, so presumably only the
# model files are written here — confirm whether the tokenizer should be saved too.
model_path = "../models/second run - xlm-r - wiki-detox + measuring-hate-speech + ToLD + online_hs_recog + GermEval"
trainer.save_model(model_path)