In [None]:
!pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForMaskedLM, AutoTokenizer
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict


**Continual pretraining**

In [None]:
# try domain related data with different sizes
# Read the whole sampled corpus into memory, then cut it into individual passages.
with open('/content/drive/MyDrive/dsir/out_dir_300m 100ksample/german_drama_100k_letters.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()
# NOTE(review): the delimiter is spelled '\\n' (a backslash followed by 'n'), not a
# newline character — confirm this matches how passages are separated in the file.
texts = raw_text.split('\\n')

In [None]:
# Preview the first five passages to sanity-check the split.
texts[:5]

['"Ferah Ulucay, die 23-jährige Generalsekretärin des Zentralrats, begrüsste mich freundlich und duzte mich. Ich fragte sie, wie sie eigentlich zum Zentralrat gekommen sei. Ich stamme aus einer kurdischen Familie, sagte sie. Ich bin in Bern aufgewachsen, völlig frei. Aber etwas stimmte für mich nicht. Alkohol trinken, Partys, Männer, das alles erschien mir als falsch. So fand ich zur Religion. Der Zentralrat war die einzige Anlaufstelle, als ich Hilfe suchte, nachdem es wegen der Konversion zum Islam zum kurzzeitigen Konflikt mit meinen Eltern kam.',
 '""Was ist das?, will er wissen. Insgeheim hofft er wohl auf ein Ründchen Poker. Die legt Karten., sage ich. Versteht er nicht. Also erkläre ich: Sie kann dir die Zukunft voraussagen. Wann du den ersten Kuss bekommst, wann du deine große Liebe triffst Das ist dem jungen Herrn Peppinello peinlich. Außerdem unterbricht die Kartenlegerin mich ziemlich ungehalten. Nein. Also solchen Quatsch machen wir hier nicht. Sie ist böse. Es ist also Ern

In [None]:
# use gbert-large for german literature
tokenizer = BertTokenizer.from_pretrained('deepset/gbert-large')
# Tokenize every passage in a single call: inputs longer than 512 tokens are
# truncated, and padding=True pads the remaining sequences to a common length.
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Build masked-language-modelling inputs and labels from a batch of token ids.

    Every non-special token is selected independently with probability
    ``mlm_probability``. Selected positions are replaced by the tokenizer's mask
    token in the returned inputs and keep their original id in the labels; all
    other label positions are set to ``-100``.

    Args:
        inputs: 2-D integer tensor of token ids, one row per sequence. It is not
            modified; masking is applied to a copy.
        tokenizer: Object providing ``get_special_tokens_mask``,
            ``convert_tokens_to_ids`` and ``mask_token``.
        mlm_probability: Per-token probability of being masked.

    Returns:
        Tuple ``(masked_inputs, labels)`` of tensors shaped like ``inputs``.
    """
    labels = inputs.clone()
    # Mask a copy so the caller's tensor keeps the original token ids.
    masked_inputs = inputs.clone()

    probability_matrix = torch.full(labels.shape, mlm_probability)
    # Special tokens (as reported by the tokenizer) must never be selected.
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
        for row in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    # Only the selected positions carry a real label.
    labels[~masked_indices] = -100
    masked_inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    return masked_inputs, labels

# Masking is applied once, up front: `inputs` holds the ids with mask-token
# substitutions, `labels` the original ids at masked positions (-100 elsewhere).
input_ids = torch.tensor(encodings['input_ids'])
inputs, labels = mask_tokens(input_ids, tokenizer)


In [None]:
# Hyper-parameters for the continual-pretraining (masked-LM) run: five epochs at
# batch size 32 per device, a checkpoint every 10,000 steps with at most two kept.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=4e-5
)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to an indexable
            collection holding one entry per example.
        labels: Indexable collection of label rows, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every item;
            # clone().detach() is the equivalent copy PyTorch recommends.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# Train on the *masked* ids. Passing `encodings` unchanged would feed the model the
# original token at every labelled position, so it could simply copy its input and
# the masked-LM loss would collapse without the model learning anything.
# Note: the mask was sampled once, so every epoch sees the same masked positions.
masked_encodings = dict(encodings)
masked_encodings['input_ids'] = inputs
dataset = TextDataset(masked_encodings, labels)


In [None]:
# Start from the public gbert-large checkpoint, loaded with a masked-LM head.
model = BertForMaskedLM.from_pretrained('deepset/gbert-large')

Some weights of the model checkpoint at deepset/gbert-large were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Trainer for continual pretraining; no evaluation set is supplied for this stage.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [None]:
# Run the continual-pretraining loop.
trainer.train()

  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss
500,0.0824
1000,0.0066
1500,0.0467
2000,0.0286
2500,0.1029
3000,0.0077
3500,0.0027
4000,0.0022
4500,0.0012
5000,0.0026


In [None]:
# Persist the continued model and its tokenizer side by side on Drive so the
# downstream cells can load them from this directory.
model.save_pretrained('/content/drive/MyDrive/dsir/gbert-large-continued-100k')
tokenizer.save_pretrained('/content/drive/MyDrive/dsir/gbert-large-continued-100k')

**Downstream tasks**

In [None]:
# Load the three annotated drama CSVs; each is exposed under a single 'train' split.
# dataset1: polarity, dataset2: main emotion class, dataset3: sub-emotion.
dataset1 = load_dataset('csv', data_files={'train': '/content/drive/MyDrive/dsir/continuous training/drama_polarity_filtered.csv'})
dataset2 = load_dataset('csv', data_files={'train': '/content/drive/MyDrive/dsir/continuous training/drama_main_class_filtered.csv'})
dataset3 = load_dataset('csv', data_files={'train': '/content/drive/MyDrive/dsir/continuous training/drama_sub_emo_filtered.csv'})

def data_split(dataset, test_size=0.2, seed=42):
    """Split the ``'train'`` entry of ``dataset`` into train and test partitions.

    Args:
        dataset: Mapping whose ``'train'`` entry offers ``train_test_split``.
        test_size: Size of the held-out partition, forwarded unchanged
            (defaults to the previously hard-coded 0.2).
        seed: Seed forwarded to the shuffle so the split is reproducible across
            runs. Pass ``None`` to get a different split on every call.

    Returns:
        ``DatasetDict`` with ``'train'`` and ``'test'`` entries.
    """
    # Named `parts` so the imported sklearn `train_test_split` is not shadowed.
    parts = dataset['train'].train_test_split(test_size=test_size, seed=seed)
    return DatasetDict({
        'train': parts['train'],
        'test': parts['test']
    })

# Create a train/test split for each of the three tasks.
dataset1_splited = data_split(dataset1)
dataset2_splited = data_split(dataset2)
dataset3_splited = data_split(dataset3)

# Show the resulting splits for a quick visual check.
print(dataset1_splited)
print(dataset2_splited)
print(dataset3_splited)


In [None]:
# check every type of annotation for each dataset

def get_unique_values(dataset, column_name):
    """Return the set of distinct values found in ``column_name`` of the ``'train'`` split."""
    return set(dataset['train'][column_name])

# Collect the distinct annotation strings of each task's label column.
column1 = get_unique_values(dataset1, 'polarity')
column2 = get_unique_values(dataset2, 'main_emotion_class')
column3 = get_unique_values(dataset3, 'tag_type')

# Print them so the label tables defined below can be checked against the data.
print(column1)
print(column2)
print(column3)

In [None]:
# Dataset 1 (polarity): annotation string -> integer class id.
def label_mapping_1(data):
    """Swap the ``polarity`` string of one example for an integer ``label``.

    The example is modified in place and returned. An annotation missing from the
    table raises ``KeyError`` before anything is changed.
    """
    classes = ('positiv', 'negativ', 'gemischt', 'Emotionale Bewegtheit')
    class_ids = {name: idx for idx, name in enumerate(classes)}
    label = class_ids[data['polarity']]
    data['label'] = label
    del data['polarity']
    return data

# Replace the string `polarity` column by an integer `label` column.
dataset1_splited = dataset1_splited.map(label_mapping_1)

# Dataset 2 (main emotion class): annotation string -> integer class id.
def label_mapping_2(data):
    """Swap the ``main_emotion_class`` string of one example for an integer ``label``.

    The example is modified in place and returned. An annotation missing from the
    table raises ``KeyError`` before anything is changed.
    """
    classes = (
        'Emotionen der Freude',
        'Emotionen der Angst',
        'Emotionen der Ablehnung',
        'Emotionale Bewegtheit',
        'Emotionen des Leids',
        'Emotionen der Zuneigung',
    )
    class_ids = {name: idx for idx, name in enumerate(classes)}
    label = class_ids[data['main_emotion_class']]
    data['label'] = label
    del data['main_emotion_class']
    return data
# Replace the string `main_emotion_class` column by an integer `label` column.
dataset2_splited = dataset2_splited.map(label_mapping_2)

# Dataset 3 (sub-emotion): annotation string -> integer class id.
def label_mapping_3(data):
    """Swap the ``tag_type`` string of one example for an integer ``label``.

    The example is modified in place and returned. An annotation missing from the
    table raises ``KeyError`` before anything is changed.
    """
    classes = (
        'Verehrung',
        'Angst',
        'Leid',
        'Liebe',
        'Freude',
        'Emotionale Bewegtheit',
        'Mitleid',
        'Abscheu',
        'Ärger',
        'Freundschaft',
        'Lust',
        'Verzweiflung',
        'Schadenfreude',
    )
    class_ids = {name: idx for idx, name in enumerate(classes)}
    label = class_ids[data['tag_type']]
    data['label'] = label
    # The string column is no longer needed once the integer label exists.
    del data['tag_type']
    return data

# Replace the string `tag_type` column by an integer `label` column.
dataset3_splited = dataset3_splited.map(label_mapping_3)

In [None]:
# Tokenizer for the downstream tasks, loaded from the public gbert-large repo.
# NOTE(review): this rebinds the `tokenizer` name used during continual pretraining.
tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-large")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True)

# Tokenize all three task datasets in batches; no padding is applied at this stage.
tokenized_dataset1 = dataset1_splited.map(preprocess_function, batched=True)
tokenized_dataset2 = dataset2_splited.map(preprocess_function, batched=True)
tokenized_dataset3 = dataset3_splited.map(preprocess_function, batched=True)

In [None]:
# Sanity check: tokenize one training example and decode it back to text.
sample_text = tokenized_dataset1["train"][5]["text"]
tokenized_sample = tokenizer(sample_text)

print("Original Text:", sample_text)
print("Tokenized Text:", tokenized_sample)
print("Decoded Tokens:", tokenizer.decode(tokenized_sample['input_ids']))


In [None]:
from transformers import DataCollatorWithPadding

# Padding is deferred to batch collation, since tokenization above only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Accuracy metric object used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(scores, labels)``; the predicted class of each row is
            the index of its largest score along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Polarity classes: the name -> id table is written out once and the
# id -> name table is derived as its inverse.
label2id = {
    'positiv': 0,
    'negativ': 1,
    'gemischt': 2,
    'Emotionale Bewegtheit': 3,
}
id2label = {class_id: name for name, class_id in label2id.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the continually pretrained checkpoint from Drive as a 4-class sequence
# classifier, attaching the polarity label names to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/dsir/gbert-large-continued-100k", num_labels=4, id2label=id2label, label2id=label2id
)

In [None]:
import os
from getpass import getpass

# Secrets must not live in the notebook: reuse a token already present in the
# environment, otherwise ask for it interactively (the input is not echoed).
# The token that used to be hard-coded here is exposed and should be revoked.
if not os.environ.get('HF_API_TOKEN'):
    os.environ['HF_API_TOKEN'] = getpass('Hugging Face access token: ')


In [None]:
import os

from huggingface_hub import login

# Never embed the access token in the notebook: take it from the environment, and
# fall back to the interactive login prompt when the variable is not set.
# The token that used to be hard-coded here is exposed and should be revoked.
login(token=os.environ.get("HF_API_TOKEN"))


In [None]:
# Release cached, unused GPU memory before starting the next training run.
torch.cuda.empty_cache()

In [None]:
# Fine-tune on the polarity task: evaluate and checkpoint once per epoch, reload
# the best checkpoint when training ends, and push the result to the Hub.
# NOTE(review): metric_for_best_model is not set, so "best" is chosen by the
# Trainer default (evaluation loss per the Transformers docs) — confirm that
# this, rather than accuracy, is the intended selection criterion.
training_args = TrainingArguments(
    output_dir="model_polarity_100k",
    learning_rate=4e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset1["train"],
    eval_dataset=tokenized_dataset1["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.634473,0.752853,0.745654
2,0.711700,0.54391,0.798898,0.799321
3,0.711700,0.640683,0.787092,0.787261
4,0.345400,0.710637,0.811492,0.809504


TrainOutput(global_step=1272, training_loss=0.45350167136522207, metrics={'train_runtime': 742.5429, 'train_samples_per_second': 54.731, 'train_steps_per_second': 1.713, 'total_flos': 9259353999305856.0, 'train_loss': 0.45350167136522207, 'epoch': 4.0})

In [None]:
# Main emotion classes: the name -> id table is written out once and the
# id -> name table is derived as its inverse.
label2id = {
    'Emotionen der Freude': 0,
    'Emotionen der Angst': 1,
    'Emotionen der Ablehnung': 2,
    'Emotionale Bewegtheit': 3,
    'Emotionen des Leids': 4,
    'Emotionen der Zuneigung': 5,
}
id2label = {class_id: name for name, class_id in label2id.items()}

In [None]:
# Reload the continually pretrained checkpoint from Drive as a fresh 6-class
# classifier for the main-emotion task (the previous `model` is replaced).
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/dsir/gbert-large-continued-100k", num_labels=6, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/dsir/gbert-large-continued-100k and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Release cached, unused GPU memory before starting the next training run.
torch.cuda.empty_cache()

In [None]:
# Fine-tune on the main-emotion-class task: evaluate and checkpoint once per
# epoch, reload the best checkpoint when training ends, and push to the Hub.
# NOTE(review): metric_for_best_model is not set, so "best" is chosen by the
# Trainer default (evaluation loss per the Transformers docs) — confirm that
# this, rather than accuracy, is the intended selection criterion.
training_args = TrainingArguments(
    output_dir="model_main_class_100k",
    learning_rate=4e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset2["train"],
    eval_dataset=tokenized_dataset2["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Sub-emotion classes: the name -> id table is written out once and the
# id -> name table is derived as its inverse.
label2id = {
    'Verehrung': 0,
    'Angst': 1,
    'Leid': 2,
    'Liebe': 3,
    'Freude': 4,
    'Emotionale Bewegtheit': 5,
    'Mitleid': 6,
    'Abscheu': 7,
    'Ärger': 8,
    'Freundschaft': 9,
    'Lust': 10,
    'Verzweiflung': 11,
    'Schadenfreude': 12,
}

id2label = {class_id: name for name, class_id in label2id.items()}

In [None]:
# Reload the continually pretrained checkpoint from Drive as a fresh 13-class
# classifier for the sub-emotion task (the previous `model` is replaced).
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/dsir/gbert-large-continued-100k", num_labels=13, id2label=id2label, label2id=label2id
)

In [None]:
# Release cached, unused GPU memory before starting the next training run.
torch.cuda.empty_cache()

In [None]:
# Fine-tune on the sub-emotion task: evaluate and checkpoint once per epoch,
# reload the best checkpoint when training ends, and push to the Hub.
# NOTE(review): metric_for_best_model is not set, so "best" is chosen by the
# Trainer default (evaluation loss per the Transformers docs) — confirm that
# this, rather than accuracy, is the intended selection criterion.
training_args = TrainingArguments(
    output_dir="model_sub_emo_100k",
    learning_rate=4e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset3["train"],
    eval_dataset=tokenized_dataset3["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()