In [1]:
import pandas as pd

In [2]:
# Read the labelled reference-letter sentences from the Excel workbook into a DataFrame
DATA_FILE = "arbeitszeugnis_daten.xlsx"
df = pd.read_excel(DATA_FILE)

In [3]:
# Display the loaded frame (the rendered output below is truncated to the first and last rows)
df

Unnamed: 0,Satz,Kriterium,Note
0,Sein exzellentes Fachwissen hielt er durch kon...,Fachwissen,1
1,verfügt über ein hervorragendes und auch in an...,Fachwissen,1
2,In kürzester Zeit beherrschte sie die Fertigun...,Fachwissen,1
3,Er besitzt ein umfassendes und detailliertes W...,Fachwissen,1
4,Sein gutes Fachwissen hielt er durch kontinuie...,Fachwissen,2
...,...,...,...
181,Human Resources,Irrelevant,99
182,Frau trat in unser Unternehmen ein,Irrelevant,99
183,Die ist ein führender Hersteller der Kaffeeind...,Irrelevant,99
184,war beschäftigt,Irrelevant,99


In [4]:
# Count the distinct criterion labels (missing values are not counted)
df["Kriterium"].nunique(dropna=True)

10

In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Keep the fitted encoders around: they hold the mapping from class index back to the
# original label (`encoder.categories_[0]` / `encoder.inverse_transform`), which is needed
# to interpret model predictions later. Run this cell once per fresh load of `df`;
# re-running it on the already encoded columns would refit on the integer codes.
kriterium_encoder = OrdinalEncoder()
note_encoder = OrdinalEncoder()

# fit_transform returns an (n_samples, 1) array; ravel() flattens it to one value per row
df["Kriterium"] = kriterium_encoder.fit_transform(df[["Kriterium"]]).ravel().astype("int")
df["Note"] = note_encoder.fit_transform(df[["Note"]]).ravel().astype("int")

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import pandas as pd

# Expects the DataFrame `df` prepared in the cells above, with the text column 'Satz'
# and the integer-encoded label columns 'Note' and 'Kriterium'.

class NLPDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item and returns both targets.

    Each item is a dict with:
        * ``input_ids`` / ``attention_mask``: the tokenizer output with the leading
          batch dimension removed,
        * ``labels``: a nested dict with the ``note`` and ``kriterium`` class indices
          as scalar ``long`` tensors.

    Args:
        df: DataFrame with a text column 'Satz' and integer-encoded columns
            'Note' and 'Kriterium'.
        tokenizer: Callable invoked as
            ``tokenizer(sentence, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: Length every sentence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (sentences) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the sentence at positional index ``idx`` and attach its labels."""
        # Fetch the row once instead of re-indexing the frame for every column
        row = self.df.iloc[idx]
        sentence = row['Satz']
        note = torch.tensor(int(row['Note']), dtype=torch.long)
        kriterium = torch.tensor(int(row['Kriterium']), dtype=torch.long)

        encoding = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch dimension added by return_tensors='pt';
            # a bare squeeze() would also collapse the sequence axis when its size is 1
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': {
                'note': note,
                'kriterium': kriterium
            }
        }

class MultilabelClassifier(nn.Module):
    """Shared BERT encoder with two independent classification heads.

    One head predicts the grade ('note'), the other the criterion ('kriterium').
    Both heads read the encoder output at sequence position 0.

    Args:
        n_note: Number of classes for the 'note' head.
        n_kriterium: Number of classes for the 'kriterium' head.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
        dropout: Dropout probability applied in front of each linear head.
    """

    def __init__(self, n_note, n_kriterium, model_name='dbmdz/bert-base-german-uncased', dropout=0.2):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768, so the
        # heads still fit when a checkpoint with a different hidden size is used
        hidden_size = self.bert_model.config.hidden_size

        self.note = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(in_features=hidden_size, out_features=n_note)
        )
        self.kriterium = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(in_features=hidden_size, out_features=n_kriterium)
        )

    def forward(self, x):
        """Return the logits of both heads for a batch.

        Args:
            x: Mapping with 'input_ids' and 'attention_mask' tensors.

        Returns:
            Dict with keys 'note' and 'kriterium' holding each head's logits.
        """
        # Element [0] of the encoder output is the per-token hidden states;
        # [:, 0, :] keeps the vector at the first position of every sequence
        first_token = self.bert_model(x['input_ids'], attention_mask=x['attention_mask'])[0][:, 0, :]

        return {
            'note': self.note(first_token),
            'kriterium': self.kriterium(first_token)
        }

# Load the tokenizer for the same pretrained German BERT checkpoint the model uses
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-uncased')

# Create the dataset and data loader.
# NOTE: the whole DataFrame is used for training; no validation split is made here.
train_dataset = NLPDataset(df, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Size each head from the encoded labels instead of hard-coding the class counts:
# the labels are integer codes starting at 0, so max + 1 guarantees every label
# is a valid class index for CrossEntropyLoss.
n_note = int(df['Note'].max()) + 1
n_kriterium = int(df['Kriterium'].max()) + 1

# Instantiate the model
model = MultilabelClassifier(n_note=n_note, n_kriterium=n_kriterium)

# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def criterion(loss_func, outputs, labels):
    """Sum ``loss_func`` over every output head.

    Args:
        loss_func: Callable ``loss_func(logits, targets)`` returning a scalar loss.
        outputs: Dict mapping head name to that head's logits.
        labels: Dict mapping the same head names to the target tensors.

    Returns:
        The sum of the per-head losses (``0`` if ``outputs`` is empty).
    """
    total = 0
    for key, logits in outputs.items():
        # Align the targets with the logits' device rather than relying on a
        # module-level `device` variable being defined
        total = total + loss_func(logits, labels[key].to(logits.device))
    return total

def _batch_loss(model, batch, device, loss_func):
    """Move one batch to ``device``, run the model and return the summed per-head loss."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = {key: value.to(device) for key, value in batch['labels'].items()}

    outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})
    return criterion(loss_func, outputs, labels)


def training(model, device, lr_rate, epochs, train_loader, valid_loader=None):
    """Train ``model`` with Adam and cross-entropy summed over all output heads.

    Args:
        model: Module called with a dict of 'input_ids' and 'attention_mask' that
            returns a dict of logits per head.
        device: Device the batches are moved to (the model is expected to be there already).
        lr_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask' and a
            'labels' dict; must support ``len()``.
        valid_loader: Optional iterable of batches in the same format; when given,
            the mean validation loss is printed after every epoch.

    Returns:
        List with one entry per epoch: the mean training loss of that epoch.
    """
    checkpoint_losses = []

    optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
    n_total_steps = len(train_loader)

    loss_func = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        # Reset per epoch: accumulating across epochs would report a running mean over
        # all epochs so far and hide the actual per-epoch trend
        epoch_losses = []

        for batch in train_loader:
            loss = _batch_loss(model, batch, device, loss_func)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_losses.append(loss.item())

        # Nothing to report if the loader produced no batches
        if epoch_losses:
            checkpoint_loss = torch.tensor(epoch_losses).mean().item()
            checkpoint_losses.append(checkpoint_loss)
            print(f'Epoch [{epoch+1}/{epochs}], Step [{n_total_steps}/{n_total_steps}], Loss: {checkpoint_loss:.4f}')

        if valid_loader is not None:
            # eval() switches off dropout; no_grad() avoids building the autograd graph
            model.eval()
            with torch.no_grad():
                valid_losses = [
                    _batch_loss(model, batch, device, loss_func).item()
                    for batch in valid_loader
                ]

                avg_valid_loss = torch.tensor(valid_losses).mean().item()
                print(f'Validation Loss: {avg_valid_loss:.4f}')

    return checkpoint_losses

# Run training on the full loader: Adam learning rate 1e-4, 10 epochs, no validation loader
checkpoint_losses = training(
    model,
    device,
    lr_rate=0.0001,
    epochs=10,
    train_loader=train_loader,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/247k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Epoch [1/10], Step [93/93], Loss: 3.1468
Epoch [2/10], Step [93/93], Loss: 2.5065
Epoch [3/10], Step [93/93], Loss: 2.2211
Epoch [4/10], Step [93/93], Loss: 2.0664
Epoch [5/10], Step [93/93], Loss: 2.2850


In [None]:
# Tokenize one example sentence with the same padding/truncation settings as the training data
encoding = tokenizer("Banias War sehr gut bei der Aufgabenbearbeitung", truncation=True, padding='max_length', max_length=128, return_tensors='pt')
# Move the inputs to the device the model was placed on instead of hard-coding "cuda",
# so the cell also runs on a CPU-only machine
encoding = encoding.to(device)

# eval() disables dropout so the prediction is deterministic;
# no_grad() skips autograd bookkeeping that inference does not need
model.eval()
with torch.no_grad():
    pred = model({'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask']})

In [None]:
import torch.nn

# dim=-1 normalizes over the last (class) axis; omitting `dim` relies on a
# deprecated implicit choice and emits a warning
softmax = torch.nn.Softmax(dim=-1)

In [None]:
# `pred` is a dict with one logits tensor per head, so softmax has to be applied
# to each tensor separately rather than to the dict itself
probabilities = {head: softmax(logits) for head, logits in pred.items()}
# Most likely encoded class per head for the single input sentence
index = {head: probs.argmax(dim=-1).item() for head, probs in probabilities.items()}