In [76]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset

# Multilingual BERT checkpoint; any other checkpoint that covers Danish can be
# substituted here.
model_name = "bert-base-multilingual-cased"

# The tokenizer and the sequence-classification model are both built from the
# same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# load texts and labels
def load_data(human_dir='data/heste-nettet-nyheder',
              bot_dir='data/heste-nettet-nyheder-ai/gpt-3.5-turbo/'):
    """Load the human-written and AI-generated articles with binary labels.

    Every regular file directly inside ``human_dir`` is labelled 0 and every
    regular file directly inside ``bot_dir`` is labelled 1. The first two
    lines of each file are dropped (presumably header/metadata lines --
    confirm against the data files) and the remainder is kept as the text.

    Args:
        human_dir: Directory holding the human-written articles.
        bot_dir: Directory holding the AI-generated articles.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list of ``str`` with the
        human articles first and the bot articles after them; ``labels`` is
        an int64 ``np.ndarray`` of the same length (0 = human, 1 = bot).

    Raises:
        OSError: If a directory cannot be listed or a file cannot be read.
    """
    import os

    def _read_texts(directory):
        # Read every regular file in `directory`, minus its first two lines.
        texts = []
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order reproducible between runs and machines.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            if not os.path.isfile(path):
                # Sub-directories (e.g. notebook checkpoints) cannot be opened
                # as text files, so skip them instead of crashing.
                continue
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            texts.append('\n'.join(content.split('\n')[2:]))
        return texts

    human_texts = _read_texts(human_dir)
    bot_texts = _read_texts(bot_dir)

    # Keep the texts as a plain list of str: a NumPy unicode array would pad
    # every article to the length of the longest one.
    my_texts = human_texts + bot_texts
    # Explicit int64 so the labels become long tensors on every platform.
    my_labels = np.array([0] * len(human_texts) + [1] * len(bot_texts),
                         dtype=np.int64)

    return my_texts, my_labels

def preprocess(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    The tokenizer is called with padding and truncation enabled, a maximum
    length of 512, and PyTorch tensors as the return format.

    Args:
        text: The input handed straight to the tokenizer (the notebook passes
            both a single string and a list of strings).

    Returns:
        Whatever the tokenizer call returns for these settings.
    """
    return tokenizer(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

# NOTE(review): load_data() is called a second time further down in this cell
# to build the dataset, so the article files are read twice. Confirm that
# `texts` and `labels` are still needed before dropping this call.
texts, labels = load_data()

class TextDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with their labels.

    All texts are tokenized once, at construction time, through a single call
    to ``preprocess``; items are then served by indexing into the resulting
    ``input_ids`` and ``attention_mask`` tensors.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels, one per text, in the same order.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        # Fail early: a mismatch would otherwise show up as an IndexError in
        # the middle of an epoch, or as extra labels being silently ignored.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        self.texts = texts
        self.labels = labels
        self.inputs = preprocess(texts)

        # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
        self.input_ids = self.inputs['input_ids']
        self.attention_mask = self.inputs['attention_mask']

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the label and token tensors stored at position ``idx``."""
        return {
            'label': self.labels[idx],
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
        }


# Build the dataset and a shuffling loader that yields batches of 32 items.
your_texts, your_labels = load_data()
dataset = TextDataset(your_texts, your_labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
# torch.optim.AdamW is used instead of the AdamW class exported by
# transformers, which that library deprecated in favour of the PyTorch
# optimizer. weight_decay and eps are set explicitly to the defaults of the
# transformers class so the hyper-parameters stay the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=1e-5, weight_decay=0.0, eps=1e-6
)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    loss_total = 0
    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the module-level `labels` is not overwritten
        # by the last batch; cast to long because the classification loss
        # expects integer class indices.
        batch_labels = batch['label'].to(device=device, dtype=torch.long)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        loss_total += loss.item()
        print(f"Epoch {epoch}, batch {i}/{len(dataloader)}, loss: {loss.item()}")

    # Sum (not mean) of the per-batch losses for this epoch.
    print(f"Epoch {epoch} loss: {loss_total}")

# Save the model
model.save_pretrained("models/bert_classifier", save_function=torch.save)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0, batch 0/7, loss: 0.709789514541626
Epoch 0, batch 1/7, loss: 0.7127237319946289
Epoch 0, batch 2/7, loss: 0.6989465355873108
Epoch 0, batch 3/7, loss: 0.6916332840919495
Epoch 0, batch 4/7, loss: 0.6905733942985535
Epoch 0, batch 5/7, loss: 0.6598642468452454
Epoch 0, batch 6/7, loss: 0.7205790877342224
Epoch 0 loss: 4.884109795093536
Epoch 1, batch 0/7, loss: 0.6408108472824097
Epoch 1, batch 1/7, loss: 0.6342605352401733
Epoch 1, batch 2/7, loss: 0.6334906816482544
Epoch 1, batch 3/7, loss: 0.5846535563468933
Epoch 1, batch 4/7, loss: 0.5929581522941589
Epoch 1, batch 5/7, loss: 0.5860046148300171
Epoch 1, batch 6/7, loss: 0.5745774507522583
Epoch 1 loss: 4.246755838394165
Epoch 2, batch 0/7, loss: 0.5046137571334839
Epoch 2, batch 1/7, loss: 0.4994351267814636
Epoch 2, batch 2/7, loss: 0.5142229795455933
Epoch 2, batch 3/7, loss: 0.5040744543075562
Epoch 2, batch 4/7, loss: 0.4356571435928345
Epoch 2, batch 5/7, loss: 0.35930025577545166
Epoch 2, batch 6/7, loss: 0.29508399

In [24]:
# Reload the fine-tuned classifier from "models/bert_classifier", the directory
# the training cell writes with save_pretrained. This rebinds the global `model`.
# NOTE(review): this cell does not move the model to `device`; make sure it is
# moved before it is fed tensors that were placed on `device`.
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("models/bert_classifier")

In [20]:
# Smoke test: classify one string and print the raw logits and predicted class.

test_string = "This is a test"

# The reload cell never moves the model to `device`, while the inputs below
# are; moving it here (a no-op when it is already there) avoids a device
# mismatch when running on a GPU.
model.to(device)
model.eval()
with torch.no_grad():
    inputs = preprocess(test_string)
    # Keep the batch dimension as returned by the tokenizer; the model expects
    # (batch, sequence) inputs.
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    print(logits)
    # Take the argmax over the class dimension rather than over the flattened
    # tensor, so the result is a class index (0 = human, 1 = bot).
    print(torch.argmax(logits, dim=-1).item())

tensor([[0.4077, 0.2017]])
0
