# Poem Polarity Detection

In [1]:
import sys
relative_path = "../../"
sys.path.append(relative_path)
import os
from lookup import LookupCreator
from sensepolar.polarity import WordPolarity
from sensepolar.embed.bertEmbed import BERTWordEmbeddings
from sensepolar.embed.albertEmbed import ALBERTWordEmbeddings
from sensepolar.embed.robertaEmbed import RoBERTaWordEmbeddings
from sensepolar.polarDim import PolarDimensions
from sensepolar.oracle.dictionaryapi import Dictionary
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from datasets import load_dataset
from datasets import Dataset as Data
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.utils import io
import re
import torch.nn.utils.rnn as rnn_utils
from sensepolar.plotter import PolarityPlotter
from sensepolar.embed.germanBertEmbed import germanBERTWordEmbeddings


## BERT Word Embeddings

#### Data Fields

    id: index of the example
    verse_text: The text of the poem verse
    label: The sentiment label. Here
        0 = negative
        1 = positive
        2 = no impact
        3 = mixed (both negative and positive)

In [2]:
class PoemSentimentDataset(Dataset):
    """Torch dataset that tokenizes poem verses for a BERT-style classifier.

    Args:
        verse_text: Indexable sequence of verse strings.
        labels: Indexable sequence of integer class labels, aligned with
            ``verse_text``.
        tokenizer: Callable tokenizer; it is invoked with the verse and the
            keyword arguments shown in ``__getitem__`` and must return a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every encoded verse is padded/truncated to.
    """

    def __init__(self, verse_text, labels, tokenizer, max_length):
        self.verse_text = verse_text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of verses."""
        return len(self.verse_text)

    def __getitem__(self, idx):
        """Return the encoded verse at ``idx`` together with its label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output without its leading axis) and ``label`` (a scalar
            ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.verse_text[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # Drop only the leading axis (expected to be a batch axis of size 1).
        # A bare squeeze() would also collapse the sequence axis whenever
        # max_length == 1, producing 0-d tensors that cannot be batched with
        # the expected shape.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }


In [3]:
class BERTEmbeddingClassifier(nn.Module):
    """Sentence classifier: BERT encoder, dropout, and one linear layer.

    The encoder's hidden state at sequence position 0 (the [CLS] slot) is
    used as the sentence representation.

    Args:
        num_classes: Number of output classes.
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        model_path: File the best checkpoint is written to by
            ``train_model``. The default is the location that used to be
            hard-coded.
    """

    def __init__(self, num_classes, bert_model_name='bert-base-uncased',
                 model_path='model/trained_model.pth'):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.model_path = model_path

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of encoded sentences."""
        word_embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        output = self.dropout(word_embeddings[:, 0, :])  # Use the [CLS] token representation
        logits = self.fc(output)
        return logits

    def _score_batch(self, batch, loss_fn, device):
        """Move one batch to ``device``; return ``(loss, logits, labels)``."""
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        logits = self(input_ids, attention_mask=attention_mask)
        return loss_fn(logits, labels), logits, labels

    def train_model(self, train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device):
        """Train with early stopping and restore the best checkpoint.

        After every epoch the mean validation loss is compared with the best
        one seen so far; on improvement the weights are saved to
        ``self.model_path``. Training stops once ``patience`` consecutive
        epochs bring no improvement.

        Args:
            train_loader: Iterable of batches with ``input_ids``,
                ``attention_mask`` and ``label`` entries; must support ``len``.
            valid_loader: Same format, used for validation.
            num_epochs: Maximum number of epochs.
            patience: Number of non-improving epochs tolerated.
            optimizer: Optimizer over this module's parameters.
            loss_fn: Callable ``loss_fn(logits, labels)``.
            device: Device the batches are moved to.
        """
        best_valid_loss = float('inf')
        epochs_without_improvement = 0
        # Only reload a checkpoint that this run actually produced; otherwise
        # a missing file would raise, or a stale file from an earlier run
        # would silently overwrite the current weights.
        checkpoint_written = False

        for epoch in range(num_epochs):
            self.train()
            train_loss = 0.0
            for batch in train_loader:
                optimizer.zero_grad()
                loss, _, _ = self._score_batch(batch, loss_fn, device)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            avg_train_loss = train_loss / max(len(train_loader), 1)
            print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {avg_train_loss}")

            # Validation
            self.eval()
            valid_loss = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    loss, _, _ = self._score_batch(batch, loss_fn, device)
                    valid_loss += loss.item()

            avg_valid_loss = valid_loss / max(len(valid_loader), 1)
            print(f"Epoch {epoch+1}/{num_epochs} - Validation Loss: {avg_valid_loss}")

            if avg_valid_loss < best_valid_loss:
                best_valid_loss = avg_valid_loss
                epochs_without_improvement = 0
                # torch.save does not create missing parent directories.
                checkpoint_dir = os.path.dirname(self.model_path)
                if checkpoint_dir:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(self.state_dict(), self.model_path)
                checkpoint_written = True
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print(f"Early stopping. No improvement in {patience} epochs.")
                    break

        if checkpoint_written:
            # map_location keeps the reload working when the checkpoint was
            # written from a different device than the one used now.
            self.load_state_dict(torch.load(self.model_path, map_location=device))

    def test_model(self, test_loader, loss_fn, device):
        """Print the mean loss and the accuracy over ``test_loader``."""
        self.eval()
        test_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        with torch.no_grad():
            for batch in test_loader:
                loss, logits, labels = self._score_batch(batch, loss_fn, device)
                test_loss += loss.item()

                _, predicted = torch.max(logits, dim=1)
                correct_predictions += (predicted == labels).sum().item()
                total_samples += labels.size(0)

        # Guard both averages so an empty loader reports zeros, not a crash.
        avg_test_loss = test_loss / max(len(test_loader), 1)
        accuracy = correct_predictions / total_samples if total_samples else 0.0
        print(f"Test Loss: {avg_test_loss}, Accuracy: {accuracy}")

    def predict(self, input_text, tokenizer, max_length, device):
        """Return the predicted class index for a single text.

        Args:
            input_text: Text to classify.
            tokenizer: Callable tokenizer, used the same way as in the
                dataset class (called directly instead of through the legacy
                ``encode_plus`` method).
            max_length: Length the text is padded/truncated to.
            device: Device the encoded input is moved to.

        Returns:
            The index of the highest-scoring class as a Python int.
        """
        inputs = tokenizer(
            input_text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        self.eval()
        with torch.no_grad():
            logits = self(input_ids, attention_mask=attention_mask)

        _, predicted = torch.max(logits, dim=1)
        return predicted.item()


## SensePOLAR Embeddings

In [4]:
class PoemSentimentDataset(Dataset):
    """Torch dataset that turns poem verses into SensePOLAR embeddings.

    Note: this class reuses the name of the token-based dataset defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        verse_text: Indexable sequence of verse strings.
        labels: Indexable sequence of integer class labels, aligned with
            ``verse_text``.
        word_polarity_model: Object exposing ``analyze_word(word, context)``;
            its result is iterated as triples whose first two items name a
            polar pair and whose third item is the value for that pair.
        method: ``'cls'`` embeds the literal token ``[CLS]`` appended to the
            verse; any other value averages the embeddings of the
            whitespace-separated words of the verse.
        dimension: Maximum number of leading polar dimensions to keep.
        quantize: When true (the default, matching the previous behaviour)
            the embedding is cast with ``Tensor.long()``, which truncates
            every value toward zero. Pass ``False`` to keep the float values.
    """

    def __init__(self, verse_text, labels, word_polarity_model, method='cls', dimension=39,
                 quantize=True):
        self.verse_text = verse_text
        self.labels = labels
        self.word_polarity_model = word_polarity_model
        self.polar_embeddings_cache = {}
        self.dimension = dimension
        self.method = method
        self.quantize = quantize

    def __len__(self):
        """Return the number of verses."""
        return len(self.verse_text)

    def get_sense_polar_embedding(self, word, context):
        """Return the cached polar embedding of ``word`` within ``context``.

        The values are grouped per polar pair and the groups are ordered by
        the sorted pair key, so the dimension order does not depend on the
        order in which ``analyze_word`` reports the pairs.

        Returns:
            A list with one list of values per polar pair.
        """
        key = (word, context)
        if key not in self.polar_embeddings_cache:
            # Silence whatever analyze_word prints while it runs.
            with io.capture_output():
                polar_embedding = self.word_polarity_model.analyze_word(word, context)
            antonym_dict = {}
            for pair in polar_embedding:
                antonym_dict.setdefault((pair[0], pair[1]), []).append(pair[2])
            self.polar_embeddings_cache[key] = [antonym_dict[pair_key] for pair_key in sorted(antonym_dict)]
        return self.polar_embeddings_cache[key]

    def __getitem__(self, idx):
        """Return the polar embedding and label of the verse at ``idx``.

        Returns:
            dict with ``polar_embeddings`` (at most ``dimension`` entries;
            ``torch.long`` when ``quantize`` is true, float otherwise) and
            ``label`` (a scalar ``torch.long`` tensor).
        """
        verse = self.verse_text[idx]

        if self.method == 'cls':
            context = verse + ' [CLS]'
            cls_polar_embedding = self.get_sense_polar_embedding('[CLS]', context)
            verse_polar_embeddings = torch.tensor(cls_polar_embedding, dtype=torch.float)
        else:
            words = verse.split()
            if words:
                per_word = [self.get_sense_polar_embedding(word, verse) for word in words]
                verse_polar_embeddings = torch.tensor(per_word, dtype=torch.float).mean(dim=0)
            else:
                # A verse without any word has nothing to average; fall back
                # to an all-zero embedding instead of failing in squeeze().
                verse_polar_embeddings = torch.zeros(self.dimension, 1)

        if self.quantize:
            # Legacy behaviour: integer truncation of the polar values.
            verse_polar_embeddings = verse_polar_embeddings.long()
        verse_polar_embeddings = verse_polar_embeddings.squeeze(dim=1)
        verse_polar_embeddings = verse_polar_embeddings[:self.dimension]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return {
            'polar_embeddings': verse_polar_embeddings,
            'label': label
        }


In [5]:

class PolarEmbeddingClassifier(nn.Module):
    """Linear classifier (dropout + one linear layer) over polar embeddings.

    Args:
        num_classes: Number of output classes.
        polar_dimension: Size of the polar embedding fed to the linear layer.
        model_name: File the best checkpoint is written to by ``train_model``.
    """

    def __init__(self, num_classes, polar_dimension, model_name='sense_polar_model.pth'):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(polar_dimension, num_classes)
        self.model_name = model_name

    def forward(self, polar_embeddings):
        """Return class logits; the input is cast to float first."""
        output = self.dropout(polar_embeddings.float())
        logits = self.fc(output)
        return logits

    def _score_batch(self, batch, loss_fn, device):
        """Move one batch to ``device``; return ``(loss, logits, labels)``."""
        polar_embeddings = batch['polar_embeddings'].to(device)
        labels = batch['label'].to(device)
        logits = self(polar_embeddings)
        return loss_fn(logits, labels), logits, labels

    def train_model(self, train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device):
        """Train with early stopping and restore the best checkpoint.

        After every epoch the mean validation loss is compared with the best
        one seen so far; on improvement the weights are saved to
        ``self.model_name``. Training stops once ``patience`` consecutive
        epochs bring no improvement.

        Args:
            train_loader: Iterable of batches with ``polar_embeddings`` and
                ``label`` entries; must support ``len``.
            valid_loader: Same format, used for validation.
            num_epochs: Maximum number of epochs.
            patience: Number of non-improving epochs tolerated.
            optimizer: Optimizer over this module's parameters.
            loss_fn: Callable ``loss_fn(logits, labels)``.
            device: Device the batches are moved to.
        """
        best_valid_loss = float('inf')
        epochs_without_improvement = 0
        # Only reload a checkpoint that this run actually produced; otherwise
        # a missing file would raise, or a stale file from an earlier run
        # would silently overwrite the current weights.
        checkpoint_written = False

        for epoch in range(num_epochs):
            self.train()
            train_loss = 0.0
            for batch in train_loader:
                optimizer.zero_grad()
                loss, _, _ = self._score_batch(batch, loss_fn, device)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            avg_train_loss = train_loss / max(len(train_loader), 1)
            print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {avg_train_loss}")

            # Validation
            self.eval()
            valid_loss = 0.0
            with torch.no_grad():
                for batch in valid_loader:
                    loss, _, _ = self._score_batch(batch, loss_fn, device)
                    valid_loss += loss.item()

            avg_valid_loss = valid_loss / max(len(valid_loader), 1)
            print(f"Epoch {epoch+1}/{num_epochs} - Validation Loss: {avg_valid_loss}")

            if avg_valid_loss < best_valid_loss:
                best_valid_loss = avg_valid_loss
                epochs_without_improvement = 0
                # torch.save does not create missing parent directories.
                checkpoint_dir = os.path.dirname(self.model_name)
                if checkpoint_dir:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(self.state_dict(), self.model_name)
                checkpoint_written = True
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print(f"Early stopping. No improvement in {patience} epochs.")
                    break

        if checkpoint_written:
            # map_location keeps the reload working when the checkpoint was
            # written from a different device than the one used now.
            self.load_state_dict(torch.load(self.model_name, map_location=device))

    def test_model(self, test_loader, loss_fn, device):
        """Print the mean loss and the accuracy over ``test_loader``."""
        self.eval()
        test_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        with torch.no_grad():
            for batch in test_loader:
                loss, logits, labels = self._score_batch(batch, loss_fn, device)
                test_loss += loss.item()

                _, predicted = torch.max(logits, dim=1)
                correct_predictions += (predicted == labels).sum().item()
                total_samples += labels.size(0)

        # Guard both averages so an empty loader reports zeros, not a crash.
        avg_test_loss = test_loss / max(len(test_loader), 1)
        accuracy = correct_predictions / total_samples if total_samples else 0.0
        print(f"Test Loss: {avg_test_loss}, Accuracy: {accuracy}")

    def predict(self, polar_embeddings, device):
        """Return the predicted class index for a single polar embedding.

        Args:
            polar_embeddings: Tensor holding one embedding, either 1-D
                (as returned for one dataset item) or 2-D with one row.
            device: Device the embedding is moved to.

        Returns:
            The index of the highest-scoring class as a Python int.
        """
        polar_embeddings = polar_embeddings.to(device)
        if polar_embeddings.dim() == 1:
            # A single un-batched embedding: add the batch axis that the
            # arg-max over dim=1 below relies on.
            polar_embeddings = polar_embeddings.unsqueeze(0)

        self.eval()
        with torch.no_grad():
            logits = self(polar_embeddings)

        _, predicted = torch.max(logits, dim=1)
        return predicted.item()


### German Version - Sense Polar - 39 Dimensions - CSV


In [17]:
# Experiment: German poem verses, 39 polar dimensions, 'projection' polarity
# method. The datasets below are built without a `method` argument, so
# PoemSentimentDataset uses its default ('cls').
from sensepolar.embed.germanBertEmbed import germanBERTWordEmbeddings
# Output produced inside this block is captured into `captured` (not used
# again in this cell) instead of being shown under the cell.
with io.capture_output() as captured:
    # Read the German version of the poem dataset (translated with ChatGPT;
    # as of 31 Aug: 229 examples).

    train_ger_verses = pd.read_csv("train_verses_ger.csv", delimiter=";")
    train_ger_labels = pd.read_csv("train_labels_filtered.csv", delimiter=";")



    # Load the embedding model and build the polar space.
    # NOTE: embedding_model_name is assigned but not referenced again in this cell.
    embedding_model_name = 'dbmdz/bert-base-german-cased'
    out_path = '../../antonyms/'
    antonym_path = "data/german_polars.xlsx"
    embed_model = germanBERTWordEmbeddings()
    dictionary = Dictionary('wordnet', api_key='')    
    lookupSpace = LookupCreator(dictionary, out_path, antonyms_file_path=antonym_path)
    lookupSpace.create_lookup_files()
    # antonym_path is rebound here: from the Excel input above to a pickle
    # under out_path (presumably written by create_polar_dimensions below —
    # TODO confirm).
    antonym_path = out_path + "polar_dimensions.pkl"

    pdc = PolarDimensions(embed_model, antonym_path=out_path + "antonym_wordnet_example_sentences_readable_extended.txt")
    pdc.create_polar_dimensions(out_path)

    wp = WordPolarity(embed_model, antonym_path=antonym_path, lookup_path = out_path, method='projection', number_polar=39)
    num_classes = 3
    bert_sensepolar_model = PolarEmbeddingClassifier(num_classes=num_classes, polar_dimension=39, model_name='ger_sense_polar_bert_39dim_cls_projection.pth')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_sensepolar_model.to(device)

optimizer = torch.optim.AdamW(bert_sensepolar_model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

# Up to 1000 epochs; stop early after 100 epochs without a better validation loss.
num_epochs = 1000
patience = 100

# Filtering steps are skipped here; just build plain lists of verses and labels.
# Verses are read from the column named "0", labels from the first column.
verses_ger_list_train= train_ger_verses["0"].values.tolist()
labels_ger_list_train= train_ger_labels.iloc[:, 0].values.tolist()

# Load the German test and validation verses.
verses_ger_test = pd.read_csv("test_verses_ger.csv",delimiter=";")
verses_ger_valid = pd.read_csv("valid_verses_ger.csv",delimiter=";")

# Load the test and validation labels from the poem_sentiment dataset.
# NOTE(review): this assumes the rows of the German CSV files line up one to
# one with the dataset's test/validation splits — confirm.
dataset = load_dataset("poem_sentiment")
test_labels = dataset["test"]["label"]
valid_labels = dataset["validation"]["label"]

# Convert the test and validation verses to lists.
verses_ger_list_test= verses_ger_test["0"].values.tolist()
verses_ger_list_valid= verses_ger_valid["0"].values.tolist()



# Wrap verses and labels in datasets that compute the polar embeddings.
train_dataset = PoemSentimentDataset(verses_ger_list_train, labels_ger_list_train, wp)
valid_dataset = PoemSentimentDataset(verses_ger_list_valid, valid_labels, wp)
test_dataset = PoemSentimentDataset(verses_ger_list_test, test_labels, wp)


# Build the data loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

bert_sensepolar_model.train_model(train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device)

bert_sensepolar_model.test_model(test_loader, loss_fn, device)


Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset poem_sentiment (C:/Users/lutz_/.cache/huggingface/datasets/poem_sentiment/default/1.0.0/4e44428256d42cdde0be6b3db1baa587195e91847adabf976e4f9454f6a82099)


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/1000 - Training Loss: 0.9312827283481382
Epoch 1/1000 - Validation Loss: 0.8938835178102765
Epoch 2/1000 - Training Loss: 0.9196937275382707
Epoch 2/1000 - Validation Loss: 0.8775671805654254
Epoch 3/1000 - Training Loss: 0.9209465586914206
Epoch 3/1000 - Validation Loss: 0.8692571691104344
Epoch 4/1000 - Training Loss: 0.8840430311436923
Epoch 4/1000 - Validation Loss: 0.8704676457813808
Epoch 5/1000 - Training Loss: 0.8751147198227217
Epoch 5/1000 - Validation Loss: 0.8632509708404541
Epoch 6/1000 - Training Loss: 0.8685538808129868
Epoch 6/1000 - Validation Loss: 0.862508203302111
Epoch 7/1000 - Training Loss: 0.863811102678191
Epoch 7/1000 - Validation Loss: 0.8578526207378933
Epoch 8/1000 - Training Loss: 0.8736099746992003
Epoch 8/1000 - Validation Loss: 0.8599617992128644
Epoch 9/1000 - Training Loss: 0.850136897473965
Epoch 9/1000 - Validation Loss: 0.8588842494147164
Epoch 10/1000 - Training Loss: 0.8424120905264368
Epoch 10/1000 - Validation Loss: 0.8552177633558001
E

German Version - 39 Dim - Average  

In [8]:
# Experiment: German poem verses, 39 polar dimensions, 'projection' polarity
# method, verse embedding built with method='avg' (mean over the words).
from sensepolar.embed.germanBertEmbed import germanBERTWordEmbeddings
# Output produced inside this block is captured into `captured` (not used
# again in this cell) instead of being shown under the cell.
with io.capture_output() as captured:
    # Read the German version of the poem dataset (translated with ChatGPT;
    # as of 31 Aug: 229 examples).

    train_ger_verses = pd.read_csv("train_verses_ger.csv", delimiter=";")
    train_ger_labels = pd.read_csv("train_labels_filtered.csv", delimiter=";")



    # Load the embedding model and build the polar space.
    # NOTE: embedding_model_name is assigned but not referenced again in this cell.
    embedding_model_name = 'dbmdz/bert-base-german-cased'
    out_path = '../../antonyms/'
    antonym_path = "data/german_polars.xlsx"
    embed_model = germanBERTWordEmbeddings()
    dictionary = Dictionary('wordnet', api_key='')    
    lookupSpace = LookupCreator(dictionary, out_path, antonyms_file_path=antonym_path)
    lookupSpace.create_lookup_files()
    # antonym_path is rebound here: from the Excel input above to a pickle
    # under out_path (presumably written by create_polar_dimensions below —
    # TODO confirm).
    antonym_path = out_path + "polar_dimensions.pkl"

    pdc = PolarDimensions(embed_model, antonym_path=out_path + "antonym_wordnet_example_sentences_readable_extended.txt")
    pdc.create_polar_dimensions(out_path)

    wp = WordPolarity(embed_model, antonym_path=antonym_path, lookup_path = out_path, method='projection', number_polar=39)
    num_classes = 3
    bert_sensepolar_model = PolarEmbeddingClassifier(num_classes=num_classes, polar_dimension=39, model_name='ger_sense_polar_bert_39dim_avg_projection.pth')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_sensepolar_model.to(device)

optimizer = torch.optim.AdamW(bert_sensepolar_model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

# Up to 100 epochs; stop early after 10 epochs without a better validation loss.
num_epochs = 100
patience = 10

# Filtering steps are skipped here; just build plain lists of verses and labels.
# Verses are read from the column named "0", labels from the first column.
verses_ger_list_train= train_ger_verses["0"].values.tolist()
labels_ger_list_train= train_ger_labels.iloc[:, 0].values.tolist()

# Load the German test and validation verses.
verses_ger_test = pd.read_csv("test_verses_ger.csv",delimiter=";")
verses_ger_valid = pd.read_csv("valid_verses_ger.csv",delimiter=";")

# Load the test and validation labels from the poem_sentiment dataset.
# NOTE(review): this assumes the rows of the German CSV files line up one to
# one with the dataset's test/validation splits — confirm.
dataset = load_dataset("poem_sentiment")
test_labels = dataset["test"]["label"]
valid_labels = dataset["validation"]["label"]

# Convert the test and validation verses to lists.
verses_ger_list_test= verses_ger_test["0"].values.tolist()
verses_ger_list_valid= verses_ger_valid["0"].values.tolist()



# Wrap verses and labels in datasets that average the per-word polar embeddings.
train_dataset = PoemSentimentDataset(verses_ger_list_train, labels_ger_list_train, wp, method='avg')
valid_dataset = PoemSentimentDataset(verses_ger_list_valid, valid_labels, wp,method='avg')
test_dataset = PoemSentimentDataset(verses_ger_list_test, test_labels, wp, method='avg')


# Build the data loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

bert_sensepolar_model.train_model(train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device)

bert_sensepolar_model.test_model(test_loader, loss_fn, device)


Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset poem_sentiment (C:/Users/lutz_/.cache/huggingface/datasets/poem_sentiment/default/1.0.0/4e44428256d42cdde0be6b3db1baa587195e91847adabf976e4f9454f6a82099)


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/100 - Training Loss: 1.0215529180922598
Epoch 1/100 - Validation Loss: 0.8996374096189227
Epoch 2/100 - Training Loss: 0.9086135468393002
Epoch 2/100 - Validation Loss: 0.8708148683820452
Epoch 3/100 - Training Loss: 0.8929728618207967
Epoch 3/100 - Validation Loss: 0.8515221050807408
Epoch 4/100 - Training Loss: 0.8702404870177215
Epoch 4/100 - Validation Loss: 0.8398891176496234
Epoch 5/100 - Training Loss: 0.8631434755505256
Epoch 5/100 - Validation Loss: 0.8331514937537057
Epoch 6/100 - Training Loss: 0.8510783123520186
Epoch 6/100 - Validation Loss: 0.8242776989936829
Epoch 7/100 - Training Loss: 0.8390806733437304
Epoch 7/100 - Validation Loss: 0.814966584954943
Epoch 8/100 - Training Loss: 0.8442715024048427
Epoch 8/100 - Validation Loss: 0.8129899416651044
Epoch 9/100 - Training Loss: 0.8403228903716465
Epoch 9/100 - Validation Loss: 0.8065118278775897
Epoch 10/100 - Training Loss: 0.8410136047399269
Epoch 10/100 - Validation Loss: 0.80425506404468
Epoch 11/100 - Traini

German Version - 15 Dimensions - cls

In [13]:
# Experiment: German poem verses, 15 polar dimensions (antonym file
# german_polars_15dims.xlsx), 'projection' polarity method, method='cls'.
from sensepolar.embed.germanBertEmbed import germanBERTWordEmbeddings
# Output produced inside this block is captured into `captured` (not used
# again in this cell) instead of being shown under the cell.
with io.capture_output() as captured:
    # Read the German version of the poem dataset (translated with ChatGPT;
    # as of 31 Aug: 229 examples).

    train_ger_verses = pd.read_csv("train_verses_ger.csv", delimiter=";")
    train_ger_labels = pd.read_csv("train_labels_filtered.csv", delimiter=";")



    # Load the embedding model and build the polar space.
    # NOTE: embedding_model_name is assigned but not referenced again in this cell.
    embedding_model_name = 'dbmdz/bert-base-german-cased'
    out_path = '../../antonyms/'
    antonym_path = "data/german_polars_15dims.xlsx"
    embed_model = germanBERTWordEmbeddings()
    # Author's note: the 'wordnet' choice presumably does not matter here,
    # because the antonyms are supplied through a file.
    dictionary = Dictionary('wordnet', api_key='')    
    lookupSpace = LookupCreator(dictionary, out_path, antonyms_file_path=antonym_path)
    lookupSpace.create_lookup_files()
    # antonym_path is rebound here: from the Excel input above to a pickle
    # under out_path (presumably written by create_polar_dimensions below —
    # TODO confirm).
    antonym_path = out_path + "polar_dimensions.pkl"

    pdc = PolarDimensions(embed_model, antonym_path=out_path + "antonym_wordnet_example_sentences_readable_extended.txt")
    pdc.create_polar_dimensions(out_path)

    wp = WordPolarity(embed_model, antonym_path=antonym_path, lookup_path = out_path, method='projection', number_polar=15)
    num_classes = 3
    bert_sensepolar_model = PolarEmbeddingClassifier(num_classes=num_classes, polar_dimension=15, model_name='ger_sense_polar_bert_15dim_cls_projection.pth')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_sensepolar_model.to(device)

optimizer = torch.optim.AdamW(bert_sensepolar_model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

# Up to 100 epochs; stop early after 10 epochs without a better validation loss.
num_epochs = 100
patience = 10

# Filtering steps are skipped here; just build plain lists of verses and labels.
# Verses are read from the column named "0", labels from the first column.
verses_ger_list_train= train_ger_verses["0"].values.tolist()
labels_ger_list_train= train_ger_labels.iloc[:, 0].values.tolist()

# Load the German test and validation verses.
verses_ger_test = pd.read_csv("test_verses_ger.csv",delimiter=";")
verses_ger_valid = pd.read_csv("valid_verses_ger.csv",delimiter=";")

# Load the test and validation labels from the poem_sentiment dataset.
# NOTE(review): this assumes the rows of the German CSV files line up one to
# one with the dataset's test/validation splits — confirm.
dataset = load_dataset("poem_sentiment")
test_labels = dataset["test"]["label"]
valid_labels = dataset["validation"]["label"]

# Convert the test and validation verses to lists.
verses_ger_list_test= verses_ger_test["0"].values.tolist()
verses_ger_list_valid= verses_ger_valid["0"].values.tolist()



# Wrap verses and labels in datasets that compute the polar embeddings.
# NOTE(review): `dimension` is left at the dataset's default (39) although
# the classifier expects 15 inputs — presumably fine as long as analyze_word
# yields exactly 15 dimensions; confirm.
train_dataset = PoemSentimentDataset(verses_ger_list_train, labels_ger_list_train, wp, method='cls')
valid_dataset = PoemSentimentDataset(verses_ger_list_valid, valid_labels, wp,method='cls')
test_dataset = PoemSentimentDataset(verses_ger_list_test, test_labels, wp, method='cls')


# Build the data loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

bert_sensepolar_model.train_model(train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device)

bert_sensepolar_model.test_model(test_loader, loss_fn, device)


Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset poem_sentiment (C:/Users/lutz_/.cache/huggingface/datasets/poem_sentiment/default/1.0.0/4e44428256d42cdde0be6b3db1baa587195e91847adabf976e4f9454f6a82099)


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/100 - Training Loss: 0.9545971953643942
Epoch 1/100 - Validation Loss: 0.9087414826665606
Epoch 2/100 - Training Loss: 0.9162907690372107
Epoch 2/100 - Validation Loss: 0.8979678068842206
Epoch 3/100 - Training Loss: 0.9123636056792062
Epoch 3/100 - Validation Loss: 0.8890242917197091
Epoch 4/100 - Training Loss: 0.8944039918341726
Epoch 4/100 - Validation Loss: 0.8848914589200702
Epoch 5/100 - Training Loss: 0.8809020890379852
Epoch 5/100 - Validation Loss: 0.8808356778962272
Epoch 6/100 - Training Loss: 0.8832201136733001
Epoch 6/100 - Validation Loss: 0.8761213081223624
Epoch 7/100 - Training Loss: 0.8784824182402413
Epoch 7/100 - Validation Loss: 0.8744376472064427
Epoch 8/100 - Training Loss: 0.8725158246058338
Epoch 8/100 - Validation Loss: 0.8738378797258649
Epoch 9/100 - Training Loss: 0.862284633348573
Epoch 9/100 - Validation Loss: 0.8728514058249337
Epoch 10/100 - Training Loss: 0.8588480162170699
Epoch 10/100 - Validation Loss: 0.8695210473878043
Epoch 11/100 - Trai

German Version - 15 dims - avg


In [14]:
# Experiment: German poem verses, 15 polar dimensions (antonym file
# german_polars_15dims.xlsx), 'projection' polarity method, verse embedding
# built with method='avg' (mean over the words).
from sensepolar.embed.germanBertEmbed import germanBERTWordEmbeddings
# Output produced inside this block is captured into `captured` (not used
# again in this cell) instead of being shown under the cell.
with io.capture_output() as captured:
    # Read the German version of the poem dataset (translated with ChatGPT;
    # as of 31 Aug: 229 examples).

    train_ger_verses = pd.read_csv("train_verses_ger.csv", delimiter=";")
    train_ger_labels = pd.read_csv("train_labels_filtered.csv", delimiter=";")



    # Load the embedding model and build the polar space.
    # NOTE: embedding_model_name is assigned but not referenced again in this cell.
    embedding_model_name = 'dbmdz/bert-base-german-cased'
    out_path = '../../antonyms/'
    antonym_path = "data/german_polars_15dims.xlsx"
    embed_model = germanBERTWordEmbeddings()
    # Author's note: the 'wordnet' choice presumably does not matter here,
    # because the antonyms are supplied through a file.
    dictionary = Dictionary('wordnet', api_key='')    
    lookupSpace = LookupCreator(dictionary, out_path, antonyms_file_path=antonym_path)
    lookupSpace.create_lookup_files()
    # antonym_path is rebound here: from the Excel input above to a pickle
    # under out_path (presumably written by create_polar_dimensions below —
    # TODO confirm).
    antonym_path = out_path + "polar_dimensions.pkl"

    pdc = PolarDimensions(embed_model, antonym_path=out_path + "antonym_wordnet_example_sentences_readable_extended.txt")
    pdc.create_polar_dimensions(out_path)

    wp = WordPolarity(embed_model, antonym_path=antonym_path, lookup_path = out_path, method='projection', number_polar=15)
    num_classes = 3
    bert_sensepolar_model = PolarEmbeddingClassifier(num_classes=num_classes, polar_dimension=15, model_name='ger_sense_polar_bert_15dim_avg_projection.pth')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_sensepolar_model.to(device)

optimizer = torch.optim.AdamW(bert_sensepolar_model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

# Up to 100 epochs; stop early after 10 epochs without a better validation loss.
num_epochs = 100
patience = 10

# Filtering steps are skipped here; just build plain lists of verses and labels.
# Verses are read from the column named "0", labels from the first column.
verses_ger_list_train= train_ger_verses["0"].values.tolist()
labels_ger_list_train= train_ger_labels.iloc[:, 0].values.tolist()

# Load the German test and validation verses.
verses_ger_test = pd.read_csv("test_verses_ger.csv",delimiter=";")
verses_ger_valid = pd.read_csv("valid_verses_ger.csv",delimiter=";")

# Load the test and validation labels from the poem_sentiment dataset.
# NOTE(review): this assumes the rows of the German CSV files line up one to
# one with the dataset's test/validation splits — confirm.
dataset = load_dataset("poem_sentiment")
test_labels = dataset["test"]["label"]
valid_labels = dataset["validation"]["label"]

# Convert the test and validation verses to lists.
verses_ger_list_test= verses_ger_test["0"].values.tolist()
verses_ger_list_valid= verses_ger_valid["0"].values.tolist()



# Wrap verses and labels in datasets that average the per-word polar embeddings.
# NOTE(review): `dimension` is left at the dataset's default (39) although
# the classifier expects 15 inputs — presumably fine as long as analyze_word
# yields exactly 15 dimensions; confirm.
train_dataset = PoemSentimentDataset(verses_ger_list_train, labels_ger_list_train, wp, method='avg')
valid_dataset = PoemSentimentDataset(verses_ger_list_valid, valid_labels, wp,method='avg')
test_dataset = PoemSentimentDataset(verses_ger_list_test, test_labels, wp, method='avg')


# Build the data loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

bert_sensepolar_model.train_model(train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device)

bert_sensepolar_model.test_model(test_loader, loss_fn, device)


Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset poem_sentiment (C:/Users/lutz_/.cache/huggingface/datasets/poem_sentiment/default/1.0.0/4e44428256d42cdde0be6b3db1baa587195e91847adabf976e4f9454f6a82099)


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/100 - Training Loss: 1.0371379278740793
Epoch 1/100 - Validation Loss: 0.8933061276163373
Epoch 2/100 - Training Loss: 0.9035688357533149
Epoch 2/100 - Validation Loss: 0.8537760632378715
Epoch 3/100 - Training Loss: 0.8894723080239206
Epoch 3/100 - Validation Loss: 0.846392103603908
Epoch 4/100 - Training Loss: 0.8913198956903422
Epoch 4/100 - Validation Loss: 0.8389690773827689
Epoch 5/100 - Training Loss: 0.8798298430892656
Epoch 5/100 - Validation Loss: 0.832646403993879
Epoch 6/100 - Training Loss: 0.8805316990276553
Epoch 6/100 - Validation Loss: 0.8284701108932495
Epoch 7/100 - Training Loss: 0.8699056500533842
Epoch 7/100 - Validation Loss: 0.8258070605141776
Epoch 8/100 - Training Loss: 0.8677100286168872
Epoch 8/100 - Validation Loss: 0.8222150802612305
Epoch 9/100 - Training Loss: 0.8575974871527474
Epoch 9/100 - Validation Loss: 0.8189340574400765
Epoch 10/100 - Training Loss: 0.8582630843486426
Epoch 10/100 - Validation Loss: 0.8186080285481044
Epoch 11/100 - Train

German Version - Last 15 Dims - cls

In [6]:
# Train and evaluate the German SensePolar classifier (last 15 polar
# dimensions, 'cls' dataset method) on the translated poem-sentiment verses.
# Also imported in the first cell of the notebook; repeated here.
from sensepolar.embed.germanBertEmbed import germanBERTWordEmbeddings
# Output produced inside this context manager is captured in `captured`
# (which is not read anywhere in this cell) instead of being displayed.
with io.capture_output() as captured:
    # Read the German version of the poem dataset (translated through ChatGPT;
    # as of 31.08 - 229 examples).

    train_ger_verses = pd.read_csv("train_verses_ger.csv", delimiter=";")
    train_ger_labels = pd.read_csv("train_labels_filtered.csv", delimiter=";")



    # Load the embedding model and the antonym (polar) definitions.
    embedding_model_name = 'dbmdz/bert-base-german-cased'
    out_path = '../../antonyms/'
    antonym_path = "data/german_polars_last_15dims.xlsx"
    embed_model = germanBERTWordEmbeddings()
    # Original author's note: the 'wordnet' choice presumably does not matter
    # here, because the antonyms are supplied through a file.
    dictionary = Dictionary('wordnet', api_key='')    
    lookupSpace = LookupCreator(dictionary, out_path, antonyms_file_path=antonym_path)
    lookupSpace.create_lookup_files()
    # antonym_path is rebound: from here on it is the polar_dimensions.pkl
    # path under out_path (passed to WordPolarity below), no longer the xlsx.
    antonym_path = out_path + "polar_dimensions.pkl"

    # presumably this .txt file is written by create_lookup_files() above,
    # and create_polar_dimensions() writes polar_dimensions.pkl - TODO confirm
    pdc = PolarDimensions(embed_model, antonym_path=out_path + "antonym_wordnet_example_sentences_readable_extended.txt")
    pdc.create_polar_dimensions(out_path)

    wp = WordPolarity(embed_model, antonym_path=antonym_path, lookup_path = out_path, method='projection', number_polar=15)
    # NOTE(review): the validation/test labels below are loaded unfiltered
    # from the dataset, whose label field is documented with values 0-3;
    # confirm those splits only contain labels 0-2 so they fit 3 classes.
    num_classes = 3
    # NOTE(review): model_name contains "avg" although the datasets below are
    # built with method='cls' - confirm this does not collide with the
    # checkpoint name of another run.
    bert_sensepolar_model = PolarEmbeddingClassifier(num_classes=num_classes, polar_dimension=15, model_name='ger_sense_polar_bert_last15dim_avg_projection.pth')


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_sensepolar_model.to(device)

optimizer = torch.optim.AdamW(bert_sensepolar_model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

# Both values are passed to train_model() below.
num_epochs = 100
patience = 10

# The filtering steps are skipped in this cell; only the lists of verses and
# labels are generated. Verses are read from the column named "0", labels
# from the first column of the labels file.
# NOTE(review): assumes both CSVs have the same number of rows in the same
# order - TODO confirm.
verses_ger_list_train= train_ger_verses["0"].values.tolist()
labels_ger_list_train= train_ger_labels.iloc[:, 0].values.tolist()

# Load the German test and validation verses.
verses_ger_test = pd.read_csv("test_verses_ger.csv",delimiter=";")
verses_ger_valid = pd.read_csv("valid_verses_ger.csv",delimiter=";")

# Load the test and validation labels from the "poem_sentiment" dataset.
# NOTE(review): assumes the translated CSVs keep the dataset's row order.
dataset = load_dataset("poem_sentiment")
test_labels = dataset["test"]["label"]
valid_labels = dataset["validation"]["label"]

# Convert the test and validation verses to plain lists.
verses_ger_list_test= verses_ger_test["0"].values.tolist()
verses_ger_list_valid= verses_ger_valid["0"].values.tolist()



# Wrap the verse/label lists in dataset objects. This call signature
# (..., wp, method='cls') differs from the tokenizer-based
# PoemSentimentDataset defined in cell 2, so it relies on a later
# redefinition of the class - verify that cell has been run first.
train_dataset = PoemSentimentDataset(verses_ger_list_train, labels_ger_list_train, wp, method='cls')
valid_dataset = PoemSentimentDataset(verses_ger_list_valid, valid_labels, wp,method='cls')
test_dataset = PoemSentimentDataset(verses_ger_list_test, test_labels, wp, method='cls')


# Build the data loaders; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Train on the training split, using the validation split during training.
bert_sensepolar_model.train_model(train_loader, valid_loader, num_epochs, patience, optimizer, loss_fn, device)

# Evaluate on the held-out test split.
bert_sensepolar_model.test_model(test_loader, loss_fn, device)


Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset poem_sentiment (C:/Users/lutz_/.cache/huggingface/datasets/poem_sentiment/default/1.0.0/4e44428256d42cdde0be6b3db1baa587195e91847adabf976e4f9454f6a82099)


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/100 - Training Loss: 1.0052479685477491
Epoch 1/100 - Validation Loss: 0.9782097254480634
Epoch 2/100 - Training Loss: 0.9705969106476262
Epoch 2/100 - Validation Loss: 0.9563803161893573
Epoch 3/100 - Training Loss: 0.966230349720649
Epoch 3/100 - Validation Loss: 0.9409305027553013
Epoch 4/100 - Training Loss: 0.9541416449366875
Epoch 4/100 - Validation Loss: 0.9292872548103333
Epoch 5/100 - Training Loss: 0.931018869831877
Epoch 5/100 - Validation Loss: 0.9212596671921867
Epoch 6/100 - Training Loss: 0.9211642342918324
Epoch 6/100 - Validation Loss: 0.9096358588763646
Epoch 7/100 - Training Loss: 0.91378338719314
Epoch 7/100 - Validation Loss: 0.905099264213017
Epoch 8/100 - Training Loss: 0.90543998187443
Epoch 8/100 - Validation Loss: 0.896925151348114
Epoch 9/100 - Training Loss: 0.9061348382032143
Epoch 9/100 - Validation Loss: 0.8913575240543911
Epoch 10/100 - Training Loss: 0.8978178309944441
Epoch 10/100 - Validation Loss: 0.8882027864456177
Epoch 11/100 - Training Lo

Data Analysis - Plots 