# Apply BERT and 20-Fold Cross-Validation

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: Quadro RTX 8000


In [2]:
#!pip install transformers

## 1. Readand prepare data

In [3]:
import pandas as pd
df = pd.read_excel('Crowdanno_Datenbereinigung_done.xlsx')
df.head()

Unnamed: 0,id_beitrag (mb),id_mb,komm_id (mf),id_mf,Kommentar_id_rtl,id_rtl,comment_id_zon,id_zon,id_crowd,c_text,...,Tatsache_total,unangemessen_total_median,unangemessen_edulow_median,unangemessen_edumed_median,unangemessen_eduhigh_median,bereichernd_total_median,bereichernd_edulow_median,bereichernd_edumed_median,bereichernd_eduhigh_median,Tatsache_total_median
0,831.0,1.0,,,,,,,1,"Tolle Idee. Ich denke, dass dieses Projekt Tei...",...,0.111111,0,0,0,0,1,1,1,1,0
1,841.0,3.0,,,,,,,2,Wohnungstausch sollte auch in belegungsgebunde...,...,0.111111,0,0,0,0,1,0,1,0,0
2,843.0,4.0,,,,,,,3,Lebensbedingungen vor Ort könnten sogar geziel...,...,0.333333,0,0,0,0,1,1,1,1,0
3,850.0,5.0,,,,,,,4,"Super Ideen, da kommt Freude auf mitzumachen! ...",...,0.111111,0,0,0,0,1,1,1,1,0
4,852.0,6.0,,,,,,,5,Wenn der Staat schon Steuermittel ausgeben wil...,...,0.777778,0,0,0,0,1,1,1,1,1


In [36]:
#list(df)

In [4]:
#Get median/majorty vote for class Mixed
df["bereichernd_mixed_median"] = df[['bereichernd_1', "bereichernd_4", "bereichernd_7"]].median(axis=1)
df["bereichernd_mixed_median"].value_counts()

bereichernd_mixed_median
0.0    7117
1.0    6560
Name: count, dtype: int64

In [5]:
df['bereichernd_mixed_median'] = df.bereichernd_mixed_median.astype(int)

In [6]:
df["bereichernd_mixed_median"].value_counts()

bereichernd_mixed_median
0    7117
1    6560
Name: count, dtype: int64

In [7]:
df.shape

(13677, 68)

In [8]:
df.dropna(subset="c_text", inplace=True)

In [9]:
len(df)

13674

In [10]:
df.head()

Unnamed: 0,id_beitrag (mb),id_mb,komm_id (mf),id_mf,Kommentar_id_rtl,id_rtl,comment_id_zon,id_zon,id_crowd,c_text,...,unangemessen_total_median,unangemessen_edulow_median,unangemessen_edumed_median,unangemessen_eduhigh_median,bereichernd_total_median,bereichernd_edulow_median,bereichernd_edumed_median,bereichernd_eduhigh_median,Tatsache_total_median,bereichernd_mixed_median
0,831.0,1.0,,,,,,,1,"Tolle Idee. Ich denke, dass dieses Projekt Tei...",...,0,0,0,0,1,1,1,1,0,1
1,841.0,3.0,,,,,,,2,Wohnungstausch sollte auch in belegungsgebunde...,...,0,0,0,0,1,0,1,0,0,1
2,843.0,4.0,,,,,,,3,Lebensbedingungen vor Ort könnten sogar geziel...,...,0,0,0,0,1,1,1,1,0,1
3,850.0,5.0,,,,,,,4,"Super Ideen, da kommt Freude auf mitzumachen! ...",...,0,0,0,0,1,1,1,1,0,1
4,852.0,6.0,,,,,,,5,Wenn der Staat schon Steuermittel ausgeben wil...,...,0,0,0,0,1,1,1,1,1,1


## 2. Transform text for BERT

In [11]:
sentences = df.c_text.values

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased", use_fast = False, do_lower_case=True)

In [13]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print("Legth:", len(input_ids[0]))

Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors


Original:  Tolle Idee. Ich denke, dass dieses Projekt Teil des Stadtforums werden sollte, damit wir darüber weiter nachdenken können.
Token IDs: [102, 11524, 4407, 552, 260, 5327, 806, 347, 971, 2354, 582, 210, 649, 30515, 318, 1370, 806, 865, 228, 449, 10360, 30940, 490, 18226, 367, 316, 552, 103]
Legth: 28


In [14]:
df["sequence_legth"] = [len(sen) for sen in input_ids]

In [15]:
#import matplotlib.pyplot as plt
#plt.bar(df.index, df["sequence_legth"], width=20)

In [16]:
import statistics
print('Max sentence length: ', max([len(sen) for sen in input_ids]))
print('Min sentence length: ', min([len(sen) for sen in input_ids]))
print('Mean sentence length: ', statistics.mean([len(sen) for sen in input_ids]))
print('Median sentence length: ', statistics.median([len(sen) for sen in input_ids]))

Max sentence length:  3546
Min sentence length:  3
Mean sentence length:  77.44771098434987
Median sentence length:  44.0


## 3. Run Models

### 3.0 Initialize BERT

In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset

In [18]:
# Initialize BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased", use_fast = False, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-german-uncased", # deepset ai
    num_labels = 2, # The number of output labels, which is 2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Define a custom PyTorch dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': torch.tensor(label)}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 3.1 Low Education

#### 3.1.1 Low on low

In [85]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edulow_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)


Training Fold 1/20


KeyboardInterrupt: 

In [18]:
#create new df
df_low_low = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_low_low.loc['mean'] = df_low_low.mean()
df_low_low

Unnamed: 0,accuracy,f1,recall,precision
0,0.75,0.740516,0.77707,0.707246
1,0.878655,0.873282,0.910828,0.83871
2,0.944444,0.942771,0.996815,0.894286
3,0.988304,0.987342,0.993631,0.981132
4,0.997076,0.996825,1.0,0.993671
5,0.98538,0.984227,0.993631,0.975
6,0.994152,0.99361,0.990446,0.996795
7,0.994152,0.99361,0.990446,0.996795
8,0.997076,0.996805,0.993631,1.0
9,0.995614,0.995246,1.0,0.990536


In [19]:
df_low_low.to_excel('df_low_low.xlsx', index=None)

#### 3.1.2 Low on Medium

In [20]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edulow_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7426900584795322
Training Fold 2/20
Accuracy for Fold 2: 0.7397660818713451
Training Fold 3/20
Accuracy for Fold 3: 0.7368421052631579
Training Fold 4/20
Accuracy for Fold 4: 0.7587719298245614
Training Fold 5/20
Accuracy for Fold 5: 0.7543859649122807
Training Fold 6/20
Accuracy for Fold 6: 0.7631578947368421
Training Fold 7/20
Accuracy for Fold 7: 0.7631578947368421
Training Fold 8/20
Accuracy for Fold 8: 0.7353801169590644
Training Fold 9/20
Accuracy for Fold 9: 0.7426900584795322
Training Fold 10/20
Accuracy for Fold 10: 0.75
Training Fold 11/20
Accuracy for Fold 11: 0.7134502923976608
Training Fold 12/20
Accuracy for Fold 12: 0.7353801169590644
Training Fold 13/20
Accuracy for Fold 13: 0.7529239766081871
Training Fold 14/20
Accuracy for Fold 14: 0.7616959064327485
Training Fold 15/20
Accuracy for Fold 15: 0.7642752562225475
Training Fold 16/20
Accuracy for Fold 16: 0.7628111273792094
Training Fold 17/20
Accuracy for Fold 17: 0.754026354319

In [21]:
fold_accuracies

[0.7426900584795322,
 0.7397660818713451,
 0.7368421052631579,
 0.7587719298245614,
 0.7543859649122807,
 0.7631578947368421,
 0.7631578947368421,
 0.7353801169590644,
 0.7426900584795322,
 0.75,
 0.7134502923976608,
 0.7353801169590644,
 0.7529239766081871,
 0.7616959064327485,
 0.7642752562225475,
 0.7628111273792094,
 0.7540263543191801,
 0.767203513909224,
 0.7496339677891655,
 0.7481698389458272]

In [22]:
#create new df
df_low_med = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })


In [23]:
#Add row with mean
df_low_med.loc['mean'] = df_low_med.mean()

In [24]:
df_low_med

Unnamed: 0,accuracy,f1,recall,precision
0,0.74269,0.730887,0.705015,0.75873
1,0.739766,0.731118,0.705539,0.758621
2,0.736842,0.731343,0.688202,0.780255
3,0.758772,0.754098,0.706704,0.808307
4,0.754386,0.749254,0.70904,0.794304
5,0.763158,0.752294,0.740964,0.763975
6,0.763158,0.756024,0.717143,0.799363
7,0.73538,0.721966,0.701493,0.743671
8,0.74269,0.733333,0.699422,0.770701
9,0.75,0.742857,0.707736,0.781646


In [25]:
df_low_med.to_excel('df_low_med.xlsx', index=None)

#### 3.1.3 Low on high

In [26]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edulow_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7280701754385965
Training Fold 2/20
Accuracy for Fold 2: 0.7178362573099415
Training Fold 3/20
Accuracy for Fold 3: 0.7602339181286549
Training Fold 4/20
Accuracy for Fold 4: 0.7821637426900585
Training Fold 5/20
Accuracy for Fold 5: 0.7514619883040936
Training Fold 6/20
Accuracy for Fold 6: 0.7412280701754386
Training Fold 7/20
Accuracy for Fold 7: 0.7456140350877193
Training Fold 8/20
Accuracy for Fold 8: 0.7441520467836257
Training Fold 9/20
Accuracy for Fold 9: 0.72953216374269
Training Fold 10/20
Accuracy for Fold 10: 0.7456140350877193
Training Fold 11/20
Accuracy for Fold 11: 0.7573099415204678
Training Fold 12/20
Accuracy for Fold 12: 0.7397660818713451
Training Fold 13/20
Accuracy for Fold 13: 0.7558479532163743
Training Fold 14/20
Accuracy for Fold 14: 0.7675438596491229
Training Fold 15/20
Accuracy for Fold 15: 0.7247437774524158
Training Fold 16/20
Accuracy for Fold 16: 0.7437774524158126
Training Fold 17/20
Accuracy for Fold 17: 0.

In [28]:
#create new df
df_low_high = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_low_high.loc['mean'] = df_low_high.mean()

In [29]:
df_low_high

Unnamed: 0,accuracy,f1,recall,precision
0,0.72807,0.704762,0.702532,0.707006
1,0.717836,0.696063,0.686335,0.70607
2,0.760234,0.74613,0.728097,0.765079
3,0.782164,0.766823,0.751534,0.782748
4,0.751462,0.736842,0.719033,0.755556
5,0.741228,0.724728,0.703927,0.746795
6,0.745614,0.732308,0.710448,0.755556
7,0.744152,0.726989,0.710366,0.744409
8,0.729532,0.714946,0.692537,0.738854
9,0.745614,0.731481,0.713855,0.75


In [30]:
df_low_high.to_excel('df_low_high.xlsx', index=None)

#### 3.1.4 Low on mixed

In [38]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edulow_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.77046783625731
Training Fold 2/20
Accuracy for Fold 2: 0.7850877192982456
Training Fold 3/20
Accuracy for Fold 3: 0.7748538011695907
Training Fold 4/20
Accuracy for Fold 4: 0.8216374269005848
Training Fold 5/20
Accuracy for Fold 5: 0.7880116959064327
Training Fold 6/20
Accuracy for Fold 6: 0.7953216374269005
Training Fold 7/20
Accuracy for Fold 7: 0.8084795321637427
Training Fold 8/20
Accuracy for Fold 8: 0.7967836257309941
Training Fold 9/20
Accuracy for Fold 9: 0.7909356725146199
Training Fold 10/20
Accuracy for Fold 10: 0.7894736842105263
Training Fold 11/20
Accuracy for Fold 11: 0.8230994152046783
Training Fold 12/20
Accuracy for Fold 12: 0.7923976608187134
Training Fold 13/20
Accuracy for Fold 13: 0.8216374269005848
Training Fold 14/20
Accuracy for Fold 14: 0.8201754385964912
Training Fold 15/20
Accuracy for Fold 15: 0.7642752562225475
Training Fold 16/20
Accuracy for Fold 16: 0.7906295754026355
Training Fold 17/20
Accuracy for Fold 17: 0.

In [39]:
#create new df
df_low_mix = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_low_mix.loc['mean'] = df_low_mix.mean()

In [40]:
df_low_mix

Unnamed: 0,accuracy,f1,recall,precision
0,0.770468,0.755832,0.736364,0.776358
1,0.785088,0.772798,0.750751,0.796178
2,0.774854,0.76161,0.738739,0.785942
3,0.821637,0.809969,0.792683,0.828025
4,0.788012,0.777948,0.749263,0.808917
5,0.795322,0.781931,0.765244,0.799363
6,0.80848,0.797527,0.772455,0.824281
7,0.796784,0.779014,0.777778,0.780255
8,0.790936,0.779661,0.755224,0.805732
9,0.789474,0.780488,0.748538,0.815287


In [41]:
df_low_mix.to_excel('df_low_mix.xlsx', index=None)

## 3.2 Education Medium

#### 3.2.1  Medium on low

In [42]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edumed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.868421052631579
Training Fold 2/20
Accuracy for Fold 2: 0.7850877192982456
Training Fold 3/20
Accuracy for Fold 3: 0.7324561403508771
Training Fold 4/20
Accuracy for Fold 4: 0.75
Training Fold 5/20
Accuracy for Fold 5: 0.7953216374269005
Training Fold 6/20
Accuracy for Fold 6: 0.7353801169590644
Training Fold 7/20
Accuracy for Fold 7: 0.7763157894736842
Training Fold 8/20
Accuracy for Fold 8: 0.7909356725146199
Training Fold 9/20
Accuracy for Fold 9: 0.7339181286549707
Training Fold 10/20
Accuracy for Fold 10: 0.75
Training Fold 11/20
Accuracy for Fold 11: 0.7192982456140351
Training Fold 12/20
Accuracy for Fold 12: 0.7675438596491229
Training Fold 13/20
Accuracy for Fold 13: 0.7573099415204678
Training Fold 14/20
Accuracy for Fold 14: 0.7514619883040936
Training Fold 15/20
Accuracy for Fold 15: 0.7306002928257687
Training Fold 16/20
Accuracy for Fold 16: 0.7335285505124451
Training Fold 17/20
Accuracy for Fold 17: 0.7291361639824304
Training F

In [43]:
#create new df
df_med_low = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_med_low.loc['mean'] = df_med_low.mean()

In [44]:
df_med_low

Unnamed: 0,accuracy,f1,recall,precision
0,0.868421,0.865672,0.917722,0.819209
1,0.785088,0.778281,0.837662,0.726761
2,0.732456,0.731278,0.787975,0.682192
3,0.75,0.758133,0.795252,0.724324
4,0.795322,0.789157,0.845161,0.740113
5,0.73538,0.720247,0.774086,0.67341
6,0.776316,0.77533,0.783383,0.767442
7,0.790936,0.783661,0.817035,0.752907
8,0.733918,0.727545,0.747692,0.708455
9,0.75,0.738931,0.778135,0.703488


In [45]:
df_med_low.to_excel('df_med_low.xlsx', index=None)

#### 3.2.2  Medium on medium

In [46]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edumed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.9897660818713451
Training Fold 2/20
Accuracy for Fold 2: 0.9970760233918129
Training Fold 3/20
Accuracy for Fold 3: 0.922514619883041
Training Fold 4/20
Accuracy for Fold 4: 0.8640350877192983
Training Fold 5/20
Accuracy for Fold 5: 0.9897660818713451
Training Fold 6/20
Accuracy for Fold 6: 0.9649122807017544
Training Fold 7/20
Accuracy for Fold 7: 0.9809941520467836
Training Fold 8/20
Accuracy for Fold 8: 0.9985380116959064
Training Fold 9/20
Accuracy for Fold 9: 0.9941520467836257
Training Fold 10/20
Accuracy for Fold 10: 0.6871345029239766
Training Fold 11/20
Accuracy for Fold 11: 0.9283625730994152
Training Fold 12/20
Accuracy for Fold 12: 0.9722222222222222
Training Fold 13/20
Accuracy for Fold 13: 0.9678362573099415
Training Fold 14/20
Accuracy for Fold 14: 0.9853801169590644
Training Fold 15/20
Accuracy for Fold 15: 0.972181551976574
Training Fold 16/20
Accuracy for Fold 16: 0.9809663250366032
Training Fold 17/20
Accuracy for Fold 17: 0.

In [47]:
#create new df
df_med_med = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_med_med.loc['mean'] = df_med_med.mean()

In [48]:
df_med_med

Unnamed: 0,accuracy,f1,recall,precision
0,0.989766,0.989899,0.994203,0.985632
1,0.997076,0.997101,0.997101,0.997101
2,0.922515,0.928475,0.997101,0.868687
3,0.864035,0.880616,0.997093,0.788506
4,0.989766,0.989811,0.988372,0.991254
5,0.964912,0.963964,0.93314,0.996894
6,0.980994,0.981077,0.979651,0.982507
7,0.998538,0.998549,1.0,0.997101
8,0.994152,0.99422,1.0,0.988506
9,0.687135,0.762222,0.997093,0.616906


In [49]:
df_med_med.to_excel('df_med_med.xlsx', index=None)

#### 3.2.3  Medium on high

In [50]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edumed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7909356725146199
Training Fold 2/20
Accuracy for Fold 2: 0.7763157894736842
Training Fold 3/20
Accuracy for Fold 3: 0.7485380116959064
Training Fold 4/20
Accuracy for Fold 4: 0.7690058479532164
Training Fold 5/20
Accuracy for Fold 5: 0.7777777777777778
Training Fold 6/20
Accuracy for Fold 6: 0.814327485380117
Training Fold 7/20
Accuracy for Fold 7: 0.7733918128654971
Training Fold 8/20
Accuracy for Fold 8: 0.8026315789473685
Training Fold 9/20
Accuracy for Fold 9: 0.7894736842105263
Training Fold 10/20
Accuracy for Fold 10: 0.7777777777777778
Training Fold 11/20
Accuracy for Fold 11: 0.7777777777777778
Training Fold 12/20
Accuracy for Fold 12: 0.7529239766081871
Training Fold 13/20
Accuracy for Fold 13: 0.7660818713450293
Training Fold 14/20
Accuracy for Fold 14: 0.804093567251462
Training Fold 15/20
Accuracy for Fold 15: 0.7759882869692533
Training Fold 16/20
Accuracy for Fold 16: 0.7657393850658858
Training Fold 17/20
Accuracy for Fold 17: 0.

In [51]:
#create new df
df_med_high = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_med_high.loc['mean'] = df_med_high.mean()

In [52]:
df_med_high

Unnamed: 0,accuracy,f1,recall,precision
0,0.790936,0.787519,0.820433,0.757143
1,0.776316,0.773333,0.815625,0.735211
2,0.748538,0.737003,0.779935,0.698551
3,0.769006,0.763473,0.806962,0.724432
4,0.777778,0.772455,0.801242,0.745665
5,0.814327,0.809023,0.851266,0.770774
6,0.773392,0.769001,0.796296,0.743516
7,0.802632,0.791988,0.769461,0.815873
8,0.789474,0.787611,0.801802,0.773913
9,0.777778,0.782857,0.813056,0.754821


In [53]:
df_med_high.to_excel('df_med_high.xlsx', index=None)

#### 3.2.4 Medium on mixed

In [54]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_edumed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.8289473684210527
Training Fold 2/20
Accuracy for Fold 2: 0.8304093567251462
Training Fold 3/20
Accuracy for Fold 3: 0.8026315789473685
Training Fold 4/20
Accuracy for Fold 4: 0.7997076023391813
Training Fold 5/20
Accuracy for Fold 5: 0.4853801169590643
Training Fold 6/20
Accuracy for Fold 6: 0.8260233918128655
Training Fold 7/20
Accuracy for Fold 7: 0.8099415204678363
Training Fold 8/20
Accuracy for Fold 8: 0.8464912280701754
Training Fold 9/20
Accuracy for Fold 9: 0.804093567251462
Training Fold 10/20
Accuracy for Fold 10: 0.8070175438596491
Training Fold 11/20
Accuracy for Fold 11: 0.8070175438596491
Training Fold 12/20
Accuracy for Fold 12: 0.8230994152046783
Training Fold 13/20
Accuracy for Fold 13: 0.8230994152046783
Training Fold 14/20
Accuracy for Fold 14: 0.814327485380117
Training Fold 15/20
Accuracy for Fold 15: 0.7964860907759883
Training Fold 16/20
Accuracy for Fold 16: 0.7964860907759883
Training Fold 17/20
Accuracy for Fold 17: 0.

In [55]:
#create new df
df_med_mix = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_med_mix.loc['mean'] = df_med_mix.mean()

In [56]:
df_med_mix

Unnamed: 0,accuracy,f1,recall,precision
0,0.828947,0.82406,0.858934,0.791908
1,0.830409,0.827893,0.85061,0.806358
2,0.802632,0.798808,0.819572,0.77907
3,0.799708,0.797037,0.805389,0.788856
4,0.48538,0.65286,1.0,0.484627
5,0.826023,0.821589,0.864353,0.782857
6,0.809942,0.811047,0.835329,0.788136
7,0.846491,0.846715,0.85044,0.843023
8,0.804094,0.798193,0.812883,0.784024
9,0.807018,0.810888,0.815562,0.806268


In [57]:
df_med_mix.to_excel('df_med_mix.xlsx', index=None)

### 3.3 Education High

#### 3.3.1 High on low

In [58]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Low in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_eduhigh_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7748538011695907
Training Fold 2/20
Accuracy for Fold 2: 0.8523391812865497
Training Fold 3/20
Accuracy for Fold 3: 0.6564327485380117
Training Fold 4/20
Accuracy for Fold 4: 0.8187134502923976
Training Fold 5/20
Accuracy for Fold 5: 0.8011695906432749
Training Fold 6/20
Accuracy for Fold 6: 0.8201754385964912
Training Fold 7/20
Accuracy for Fold 7: 0.8157894736842105
Training Fold 8/20
Accuracy for Fold 8: 0.7880116959064327
Training Fold 9/20
Accuracy for Fold 9: 0.7909356725146199
Training Fold 10/20
Accuracy for Fold 10: 0.77046783625731
Training Fold 11/20
Accuracy for Fold 11: 0.7558479532163743
Training Fold 12/20
Accuracy for Fold 12: 0.5248538011695907
Training Fold 13/20


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for Fold 13: 0.7733918128654971
Training Fold 14/20
Accuracy for Fold 14: 0.7807017543859649
Training Fold 15/20
Accuracy for Fold 15: 0.746705710102489
Training Fold 16/20
Accuracy for Fold 16: 0.780380673499268
Training Fold 17/20
Accuracy for Fold 17: 0.7642752562225475
Training Fold 18/20
Accuracy for Fold 18: 0.7657393850658858
Training Fold 19/20
Accuracy for Fold 19: 0.7598828696925329
Training Fold 20/20
Accuracy for Fold 20: 0.7569546120058566


In [59]:
#create new df
df_high_low = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_high_low.loc['mean'] = df_high_low.mean()

In [60]:
df_high_low

Unnamed: 0,accuracy,f1,recall,precision
0,0.774854,0.754777,0.779605,0.731481
1,0.852339,0.818018,0.782759,0.856604
2,0.656433,0.430993,0.281646,0.917526
3,0.818713,0.80625,0.834951,0.779456
4,0.80117,0.793313,0.798165,0.78852
5,0.820175,0.816692,0.864353,0.774011
6,0.815789,0.813056,0.840491,0.787356
7,0.788012,0.776579,0.812903,0.743363
8,0.790936,0.777605,0.819672,0.739645
9,0.770468,0.766716,0.826923,0.714681


In [61]:
df_high_low.to_excel('df_high_low.xlsx', index=None)

#### 3.3.2 High on medium

In [62]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Medium in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_eduhigh_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7997076023391813
Training Fold 2/20
Accuracy for Fold 2: 0.814327485380117
Training Fold 3/20
Accuracy for Fold 3: 0.7982456140350878
Training Fold 4/20
Accuracy for Fold 4: 0.7850877192982456
Training Fold 5/20
Accuracy for Fold 5: 0.8172514619883041
Training Fold 6/20
Accuracy for Fold 6: 0.7997076023391813
Training Fold 7/20
Accuracy for Fold 7: 0.8099415204678363
Training Fold 8/20
Accuracy for Fold 8: 0.7456140350877193
Training Fold 9/20
Accuracy for Fold 9: 0.7880116959064327
Training Fold 10/20
Accuracy for Fold 10: 0.7821637426900585
Training Fold 11/20
Accuracy for Fold 11: 0.7690058479532164
Training Fold 12/20
Accuracy for Fold 12: 0.7909356725146199
Training Fold 13/20
Accuracy for Fold 13: 0.7821637426900585
Training Fold 14/20
Accuracy for Fold 14: 0.7880116959064327
Training Fold 15/20
Accuracy for Fold 15: 0.7525622254758418
Training Fold 16/20
Accuracy for Fold 16: 0.7730600292825769
Training Fold 17/20
Accuracy for Fold 17: 0

In [63]:
#create new df
df_high_med = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_high_med.loc['mean'] = df_high_med.mean()

In [64]:
df_high_med

Unnamed: 0,accuracy,f1,recall,precision
0,0.799708,0.801161,0.793103,0.809384
1,0.814327,0.814056,0.794286,0.834835
2,0.798246,0.786378,0.751479,0.824675
3,0.785088,0.780269,0.711172,0.864238
4,0.817251,0.813711,0.793605,0.834862
5,0.799708,0.8,0.787356,0.813056
6,0.809942,0.80826,0.776204,0.843077
7,0.745614,0.728972,0.670487,0.798635
8,0.788012,0.785185,0.768116,0.80303
9,0.782164,0.781845,0.758523,0.806647


In [65]:
df_high_med.to_excel('df_high_med.xlsx', index=None)

#### 3.3.3 High on high

In [66]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_eduhigh_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.9766081871345029
Training Fold 2/20
Accuracy for Fold 2: 0.5350877192982456
Training Fold 3/20
Accuracy for Fold 3: 0.9619883040935673
Training Fold 4/20
Accuracy for Fold 4: 0.9926900584795322
Training Fold 5/20
Accuracy for Fold 5: 0.9181286549707602
Training Fold 6/20
Accuracy for Fold 6: 0.9824561403508771
Training Fold 7/20
Accuracy for Fold 7: 0.9883040935672515
Training Fold 8/20
Accuracy for Fold 8: 0.9941520467836257
Training Fold 9/20
Accuracy for Fold 9: 0.9912280701754386
Training Fold 10/20
Accuracy for Fold 10: 0.9897660818713451
Training Fold 11/20
Accuracy for Fold 11: 0.9941520467836257
Training Fold 12/20
Accuracy for Fold 12: 0.4766081871345029
Training Fold 13/20
Accuracy for Fold 13: 0.9956140350877193
Training Fold 14/20
Accuracy for Fold 14: 0.9985380116959064
Training Fold 15/20
Accuracy for Fold 15: 0.9692532942898975
Training Fold 16/20
Accuracy for Fold 16: 0.9736456808199122
Training Fold 17/20
Accuracy for Fold 17: 

In [67]:
#create new df
df_high_high = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_high_high.loc['mean'] = df_high_high.mean()

In [68]:
df_high_high

Unnamed: 0,accuracy,f1,recall,precision
0,0.976608,0.97546,0.978462,0.972477
1,0.535088,0.042169,0.021538,1.0
2,0.961988,0.960123,0.963077,0.957187
3,0.99269,0.99232,0.993846,0.990798
4,0.918129,0.906667,0.836923,0.989091
5,0.982456,0.981481,0.978462,0.98452
6,0.988304,0.987616,0.981538,0.993769
7,0.994152,0.993827,0.990769,0.996904
8,0.991228,0.990769,0.990769,0.990769
9,0.989766,0.989214,0.987692,0.990741


In [69]:
df_high_high.to_excel('df_high_high.xlsx', index=None)

#### 3.3.4 High on mixed

In [70]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_eduhigh_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.8187134502923976
Training Fold 2/20
Accuracy for Fold 2: 0.8201754385964912
Training Fold 3/20
Accuracy for Fold 3: 0.8377192982456141
Training Fold 4/20
Accuracy for Fold 4: 0.8362573099415205
Training Fold 5/20
Accuracy for Fold 5: 0.8464912280701754
Training Fold 6/20
Accuracy for Fold 6: 0.8391812865497076
Training Fold 7/20
Accuracy for Fold 7: 0.814327485380117
Training Fold 8/20
Accuracy for Fold 8: 0.7821637426900585
Training Fold 9/20
Accuracy for Fold 9: 0.7616959064327485
Training Fold 10/20
Accuracy for Fold 10: 0.8084795321637427
Training Fold 11/20
Accuracy for Fold 11: 0.5
Training Fold 12/20


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for Fold 12: 0.7997076023391813
Training Fold 13/20
Accuracy for Fold 13: 0.8026315789473685
Training Fold 14/20
Accuracy for Fold 14: 0.8362573099415205
Training Fold 15/20
Accuracy for Fold 15: 0.7862371888726208
Training Fold 16/20
Accuracy for Fold 16: 0.5402635431918009
Training Fold 17/20
Accuracy for Fold 17: 0.8330893118594437
Training Fold 18/20
Accuracy for Fold 18: 0.8316251830161054
Training Fold 19/20
Accuracy for Fold 19: 0.8301610541727672
Training Fold 20/20
Accuracy for Fold 20: 0.8125915080527086


In [71]:
#create new df
df_high_mix = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_high_mix.loc['mean'] = df_high_mix.mean()

In [72]:
df_high_mix

Unnamed: 0,accuracy,f1,recall,precision
0,0.818713,0.815476,0.81791,0.813056
1,0.820175,0.809302,0.825949,0.793313
2,0.837719,0.832073,0.820896,0.843558
3,0.836257,0.833333,0.802292,0.866873
4,0.846491,0.836703,0.82263,0.851266
5,0.839181,0.830247,0.832817,0.827692
6,0.814327,0.81296,0.838906,0.788571
7,0.782164,0.774584,0.764179,0.785276
8,0.761696,0.76681,0.834891,0.708995
9,0.80848,0.799387,0.805556,0.793313


In [73]:
df_high_mix.to_excel('df_high_mix.xlsx', index=None)

### 3.4 Education Mixed

#### 3.4.1 Mixed on low

In [20]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edulow_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_mixed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7733918128654971
Training Fold 2/20
Accuracy for Fold 2: 0.7792397660818714
Training Fold 3/20
Accuracy for Fold 3: 0.8026315789473685
Training Fold 4/20
Accuracy for Fold 4: 0.8128654970760234
Training Fold 5/20
Accuracy for Fold 5: 0.8084795321637427
Training Fold 6/20
Accuracy for Fold 6: 0.7850877192982456
Training Fold 7/20
Accuracy for Fold 7: 0.7646198830409356
Training Fold 8/20
Accuracy for Fold 8: 0.8172514619883041
Training Fold 9/20
Accuracy for Fold 9: 0.7953216374269005
Training Fold 10/20
Accuracy for Fold 10: 0.8070175438596491
Training Fold 11/20
Accuracy for Fold 11: 0.8114035087719298
Training Fold 12/20
Accuracy for Fold 12: 0.783625730994152
Training Fold 13/20
Accuracy for Fold 13: 0.7997076023391813
Training Fold 14/20
Accuracy for Fold 14: 0.7894736842105263
Training Fold 15/20
Accuracy for Fold 15: 0.8096632503660323
Training Fold 16/20
Accuracy for Fold 16: 0.7979502196193266
Training Fold 17/20
Accuracy for Fold 17: 0

In [21]:
#create new df
df_mix_low = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_mix_low.loc['mean'] = df_mix_low.mean()

In [22]:
df_mix_low

Unnamed: 0,accuracy,f1,recall,precision
0,0.773392,0.769688,0.814465,0.729577
1,0.77924,0.778917,0.83125,0.732782
2,0.802632,0.789392,0.834983,0.748521
3,0.812865,0.800623,0.815873,0.785933
4,0.80848,0.799387,0.805556,0.793313
5,0.785088,0.776256,0.789474,0.763473
6,0.76462,0.752688,0.758514,0.746951
7,0.817251,0.805599,0.827476,0.784848
8,0.795322,0.779874,0.810458,0.751515
9,0.807018,0.797546,0.81761,0.778443


In [23]:
df_mix_low.to_excel('df_high_low.xlsx', index=None)

#### 3.4.2 Mixed on medium

In [24]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_edumed_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_mixed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.8011695906432749
Training Fold 2/20
Accuracy for Fold 2: 0.8084795321637427
Training Fold 3/20
Accuracy for Fold 3: 0.8084795321637427
Training Fold 4/20
Accuracy for Fold 4: 0.8084795321637427
Training Fold 5/20
Accuracy for Fold 5: 0.8347953216374269
Training Fold 6/20
Accuracy for Fold 6: 0.8245614035087719
Training Fold 7/20
Accuracy for Fold 7: 0.8084795321637427
Training Fold 8/20
Accuracy for Fold 8: 0.8318713450292398
Training Fold 9/20
Accuracy for Fold 9: 0.8260233918128655
Training Fold 10/20
Accuracy for Fold 10: 0.8099415204678363
Training Fold 11/20
Accuracy for Fold 11: 0.8391812865497076
Training Fold 12/20
Accuracy for Fold 12: 0.8099415204678363
Training Fold 13/20
Accuracy for Fold 13: 0.8128654970760234
Training Fold 14/20
Accuracy for Fold 14: 0.8216374269005848
Training Fold 15/20
Accuracy for Fold 15: 0.8052708638360175
Training Fold 16/20
Accuracy for Fold 16: 0.8257686676427526
Training Fold 17/20
Accuracy for Fold 17: 

In [25]:
#create new df
df_mix_med = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_mix_med.loc['mean'] = df_mix_med.mean()

In [26]:
df_mix_med

Unnamed: 0,accuracy,f1,recall,precision
0,0.80117,0.803468,0.759563,0.852761
1,0.80848,0.807069,0.782857,0.832827
2,0.80848,0.803008,0.789941,0.816514
3,0.80848,0.805926,0.783862,0.829268
4,0.834795,0.831091,0.815249,0.847561
5,0.824561,0.825581,0.788889,0.865854
6,0.80848,0.810967,0.76776,0.859327
7,0.831871,0.825493,0.824242,0.826748
8,0.826023,0.823704,0.801153,0.847561
9,0.809942,0.804805,0.795252,0.81459


In [27]:
df_mix_med.to_excel('df_mix_med.xlsx', index=None)

#### 3.4.3 Mixed on high

In [28]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_eduhigh_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_mixed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.7982456140350878
Training Fold 2/20
Accuracy for Fold 2: 0.8187134502923976
Training Fold 3/20
Accuracy for Fold 3: 0.8114035087719298
Training Fold 4/20
Accuracy for Fold 4: 0.8157894736842105
Training Fold 5/20
Accuracy for Fold 5: 0.8114035087719298
Training Fold 6/20
Accuracy for Fold 6: 0.7967836257309941
Training Fold 7/20
Accuracy for Fold 7: 0.8157894736842105
Training Fold 8/20
Accuracy for Fold 8: 0.8494152046783626
Training Fold 9/20
Accuracy for Fold 9: 0.8260233918128655
Training Fold 10/20
Accuracy for Fold 10: 0.8245614035087719
Training Fold 11/20
Accuracy for Fold 11: 0.8406432748538012
Training Fold 12/20
Accuracy for Fold 12: 0.8289473684210527
Training Fold 13/20
Accuracy for Fold 13: 0.804093567251462
Training Fold 14/20
Accuracy for Fold 14: 0.8333333333333334
Training Fold 15/20
Accuracy for Fold 15: 0.7994143484626647
Training Fold 16/20
Accuracy for Fold 16: 0.8404099560761347
Training Fold 17/20
Accuracy for Fold 17: 0

In [29]:
#create new df
df_mix_high = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_mix_high.loc['mean'] = df_mix_high.mean()

In [30]:
df_mix_high

Unnamed: 0,accuracy,f1,recall,precision
0,0.798246,0.795252,0.772334,0.819572
1,0.818713,0.80625,0.826923,0.786585
2,0.811404,0.801233,0.807453,0.795107
3,0.815789,0.806154,0.813665,0.79878
4,0.811404,0.803053,0.799392,0.806748
5,0.796784,0.782473,0.801282,0.764526
6,0.815789,0.810241,0.79822,0.82263
7,0.849415,0.846954,0.830904,0.863636
8,0.826023,0.817204,0.820988,0.813456
9,0.824561,0.815951,0.823529,0.808511


In [31]:
df_mix_high.to_excel('df_mix_high.xlsx', index=None)

#### 3.4.4. Mixed on mixed

In [32]:
# Convert dataframe to dataset
dataset_train = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #High in train set
dataset_test = TextDataset(df['c_text'].tolist(), df['bereichernd_mixed_median'].tolist()) #Mixed in test set


# Define k-fold cross-validation
k_folds = 20
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
fold_accuracies = []
fold_f1s = []
fold_recalls = []
fold_precisions = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['c_text'], df['bereichernd_mixed_median'])):
    print(f"Training Fold {fold+1}/{k_folds}")

    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset_train, train_indices)
    val_dataset = torch.utils.data.Subset(dataset_test, val_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(3):  # Adjust the number of epochs as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


    fold_f1 = f1_score(val_labels, val_predictions)
    fold_f1s.append(fold_f1)

    fold_recall = recall_score(val_labels, val_predictions)
    fold_recalls.append(fold_recall)

    fold_precision = precision_score(val_labels, val_predictions)
    fold_precisions.append(fold_precision)

Training Fold 1/20
Accuracy for Fold 1: 0.9941520467836257
Training Fold 2/20
Accuracy for Fold 2: 1.0
Training Fold 3/20
Accuracy for Fold 3: 0.9985380116959064
Training Fold 4/20
Accuracy for Fold 4: 0.9985380116959064
Training Fold 5/20
Accuracy for Fold 5: 0.9985380116959064
Training Fold 6/20
Accuracy for Fold 6: 0.9985380116959064
Training Fold 7/20
Accuracy for Fold 7: 0.9956140350877193
Training Fold 8/20
Accuracy for Fold 8: 1.0
Training Fold 9/20
Accuracy for Fold 9: 0.9985380116959064
Training Fold 10/20
Accuracy for Fold 10: 0.9941520467836257
Training Fold 11/20
Accuracy for Fold 11: 0.9985380116959064
Training Fold 12/20
Accuracy for Fold 12: 0.9897660818713451
Training Fold 13/20
Accuracy for Fold 13: 0.9970760233918129
Training Fold 14/20
Accuracy for Fold 14: 1.0
Training Fold 15/20
Accuracy for Fold 15: 0.9941434846266471
Training Fold 16/20
Accuracy for Fold 16: 0.9970717423133236
Training Fold 17/20
Accuracy for Fold 17: 0.9956076134699854
Training Fold 18/20
Accura

In [33]:
#create new df
df_mix_mix = pd.DataFrame({'accuracy':fold_accuracies,
                                        'f1':fold_f1s,
                                        'recall':fold_recalls,
                                        'precision':fold_precisions
                                        })
#Add row with mean
df_mix_mix.loc['mean'] = df_mix_mix.mean()

In [34]:
df_mix_mix

Unnamed: 0,accuracy,f1,recall,precision
0,0.994152,0.993865,0.987805,1.0
1,1.0,1.0,1.0,1.0
2,0.998538,0.998473,0.996951,1.0
3,0.998538,0.998473,0.996951,1.0
4,0.998538,0.998478,1.0,0.99696
5,0.998538,0.998478,1.0,0.99696
6,0.995614,0.99542,0.993902,0.996942
7,1.0,1.0,1.0,1.0
8,0.998538,0.998478,1.0,0.99696
9,0.994152,0.993884,0.990854,0.996933


In [35]:
df_mix_mix.to_excel('df_mix_mix.xlsx', index=None)

# Fine