In [None]:
import sys
# Install the notebook's dependencies with the interpreter that runs this kernel
# (`sys.executable -m pip`), so packages land in the kernel's own environment.
# NOTE(review): no versions are pinned, so a re-run may resolve to different releases.
# python-docx is the distribution that provides the `docx` module imported in the next cell.
!{sys.executable} -m pip install python-docx
# Hugging Face stack (transformers with the torch extra, datasets, evaluate) plus sacremoses.
!{sys.executable} -m pip install --upgrade transformers[torch] datasets evaluate sacremoses
# mlxtend and the Weights & Biases client.
!{sys.executable} -m pip install --upgrade mlxtend wandb

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl 

In [None]:
import docx
import os, re
from collections import Counter

# Data loading code
# NOTE(review): the loading code itself is not shown in this cell; it is presumably
# what defines `labels` (and `texts`, used by later cells) — confirm that a fresh
# kernel run actually defines both before this point.

# Show how many examples carry each label value.
Counter(labels)


Counter({0: 168, 1: 195})

In [None]:

from sklearn.dummy import DummyClassifier

# Majority-class baseline: the "most_frequent" strategy always predicts the most
# common label, so the score below is the accuracy any real model has to beat.
# The labels are passed in the feature slot as well, simply as a same-length input.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(labels, labels)

# Accuracy of the baseline on the full label set (displayed as the cell output).
dummy_clf.score(labels, labels)

0.5371900826446281

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import numpy as np
import wandb, gc, os, pickle, datasets, transformers, torch, evaluate

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from datasets import concatenate_datasets, Dataset
from transformers import (
    BertConfig,
    AutoTokenizer,
    BertModel,
    BertTokenizerFast,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments)

# Restrict the transformers library's own logging to error-level messages.
transformers.logging.set_verbosity_error()

def compute_metrics(pred):
    """Turn a prediction object into a dict of classification metrics.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; precision/recall/F1 are computed with
        ``average='binary'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # The fourth element (support) returned by the helper is not needed.
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')

    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

def preprocess_function(examples, tokenizer):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every text is truncated to at most 512 tokens and padded to exactly 512,
    so all encoded rows have the same length.

    The tokenizer is invoked through its ``__call__`` interface with a single
    padding option. The previous implementation went through the deprecated
    ``batch_encode_plus`` entry point and additionally passed the deprecated
    ``pad_to_max_length=True`` flag, which duplicated ``padding='max_length'``.

    Args:
        examples: mapping with a ``"text"`` entry holding the raw strings
            (as supplied by ``Dataset.map(..., batched=True)`` in this notebook).
        tokenizer: callable tokenizer accepting ``truncation``, ``max_length``
            and ``padding`` keyword arguments.

    Returns:
        Whatever the tokenizer returns for the batch (its encoded fields).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding='max_length',
    )

# ---------------------------------------------------------------------------
# 10-fold cross-validation of a fine-tuned sequence classifier.
#
# For every fold a fresh model is loaded and fine-tuned on the fold's training
# part (minus a 5% validation slice used for checkpoint selection), then used
# to predict the fold's held-out part. Predictions of all folds are pooled and
# summarised in a single classification report at the end.
#
# Expects `labels` and `texts` to be defined by an earlier cell.
# ---------------------------------------------------------------------------
model_name = "allegro/herbert-large-cased"
runtime_name = "herbert-large-cased"

SEED = 42        # shared seed for the CV partition and the per-fold validation split
EVAL_EVERY = 25  # evaluate AND checkpoint every this many optimizer steps

tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length", max_length=512, truncation=True, do_lower_case=False)


# Tokenize the whole corpus once; folds only select row indices from it.
dataset_dict = {'label': labels, 'text': texts}
dataset_hf = Dataset.from_dict(dataset_dict)
dataset_hf = dataset_hf.map(lambda p: preprocess_function(p, tokenizer), batched=True)

# Arrays allow fancy indexing with the fold index arrays below.
labels = np.array(labels)
texts = np.array(texts)

true_labels = [] # ground truth
predictions = [] # model predictions

batch_size = 4
fold_nr=0
all_results = []
device = "cuda"

# Pre-declared so the per-fold cleanup below can drop the previous fold's objects.
model = trainer = None

kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
for train_index, test_index in kf.split(texts):

  print(f'Fold_nr= {fold_nr}')
  fold_nr += 1

  # Release the previous fold's model/trainer (and whatever the trainer holds)
  # BEFORE loading the next model, so two large models never coexist in memory.
  model = trainer = None
  gc.collect()
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

  model_config = BertConfig.from_pretrained(model_name, num_labels=2)
  model = BertForSequenceClassification.from_pretrained(model_name, config=model_config)

  training_args = TrainingArguments(
      do_eval=True,
      output_dir='./results-'+runtime_name,          # output directory
      overwrite_output_dir=True,
      num_train_epochs=10,              # total number of training epochs
      per_device_train_batch_size=batch_size,   # 8 @A100 batch size per device during training
      per_device_eval_batch_size=batch_size,    # 8 @A100 batch size for evaluation
      warmup_steps=120,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=80,
      evaluation_strategy="steps",     # evaluate every `eval_steps` optimizer steps
      save_strategy="steps",           # checkpoint every `save_steps` optimizer steps
      eval_steps=EVAL_EVERY,
      # Checkpoint at the same cadence as evaluation: the "best" model can only be
      # restored from a saved checkpoint, so with the default save_steps (500) most
      # evaluated steps could never be selected by load_best_model_at_end.
      # Raise EVAL_EVERY if checkpoint I/O becomes too expensive.
      save_steps=EVAL_EVERY,
      learning_rate=1e-5,
      save_total_limit=1,
      load_best_model_at_end=True,     # Whether or not to load the best model found during training at the end of training.
      metric_for_best_model='accuracy',
      disable_tqdm=False,
      run_name=runtime_name
  )

  train_dataset = dataset_hf.select(train_index)
  test_dataset = dataset_hf.select(test_index)

  # let's have validation set from 5% of train (seeded so re-runs use the same split):
  train_dataset = train_dataset.train_test_split(test_size=0.05, shuffle=True, seed=SEED)

  data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length', max_length=512, return_tensors='pt')

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset['train'],  # train part of train
      eval_dataset=train_dataset['test'],  # this is validation
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )
  trainer.train()

  # Predict the held-out fold and keep the arg-max class per example.
  predictions_obj = trainer.predict(test_dataset)
  y_pred = np.argmax(predictions_obj.predictions, axis=1)
  test_dataset = test_dataset.add_column("y_pred", y_pred)

  predictions.append( y_pred )
  true_labels.append( labels[test_index] )


# Pool the out-of-fold predictions of all folds into one report.
print( classification_report(np.concatenate(true_labels).ravel().tolist(), np.concatenate(predictions).ravel().tolist() ) )


tokenizer_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/556k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Map:   0%|          | 0/363 [00:00<?, ? examples/s]

Fold_nr= 0


pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33ma-wawer[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.545178,0.823529,0.903226,0.823529,1.0
50,No log,0.614302,0.823529,0.903226,0.823529,1.0
75,No log,0.771157,0.294118,0.25,1.0,0.142857
100,0.692100,0.455249,0.823529,0.903226,0.823529,1.0
125,0.692100,0.415899,0.823529,0.903226,0.823529,1.0
150,0.692100,0.313811,0.882353,0.923077,1.0,0.857143
175,0.629500,0.318814,0.882353,0.923077,1.0,0.857143
200,0.629500,0.443366,0.882353,0.923077,1.0,0.857143
225,0.629500,0.104423,0.941176,0.962963,1.0,0.928571
250,0.390300,0.188176,0.941176,0.962963,1.0,0.928571


Flattening the indices:   0%|          | 0/37 [00:00<?, ? examples/s]

Fold_nr= 1


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.668824,0.705882,0.0,0.0,0.0
50,No log,0.738623,0.470588,0.4,0.272727,0.75
75,No log,0.667322,0.705882,0.285714,0.333333,0.25
100,0.684000,0.683309,0.647059,0.25,0.25,0.25
125,0.684000,0.957998,0.470588,0.470588,0.307692,1.0
150,0.684000,0.802223,0.647059,0.25,0.25,0.25
175,0.582600,0.485525,0.823529,0.4,1.0,0.25
200,0.582600,0.424789,0.823529,0.4,1.0,0.25
225,0.582600,0.053893,1.0,1.0,1.0,1.0
250,0.313100,0.074592,0.941176,0.888889,0.8,1.0


Flattening the indices:   0%|          | 0/37 [00:00<?, ? examples/s]

Fold_nr= 2


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.681881,0.529412,0.5,0.4,0.666667
50,No log,0.66741,0.588235,0.533333,0.444444,0.666667
75,No log,0.630785,0.647059,0.571429,0.5,0.666667
100,0.685000,0.443241,0.823529,0.727273,0.8,0.666667
125,0.685000,0.442859,0.882353,0.8,1.0,0.666667
150,0.685000,1.00877,0.470588,0.571429,0.4,1.0
175,0.531700,0.456484,0.823529,0.8,0.666667,1.0
200,0.531700,0.321601,0.882353,0.857143,0.75,1.0
225,0.531700,1.009136,0.764706,0.75,0.6,1.0
250,0.331300,0.86867,0.823529,0.8,0.666667,1.0


Flattening the indices:   0%|          | 0/37 [00:00<?, ? examples/s]

Fold_nr= 3


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.716021,0.529412,0.636364,0.466667,1.0
50,No log,0.695007,0.529412,0.0,0.0,0.0
75,No log,0.691404,0.470588,0.571429,0.428571,0.857143
100,0.682400,0.606657,0.705882,0.545455,0.75,0.428571
125,0.682400,0.752663,0.470588,0.608696,0.4375,1.0
150,0.682400,0.593064,0.647059,0.4,0.666667,0.285714
175,0.657200,0.504937,0.823529,0.769231,0.833333,0.714286
200,0.657200,0.754098,0.823529,0.769231,0.833333,0.714286
225,0.657200,0.673974,0.882353,0.857143,0.857143,0.857143
250,0.333500,1.018111,0.823529,0.769231,0.833333,0.714286


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

Fold_nr= 4


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.749649,0.352941,0.47619,0.333333,0.833333
50,No log,0.787137,0.235294,0.380952,0.266667,0.666667
75,No log,0.674629,0.588235,0.0,0.0,0.0
100,0.687400,0.724018,0.529412,0.0,0.0,0.0
125,0.687400,0.671975,0.647059,0.4,0.5,0.333333
150,0.687400,0.646157,0.705882,0.545455,0.6,0.5
175,0.558800,0.782584,0.705882,0.545455,0.6,0.5
200,0.558800,1.234056,0.647059,0.625,0.5,0.833333
225,0.558800,1.124497,0.823529,0.769231,0.714286,0.833333
250,0.286900,1.188079,0.823529,0.769231,0.714286,0.833333


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

Fold_nr= 5


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.715896,0.411765,0.285714,0.25,0.333333
50,No log,0.763028,0.352941,0.521739,0.352941,1.0
75,No log,0.635011,0.588235,0.0,0.0,0.0
100,0.688800,0.573592,0.705882,0.444444,0.666667,0.333333
125,0.688800,0.543003,0.764706,0.666667,0.666667,0.666667
150,0.688800,0.473876,0.823529,0.769231,0.714286,0.833333
175,0.580400,0.467499,0.764706,0.666667,0.666667,0.666667
200,0.580400,0.429517,0.823529,0.727273,0.8,0.666667
225,0.580400,0.848013,0.823529,0.666667,1.0,0.5
250,0.392200,0.981905,0.823529,0.666667,1.0,0.5


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

Fold_nr= 6


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.680445,0.470588,0.526316,0.384615,0.833333
50,No log,0.708755,0.411765,0.545455,0.375,1.0
75,No log,0.602702,0.764706,0.5,1.0,0.333333
100,0.696100,0.592735,0.705882,0.705882,0.545455,1.0
125,0.696100,0.47765,0.941176,0.923077,0.857143,1.0
150,0.696100,0.360655,0.882353,0.833333,0.833333,0.833333
175,0.564000,0.485483,0.823529,0.8,0.666667,1.0
200,0.564000,0.245543,0.823529,0.666667,1.0,0.5
225,0.564000,0.305544,0.882353,0.857143,0.75,1.0
250,0.277400,0.060605,1.0,1.0,1.0,1.0


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

Fold_nr= 7


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.715385,0.294118,0.25,0.2,0.333333
50,No log,0.862528,0.352941,0.521739,0.352941,1.0
75,No log,0.658088,0.705882,0.285714,1.0,0.166667
100,0.690900,0.738069,0.352941,0.521739,0.352941,1.0
125,0.690900,0.574842,0.823529,0.769231,0.714286,0.833333
150,0.690900,0.462916,0.882353,0.8,1.0,0.666667
175,0.613400,0.44303,0.764706,0.75,0.6,1.0
200,0.613400,0.16016,0.941176,0.909091,1.0,0.833333
225,0.613400,0.234192,0.941176,0.909091,1.0,0.833333
250,0.292600,0.242855,0.941176,0.909091,1.0,0.833333


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

Fold_nr= 8


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.674652,0.470588,0.4,0.375,0.428571
50,No log,0.730422,0.411765,0.583333,0.411765,1.0
75,No log,0.652125,0.588235,0.222222,0.5,0.142857
100,0.695000,0.649274,0.588235,0.363636,0.5,0.285714
125,0.695000,0.590067,0.647059,0.4,0.666667,0.285714
150,0.695000,0.541164,0.823529,0.8,0.75,0.857143
175,0.586300,0.68656,0.705882,0.615385,0.666667,0.571429
200,0.586300,0.830058,0.823529,0.823529,0.7,1.0
225,0.586300,0.809035,0.823529,0.8,0.75,0.857143
250,0.311000,1.2322,0.764706,0.714286,0.714286,0.714286


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

Fold_nr= 9


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.69957,0.470588,0.181818,0.25,0.142857
50,No log,0.737436,0.411765,0.583333,0.411765,1.0
75,No log,0.689328,0.529412,0.0,0.0,0.0
100,0.677400,0.640742,0.647059,0.571429,0.571429,0.571429
125,0.677400,0.692626,0.529412,0.5,0.444444,0.571429
150,0.677400,0.664166,0.588235,0.666667,0.5,1.0
175,0.719600,0.62188,0.647059,0.5,0.6,0.428571
200,0.719600,0.629887,0.705882,0.736842,0.583333,1.0
225,0.719600,0.561853,0.823529,0.823529,0.7,1.0
250,0.606900,0.657428,0.705882,0.666667,0.625,0.714286


Flattening the indices:   0%|          | 0/36 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       168
           1       0.90      0.86      0.88       195

    accuracy                           0.88       363
   macro avg       0.88      0.88      0.88       363
weighted avg       0.88      0.88      0.88       363

