# Natural Language Processing - lab 7 (Text classification)

Bartosz Klimza

# Necessary imports

In [1]:
!pip install datasets
!pip install fasttext
!pip install transformers
!pip install simpletransformers

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.0 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 36.7 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 37.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 354 kB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 74.8 MB/s 
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86

In [2]:
from datasets import load_dataset
import fasttext
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
import pandas as pd
import transformers
from simpletransformers.classification import ClassificationModel

# 1. Get acquainted with the data of the Polish Cyberbullying detection dataset. Pay special attention to the distribution of the positive and negative examples in the first task as well as distribution of the classes in the second task.

In [3]:
def show_metrics(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average=None)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    mcc = matthews_corrcoef(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print(f'F1: {f1}\nF1 macro: {macro_f1}\nF1 micro: {micro_f1}\nMCC: {mcc}\nAccuracy: {acc}')

In [4]:
dataset_1 = load_dataset("poleval2019_cyberbullying", "task01")
dataset_2 = load_dataset("poleval2019_cyberbullying", "task02")

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading and preparing dataset poleval2019_cyber_bullying/task01 (download: 400.39 KiB, generated: 1.16 MiB, post-processed: Unknown size, total: 1.55 MiB) to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450...


Downloading:   0%|          | 0.00/340k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset poleval2019_cyber_bullying downloaded and prepared to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading and preparing dataset poleval2019_cyber_bullying/task02 (download: 400.53 KiB, generated: 1.16 MiB, post-processed: Unknown size, total: 1.55 MiB) to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450...


Downloading:   0%|          | 0.00/340k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset poleval2019_cyber_bullying downloaded and prepared to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
print(dataset_1)
print("\n")
print(dataset_2)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10041
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10041
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})


In [None]:
examples = {"0":0, "1":0}
for i in dataset_1["train"]:
  if i["label"] == 0:
    examples["0"] += 1
  else:
    examples["1"] += 1

print("Dataset 1")
print("0:", examples["0"])
print("1:", examples["1"])

Dataset 1
0: 9190
1: 851


In [None]:
examples_2 = {"0":0, "1":0, "2":0}
for i in dataset_2["train"]:
  if i["label"] == 0:
    examples_2["0"] += 1
  elif i["label"] == 1:
    examples_2["1"] += 1
  else:
    examples_2["2"] += 1

print("Dataset 2")
print("0:", examples_2["0"])
print("1:", examples_2["1"])
print("2:", examples_2["2"])

Dataset 2
0: 9190
1: 253
2: 598


In [5]:
dataset_1_train_text = []
dataset_1_train_labels = []
for i in dataset_1["train"]:
  dataset_1_train_text.append(i["text"])
  dataset_1_train_labels.append(i["label"])

dataset_1_test_text = []
dataset_1_test_labels = []
for i in dataset_1["test"]:
  dataset_1_test_text.append(i["text"])
  dataset_1_test_labels.append(i["label"])

dataset_2_train_text = []
dataset_2_train_labels = []
for i in dataset_2["train"]:
  dataset_2_train_text.append(i["text"])
  dataset_2_train_labels.append(i["label"])

dataset_2_test_text = []
dataset_2_test_labels = []
for i in dataset_2["test"]:
  dataset_2_test_text.append(i["text"])
  dataset_2_test_labels.append(i["label"])

# 2. Train the following classifiers on the training sets (for the task 1 and the task 2):

* Bayesian classifier with TF * IDF weighting
* Fasttext text classifier
* Transformer classifier

In [15]:
# Fasttext text classifier

with open("fasttext_train_1.txt", "w") as file:
  for i in dataset_1["train"]:
    l, t = i["label"], i["text"]
    file.write(f"__label__{l} {t}\n")

with open("fasttext_train_2.txt", "w") as file:
  for i in dataset_2["train"]:
    l, t = i["label"], i["text"]
    file.write(f"__label__{l} {t}\n")


fasttext_model_1 = fasttext.train_supervised("fasttext_train_1.txt")
fasttext_model_2 = fasttext.train_supervised("fasttext_train_2.txt")

In [16]:
results_fasttext_1_train = []
for i in dataset_1_train_text:
  res = fasttext_model_1.predict(i.replace("\n", ""))[0][0][-1]
  results_fasttext_1_train.append(int(res))

show_metrics(dataset_1_train_labels, results_fasttext_1_train)

F1: [0.97335339 0.66212534]
F1 macro: 0.8177393652605096
F1 micro: 0.9506025296285231
MCC: 0.6457900385885745
Accuracy: 0.9506025296285231


In [17]:
results_fasttext_2_train = []
for i in dataset_2_train_text:
  res = fasttext_model_2.predict(i.replace("\n", ""))[0][0][-1]
  results_fasttext_2_train.append(int(res))

show_metrics(dataset_2_train_labels, results_fasttext_2_train)

F1: [0.9636536  0.28985507 0.14035088]
F1 macro: 0.4646198508969615
F1 micro: 0.9217209441290708
MCC: 0.3213453250850878
Accuracy: 0.9217209441290708


In [10]:
# Transformer classifier

model = ClassificationModel(
    "roberta",
    "allegro/herbert-base-cased",
    use_cuda=False)

model_2 = ClassificationModel(
    "roberta",
    "allegro/herbert-base-cased",
    use_cuda=False,
    num_labels=3)

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing RobertaForSequenceClassification: ['bert.encoder.layer.9.attention.self.query.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.7.output.dense.weight', 'bert.encoder.layer.4.attention.self.key.weight', 'bert.encoder.layer.10.attention.self.value.bias', 'bert.encoder.layer.11.attention.self.query.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.5.attention.self.key.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.10.attention.self.query.bias', 'bert.encoder.layer.6.intermediate.dense.bias', 'bert.encoder.layer.1.output.dense.bias', 'bert.encoder.layer.5.attention.self.value.weight', 'bert.encoder.layer.6.attention.self.value.bias', 'be

In [None]:
transformer_train_1 = pd.DataFrame({"text": dataset_1_train_text, "label": dataset_1_train_labels})

model.train_model(transformer_train_1, weight= [0.2, 0.8])
result, model_outputs, wrong_predictions = model.eval_model(eval_df = transformer_train_1, f1=f1_score, accuracy = accuracy_score)
result

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10041 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1256 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10041 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1256 [00:00<?, ?it/s]

{'mcc': 0.0,
 'tp': 0,
 'tn': 9190,
 'fp': 0,
 'fn': 851,
 'auroc': 0.5677559141201095,
 'auprc': 0.09774555364904958,
 'f1': 0.0,
 'accuracy': 0.915247485310228,
 'eval_loss': 0.3166033021963326}

In [23]:
transformer_train_2 = pd.DataFrame({"text": dataset_2_train_text, "label": dataset_2_train_labels})

model_2.train_model(transformer_train_2, weight= [0.2, 0.8])
result, model_outputs, wrong_predictions = model_2.eval_model(eval_df = transformer_train_2, f1=f1_score, accuracy = accuracy_score)
result

# 3. Compare the results of classification on the test set. Select the appropriate measures (from accuracy, F1, macro/micro F1, MCC) to compare the results

In [18]:
# Fasttext text classifier

results_fasttext_1_test = []
for i in dataset_1_test_text:
  res = fasttext_model_1.predict(i.replace("\n", ""))[0][0][-1]
  results_fasttext_1_test.append(int(res))

show_metrics(dataset_1_test_labels, results_fasttext_1_test)

F1: [0.93056315 0.25730994]
F1 macro: 0.5939365453911798
F1 micro: 0.8729999999999999
MCC: 0.2650301059500807
Accuracy: 0.873


In [19]:
results_fasttext_2_test = []
for i in dataset_2_test_text:
  res = fasttext_model_2.predict(i.replace("\n", ""))[0][0][-1]
  results_fasttext_2_test.append(int(res))

show_metrics(dataset_2_test_labels, results_fasttext_2_test)

F1: [0.93146249 0.12121212 0.05263158]
F1 macro: 0.36843539780455736
F1 micro: 0.868
MCC: 0.16001981125515372
Accuracy: 0.868


In [None]:
# Transformer classifier

transformer_test_1 = pd.DataFrame({"text": dataset_1_test_text, "label": dataset_1_test_labels})

model.train_model(transformer_test_1, weight= [0.2, 0.8])
result, model_outputs, wrong_predictions = model.eval_model(eval_df = transformer_test_1, f1=f1_score, accuracy = accuracy_score)
result

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/125 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

{'accuracy': 0.866,
 'auprc': 0.23652644826814134,
 'auroc': 0.7001396022198476,
 'eval_loss': 0.4077161973118782,
 'f1': 0.0,
 'fn': 134,
 'fp': 0,
 'mcc': 0.0,
 'tn': 866,
 'tp': 0}

In [22]:
transformer_test_2 = pd.DataFrame({"text": dataset_2_test_text, "label": dataset_2_test_labels})

model_2.train_model(transformer_test_2, weight= [0.2, 0.8])
result, model_outputs, wrong_predictions = model_2.eval_model(eval_df = transformer_test_2, f1=f1_score, accuracy = accuracy_score)
result

# 5. Answer the following questions:
* Which of the classifiers works the best for the task 1 and the task 2.

Fasttext i transformery osiągają podobne wyniki

* Did you achieve results comparable with the results of PolEval Task?

Nie

* Did you achieve results comparable with the Klej leaderboard?

Nie

* Describe strengths and weaknesses of each of the compared algorithms.

Fasttext działa szybko i osiąga dobre wyniki, transformery osiągają dobre wyniki, ale działają bardzo powoli

* Do you think comparison of raw performance values on a single task is enough to assess the value of a given algorithm/model?
* Did SHAP show that the models use valuable features/words when performing their decision?