In [35]:
from datasets import load_dataset, load_metric
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import fasttext
import numpy as np
import pandas as pd

In [36]:
dataset01 = load_dataset("poleval2019_cyberbullying", "task01")
dataset02 = load_dataset("poleval2019_cyberbullying", "task02")

Reusing dataset poleval2019_cyber_bullying (/home/arkadius/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset poleval2019_cyber_bullying (/home/arkadius/.cache/huggingface/datasets/poleval2019_cyber_bullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450)


  0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
train_dataset01 = dataset01['train']
test_dataset01 = dataset01['test']
train_dataset02 = dataset02['train']
test_dataset02 = dataset02['test']

In [38]:
train_dataset01[:20]

{'text': ['Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.',
  '@anonymized_account @anonymized_account Brawo ty Daria kibic ma być na dobre i złe',
  '@anonymized_account @anonymized_account Super, polski premier składa kwiaty na grobach kolaborantów. Ale doczekaliśmy czasów.',
  '@anonymized_account @anonymized_account Musi. Innej drogi nie mamy.',
  'Odrzut natychmiastowy, kwaśna mina, mam problem',
  'Jaki on był fajny xdd pamiętam, że spóźniłam się na jego pierwsze zajęcia i to sporo i za karę kazał mi usiąść w pierwszej ławce XD',
  '@anonymized_account No nie ma u nas szczęścia 😉',
  '@anonymized_account Dawno kogoś tak wrednego nie widziałam xd',
  '@anonymized_account @anonymized_account Zaległości były, ale ważne czy były wezwania do zapłaty z których się klub nie wywiązał.',
  '@anonymized_account @anonymized_account @anonymized_account Gdzie jest @anonymized_account . Brudziński jesteś kłamcą i marnym kutasem @anonymized_account',
  '@anonymiz

In [39]:
train_dataset02[:20]

{'text': ['Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.',
  '@anonymized_account @anonymized_account Brawo ty Daria kibic ma być na dobre i złe',
  '@anonymized_account @anonymized_account Super, polski premier składa kwiaty na grobach kolaborantów. Ale doczekaliśmy czasów.',
  '@anonymized_account @anonymized_account Musi. Innej drogi nie mamy.',
  'Odrzut natychmiastowy, kwaśna mina, mam problem',
  'Jaki on był fajny xdd pamiętam, że spóźniłam się na jego pierwsze zajęcia i to sporo i za karę kazał mi usiąść w pierwszej ławce XD',
  '@anonymized_account No nie ma u nas szczęścia 😉',
  '@anonymized_account Dawno kogoś tak wrednego nie widziałam xd',
  '@anonymized_account @anonymized_account Zaległości były, ale ważne czy były wezwania do zapłaty z których się klub nie wywiązał.',
  '@anonymized_account @anonymized_account @anonymized_account Gdzie jest @anonymized_account . Brudziński jesteś kłamcą i marnym kutasem @anonymized_account',
  '@anonymiz

## Classifiers

In [80]:
scores_columns = ["Classifier", "Accuracy", "Precision", "Recall", "F1"]

def to_scores_df(model_name,scores_dict):
    return pd.DataFrame(data=[[
        model_name,
        scores_dict["accuracy"],
        scores_dict["precision"],
        scores_dict["recall"],
        scores_dict["f1"],
    ]], columns=scores_columns)

scores_singleclass_df = pd.DataFrame(data=[], columns=scores_columns)
scores_multiclass_df = pd.DataFrame(data=[], columns=scores_columns)

#### Baessian + TF-IDF

In [44]:
def tf_idf(train_dataset, test_dataset):    
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_dataset)
    return vectorizer.transform(train_dataset).toarray(), vectorizer.transform(test_dataset).toarray()

train_dataset01_tfidf, test_dataset01_tfidf = tf_idf(train_dataset01['text'], test_dataset01['text'])
train_dataset02_tfidf, test_dataset02_tfidf = tf_idf(train_dataset02['text'], test_dataset02['text'])

In [45]:
gnb01 = GaussianNB()
gnb01.fit(train_dataset01_tfidf, train_dataset01['label'])

GaussianNB()

In [46]:
gnb02 = GaussianNB()
gnb02.fit(train_dataset02_tfidf, train_dataset02['label'])

GaussianNB()

In [63]:
def evaluate(predictions, test_y):
    y = predictions
    accuracy = accuracy_score(y, test_y)
    precision = precision_score(y, test_y)
    recall = recall_score(y, test_y)
    f1 = f1_score(y, test_y)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1} 

def evaluate_multiclass(predictions, test_y):
    y = predictions
    accuracy = accuracy_score(y, test_y)
    precision = precision_score(y, test_y, average='macro')
    recall = recall_score(y, test_y, average='macro')
    f1 = f1_score(y, test_y, average='macro')
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1} 

In [81]:
gnb01_scores = evaluate(gnb01.predict(test_dataset01_tfidf), test_dataset01['label'])
gnb02_scores = evaluate_multiclass(gnb02.predict(test_dataset02_tfidf), test_dataset02['label'])
scores_singleclass_df = scores_singleclass_df.append(to_scores_df("GaussianNaiveBayes", gnb01_scores))
scores_multiclass_df = scores_multiclass_df.append(to_scores_df("GaussianNaiveBayes", gnb02_scores))

#### Fasttext  

In [66]:
help(fasttext.FastText)

Help on module fasttext.FastText in fasttext:

NAME
    fasttext.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the MIT license found in the
    # LICENSE file in the root directory of this source tree.

FUNCTIONS
    cbow(*kargs, **kwargs)
    
    load_model(path)
        Load a model given a filepath and return a model object.
    
    read_args(arg_list, arg_dict, arg_names, default_values)
    
    skipgram(*kargs, **kwargs)
    
    supervised(*kargs, **kwargs)
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(*kargs, **kwargs)
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized
        as per the tokenize function, but it must be preprocessed and encoded
        as UTF-8. You might want to consult standard preprocessi

In [75]:
def to_fast_text_input_file(dataset, filename):
    with open(filename, "w") as f:
        for label, text in zip(dataset['label'], dataset['text']):
            f.write(f"__label__{label} {text}\n")

to_fast_text_input_file(train_dataset01, 'fasttext_train01.txt')
to_fast_text_input_file(test_dataset01, 'fasttext_test01.txt')
to_fast_text_input_file(train_dataset02, 'fasttext_train02.txt')
to_fast_text_input_file(test_dataset02, 'fasttext_test02.txt')

In [76]:
fasttext_model01 = fasttext.train_supervised('fasttext_train01.txt')
fasttext_model02 = fasttext.train_supervised('fasttext_train02.txt')

In [82]:
def fasttext_scores_dict(test_res):
    return { "accuracy": test_res[1], "precision": None, "recall": None, "f1": None }

fasttext_model01_scores = fasttext_scores_dict(fasttext_model01.test('fasttext_test01.txt'))
fasttext_model02_scores = fasttext_scores_dict(fasttext_model01.test('fasttext_test02.txt'))
scores_singleclass_df = scores_singleclass_df.append(to_scores_df("fastText", fasttext_model01_scores))
scores_multiclass_df = scores_multiclass_df.append(to_scores_df("fastText", fasttext_model02_scores))

### Transformers
https://huggingface.co/docs/transformers/custom_datasets

In [99]:
def tokenized(tokenizer_name, dataset):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)
    tokenized_dt = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return tokenizer, tokenized_dt

def fine_tuned(model_name, dataset, expected_labels):
    print("Tokenization")
    tokenizer, tokenized_dt = tokenized(model_name, dataset)
    print("Training")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=expected_labels)
    
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dt["train"],
        eval_dataset=tokenized_dt["test"],
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()
    return model

In [6]:
# herbert_fine_tuned = fine_tuned("allegro/herbert-base-cased", dataset01, 2)
# herbert_fine_tuned.save_pretrained("herbert-base-cased-bullying")

herbert_fine_tuned = AutoModelForSequenceClassification.from_pretrained("herbert-base-cased-bullying", local_files_only=True)

In [7]:
# herbert_fine_tuned02 = fine_tuned("allegro/herbert-base-cased", dataset02, 3)
# herbert_fine_tuned02.save_pretrained("herbert-base-cased-bullying02")

herbert_fine_tuned02 =  AutoModelForSequenceClassification.from_pretrained("herbert-base-cased-bullying02", local_files_only=True)

In [100]:
def compute_metrics_singleclass(p):    
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    return evaluate(pred, labels)

def compute_metrics_multiclass(p):    
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    return evaluate_multiclass(pred, labels)

def evaluate_transformers(model, dataset, tokenizer_name, compute_metrics):
    print("Tokenization")
    tokenizer, tokenized_dt = tokenized(tokenizer_name, dataset)
    print("Evaluation")
    trainer = Trainer(model=model,
                      eval_dataset=tokenized_dt["test"],
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    return trainer.evaluate()

In [101]:
def transformers_to_scores(eval_res):
    return {"accuracy": eval_res["eval_accuracy"], "precision": eval_res["eval_precision"], "recall": eval_res["eval_recall"], "f1": eval_res["eval_f1"]} 

herbert_fine_tuned_score = transformers_to_scores(
    evaluate_transformers(herbert_fine_tuned, dataset01, "allegro/herbert-base-cased", compute_metrics_multiclass))
herbert_fine_tuned02_score = transformers_to_scores(
    evaluate_transformers(herbert_fine_tuned02, dataset02, "allegro/herbert-base-cased", compute_metrics_multiclass))

scores_singleclass_df = scores_singleclass_df.append(to_scores_df("transformers_herbert-cased", herbert_fine_tuned_score))
scores_multiclass_df = scores_multiclass_df.append(to_scores_df("transformers_herbert-cased", herbert_fine_tuned02_score))

Tokenization


loading configuration file https://huggingface.co/allegro/herbert-base-cased/resolve/main/config.json from cache at /home/arkadius/.cache/huggingface/transformers/d24c58747dbe6b61ed3e1eb5d488dfec9332ed13dd3f8983588f30d96f6f1bde.193ae07fbea6bb9ac46f854cd03094e486dfa4483e0596fd6a159dcfaef521a5
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "HerbertTokenizerFast",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

loading file https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.

Evaluation


Tokenization


loading configuration file https://huggingface.co/allegro/herbert-base-cased/resolve/main/config.json from cache at /home/arkadius/.cache/huggingface/transformers/d24c58747dbe6b61ed3e1eb5d488dfec9332ed13dd3f8983588f30d96f6f1bde.193ae07fbea6bb9ac46f854cd03094e486dfa4483e0596fd6a159dcfaef521a5
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "HerbertTokenizerFast",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

loading file https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.

Evaluation


## Results

In [102]:
scores_singleclass_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1
0,GaussianNaiveBayes,0.782,0.298507,0.243902,0.268456
0,fastText,0.873,,,
0,transformers_herbert_cased,0.904,0.833333,0.701717,0.744613
0,transformers_herbert_cased,0.904,0.701717,0.833333,0.744613
0,transformers_herbert-cased,0.904,0.701717,0.833333,0.744613


In [103]:
scores_multiclass_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1
0,GaussianNaiveBayes,0.787,0.408183,0.401325,0.396831
0,fastText,0.968575,,,
0,transformers_herbert_cased,0.881,0.5762,0.463077,0.488148
0,transformers_herbert_cased,0.881,0.463077,0.5762,0.488148
0,transformers_herbert-cased,0.881,0.463077,0.5762,0.488148
