In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from ast import literal_eval

In [None]:
HS_PATH = '../data/span_annotations_reviewed'

# Two annotation exports: tweets with reviewed spans, and (going by the file
# name) tweets without hateful content. Both carry 'tokens' and 'tags' columns.
df_spans = pd.read_csv(f'{HS_PATH}/common_annotations_with_tags.csv')
df_not_spans = pd.read_csv(f'{HS_PATH}/nonhateful_tweets_with_tags.csv')

# Use every available tweet from both files.
# A class-balanced variant would instead down-sample the second frame, e.g.
#   df_not_spans.sample(len(df_spans), random_state=42)
# (only possible when it has at least as many rows as df_spans).
df_all = pd.concat([df_spans, df_not_spans])

# 'tokens' and 'tags' are read from the CSV as strings; parse them back into Python lists.
df_all['tokens'] = df_all['tokens'].apply(literal_eval)
df_all['tags'] = df_all['tags'].apply(literal_eval)

# Hold out 20% of the rows, then split the hold-out in half:
# 80% train / 10% validation / 10% test of the full data.
train_df, temp_df = train_test_split(df_all, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert each pandas split to a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Bundle the splits under the names the training cells look up.
ds = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['Tweet_id', 'Target', 'Text', 'Span', 'tokens', 'tags', 'genel_tutum', 'hedef_grup', 'derece', 'kategori', 'saldırgan_dil', 'annotator', 'uploaded_at', 'konu', 'dil', '__index_level_0__'],
        num_rows: 1836
    })
    validation: Dataset({
        features: ['Tweet_id', 'Target', 'Text', 'Span', 'tokens', 'tags', 'genel_tutum', 'hedef_grup', 'derece', 'kategori', 'saldırgan_dil', 'annotator', 'uploaded_at', 'konu', 'dil', '__index_level_0__'],
        num_rows: 229
    })
    test: Dataset({
        features: ['Tweet_id', 'Target', 'Text', 'Span', 'tokens', 'tags', 'genel_tutum', 'hedef_grup', 'derece', 'kategori', 'saldırgan_dil', 'annotator', 'uploaded_at', 'konu', 'dil', '__index_level_0__'],
        num_rows: 230
    })
})


In [None]:
import numpy as np
import evaluate

# Load the "seqeval" metric through the `evaluate` library; compute_metrics below calls seqeval.compute.
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Turn raw model outputs into overall seqeval precision/recall/F1/accuracy.

    `p` unpacks into (predictions, labels). The predicted class of each position
    is the argmax over axis 2 of `predictions`. Positions whose gold label is
    -100 are left out of both the predicted and the gold tag sequences.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted, gold):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            # Map class ids to tag names via the module-level label_list.
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    print(results)

    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in metric_names}

In [None]:
from transformers import AutoTokenizer


def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align their word-level tags to the sub-tokens.

    Args:
        examples: Batch with a "tokens" entry (one list of words per example)
            and a "tags" entry (one tag id per word).
        label_all_tokens: When True (the default, matching the previous
            behaviour) every sub-token receives the tag of the word it belongs
            to. When False only the first sub-token of each word is tagged and
            the remaining sub-tokens get -100.

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        ids per example. Positions without a word id (e.g. special tokens) are
        labelled -100, the value compute_metrics filters out.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        # word_ids maps every produced token to the index of its source word (or None).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token in first-token-only mode.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tag inventory for the span labels; the integer ids index into label_list.
label2id = {
    "O": 0,
    "B-HATE": 1,
    "I-HATE": 2,
}
# Inverse mapping, derived rather than written out a second time.
id2label = {idx: name for name, idx in label2id.items()}
label_list = [id2label[idx] for idx in id2label]

# Tokenize every split in batches; the mapped function adds the aligned "labels" column.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
tokenized_dataset = ds.map(tokenize_and_align_labels, batched=True)



Map:   0%|          | 0/1836 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

# Pretrained encoder with a freshly initialised 3-way token-classification head.
model = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-base-turkish-cased",
    num_labels=3,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='BERTurk_hate_span_all/',
    num_train_epochs=10,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint on disk.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Model selection: reload the checkpoint with the lowest validation loss.
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,
    # Logging.
    logging_dir="./logs",
    logging_steps=100,
    push_to_hub=False,
    # Left at the library defaults (previously tried overrides):
    #   learning_rate=2e-5, per_device_train_batch_size=16,
    #   per_device_eval_batch_size=16, weight_decay=0.01, report_to="wandb"
)

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()
# Final score on the held-out test split (shown as the cell output).
trainer.evaluate(tokenized_dataset["test"])

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1724,0.127283,0.247664,0.490741,0.329193,0.95965
2,0.1228,0.140954,0.386555,0.425926,0.405286,0.968397
3,0.0564,0.109368,0.395522,0.490741,0.438017,0.971877
4,0.0414,0.122616,0.519231,0.5,0.509434,0.973947
5,0.0165,0.154784,0.435897,0.472222,0.453333,0.971313
6,0.0069,0.195906,0.56044,0.472222,0.512563,0.974887


{'HATE': {'precision': 0.24766355140186916, 'recall': 0.49074074074074076, 'f1': 0.32919254658385094, 'number': 108}, 'overall_precision': 0.24766355140186916, 'overall_recall': 0.49074074074074076, 'overall_f1': 0.32919254658385094, 'overall_accuracy': 0.9596501128668171}
{'HATE': {'precision': 0.3865546218487395, 'recall': 0.42592592592592593, 'f1': 0.4052863436123348, 'number': 108}, 'overall_precision': 0.3865546218487395, 'overall_recall': 0.42592592592592593, 'overall_f1': 0.4052863436123348, 'overall_accuracy': 0.9683972911963883}
{'HATE': {'precision': 0.39552238805970147, 'recall': 0.49074074074074076, 'f1': 0.4380165289256198, 'number': 108}, 'overall_precision': 0.39552238805970147, 'overall_recall': 0.49074074074074076, 'overall_f1': 0.4380165289256198, 'overall_accuracy': 0.971877351392024}
{'HATE': {'precision': 0.5192307692307693, 'recall': 0.5, 'f1': 0.509433962264151, 'number': 108}, 'overall_precision': 0.5192307692307693, 'overall_recall': 0.5, 'overall_f1': 0.509433

{'HATE': {'precision': 0.6324786324786325, 'recall': 0.5174825174825175, 'f1': 0.5692307692307692, 'number': 143}, 'overall_precision': 0.6324786324786325, 'overall_recall': 0.5174825174825175, 'overall_f1': 0.5692307692307692, 'overall_accuracy': 0.9700275064023522}


{'eval_loss': 0.12922142446041107,
 'eval_precision': 0.6324786324786325,
 'eval_recall': 0.5174825174825175,
 'eval_f1': 0.5692307692307692,
 'eval_accuracy': 0.9700275064023522,
 'eval_runtime': 1.6595,
 'eval_samples_per_second': 138.597,
 'eval_steps_per_second': 17.475,
 'epoch': 6.0}

In [None]:
predictions, label_ids, metrics = trainer.predict(tokenized_dataset["test"])


def _render_tagged_tokens(tag_ids, input_ids):
    """Show tokens tagged 1 or 2 as text and every other token as '-'.

    Tokens are joined with spaces and ' ##' continuation markers are glued back
    onto the previous piece, so gold and predicted renderings share one format.
    """
    pieces = [
        tokenizer.convert_ids_to_tokens(token) if tag in [1, 2] else '-'
        for tag, token in zip(tag_ids, input_ids)
    ]
    return ' '.join(pieces).replace(' ##', '')


# Pick one random test example and fetch its row once instead of materialising
# whole dataset columns for every field.
ix = np.random.randint(len(tokenized_dataset["test"]))
example = tokenized_dataset["test"][ix]

# Predictions for a row may be longer than its input_ids; keep only the
# positions that correspond to this example's tokens.
predicted_tags = np.argmax(predictions[ix], axis=1)[:len(example['input_ids'])]

print('Text', example['Text'])
print('Label', _render_tagged_tokens(example['labels'], example['input_ids']))
_render_tagged_tokens(predicted_tags, example['input_ids'])

Text Zâlimleri kahret Ya Rab
  Kahhar ol katil israil 
#Terroristisrail

#GazzedeKatliamVar 

#GazaUnderAttack

#GenocideinGaza https://t.co/snDMgzeJbO
Label ------------------------------------------------------


'- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -'

In [None]:
# One-time setup, kept commented out: upgrade huggingface_hub and log in from the CLI.
#!pip install --upgrade huggingface_hub
#!huggingface-cli login

# Upload through the Trainer; the recorded output shows model.safetensors and training_args.bin being uploaded.
trainer.push_to_hub()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cl

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/gokceuludogan/BERTurk_hate_span_all/commit/44a8a46e52cd8170b4c332d8bf39cf8e1abbd613', commit_message='End of training', commit_description='', oid='44a8a46e52cd8170b4c332d8bf39cf8e1abbd613', pr_url=None, pr_revision=None, pr_num=None)

## Inference with Hugging Face Pipeline

In [None]:
from transformers import pipeline

# Name the task explicitly instead of leaving it for `pipeline` to infer from the model.
# With aggregation_strategy="simple" the recorded output contains grouped
# entries carrying 'entity_group', 'score', 'word', 'start' and 'end'.
token_classifier = pipeline(
    "token-classification",
    model="gokceuludogan/BERTurk_hate_span_all",
    aggregation_strategy="simple",
)

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/755k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Fixed index into the test split; use
# np.random.randint(len(tokenized_dataset["test"])) to look at a random example instead.
ix = 53
sentence = tokenized_dataset["test"][ix]["Text"]
print(sentence)

# Raw pipeline output, displayed as the cell result.
tokens = token_classifier(sentence)
tokens

Hankendi'nde kalan sonuncu  ermeniler bir kez daha sokaklara döküldü.
 Aslında Ermeni medyası bunu Azerbaycan karşıtı olarak paylaşır. 
Öyle deyil. Onlar Azerbaycan ordusun karşılamaya hazırlasır😂
Bu ermenileride aldatdilar. Zavalilara 15 maysa qeder dayanın dediler.😂 https://t.co/5FWdnEQw9S


[{'entity_group': 'LABEL_0',
  'score': 0.96923655,
  'word': "Hankendi ' nde kalan",
  'start': 0,
  'end': 18},
 {'entity_group': 'LABEL_1',
  'score': 0.38793626,
  'word': 'sonun',
  'start': 19,
  'end': 24},
 {'entity_group': 'LABEL_2',
  'score': 0.437279,
  'word': '##cu',
  'start': 24,
  'end': 26},
 {'entity_group': 'LABEL_1',
  'score': 0.49920994,
  'word': 'er',
  'start': 28,
  'end': 30},
 {'entity_group': 'LABEL_2',
  'score': 0.6165258,
  'word': '##meniler',
  'start': 30,
  'end': 37},
 {'entity_group': 'LABEL_0',
  'score': 0.86957717,
  'word': 'bir kez daha sokaklara döküldü. Aslında Ermeni medyası bunu Azerbaycan karşıtı olarak paylaşır. Öyle deyil. Onlar Azerbaycan ordusun karşılamaya hazırlasır😂 Bu',
  'start': 38,
  'end': 199},
 {'entity_group': 'LABEL_1',
  'score': 0.5421688,
  'word': 'ermeni',
  'start': 200,
  'end': 206},
 {'entity_group': 'LABEL_0',
  'score': 0.975733,
  'word': '##leride aldatdilar. Zavalilara 15 maysa qeder dayanın dediler. 😂 https

In [None]:
def merge_spans(data, hate_labels=('LABEL_1', 'LABEL_2')):
    """Merge neighbouring hate-labelled pipeline entries into single spans.

    Args:
        data: Entries with 'entity_group', 'word', 'start' and 'end' keys, in
            document order (the shape of the token-classification output above).
        hate_labels: Entity groups treated as part of a hate span.

    Returns:
        A new list in document order. Entries outside `hate_labels` are passed
        through unchanged. Consecutive hate entries are merged when the next
        one starts exactly at the current end (glued together) or one character
        after it (joined with a space). A merged span keeps the 'entity_group'
        and 'score' of its first entry. Input dicts are not modified.
    """
    def _without_continuation_marker(word):
        # Drop only a leading '##' sub-word marker; str.strip('##') would also
        # eat '#' characters at either end of real text such as hashtags.
        return word[2:] if word.startswith('##') else word

    merged_data = []
    current_span = None

    for entry in data:
        if entry['entity_group'] not in hate_labels:
            # Flush the pending span first so the output stays in document
            # order and a span never merges across a non-hate entry.
            if current_span is not None:
                merged_data.append(current_span)
                current_span = None
            merged_data.append(entry)
            continue

        gap = None if current_span is None else entry['start'] - current_span['end']
        if gap == 0:
            current_span['word'] += _without_continuation_marker(entry['word'])
            current_span['end'] = entry['end']
        elif gap == 1:
            current_span['word'] += ' ' + _without_continuation_marker(entry['word'])
            current_span['end'] = entry['end']
        else:
            if current_span is not None:
                merged_data.append(current_span)
            # Copy so that extending the span does not mutate the caller's entry.
            current_span = dict(entry)

    if current_span is not None:
        merged_data.append(current_span)
    return merged_data

def annotate_text(data, hate_labels=('LABEL_1', 'LABEL_2')):
    """Convert span entries into a list of plain strings and (word, "") tuples.

    Args:
        data: Entries with 'entity_group', 'word' and 'start' keys.
        hate_labels: Entity groups emitted as (word, "") tuples.

    Returns:
        One item per entry, ordered by 'start'. Entries in `hate_labels` become
        a (word, "") tuple. Every other entry becomes its word as a string,
        with a leading '##' sub-word marker removed.
    """
    result_list = []

    for item in sorted(data, key=lambda x: x['start']):
        word = item['word']
        if item['entity_group'] in hate_labels:
            result_list.append((word, ""))
        else:
            # Remove only the leading continuation marker; str.strip('##')
            # would also drop '#' characters belonging to the text (hashtags).
            result_list.append(word[2:] if word.startswith('##') else word)

    return result_list
# Merge the pipeline output for the example sentence, then list it as plain strings and (word, "") tuples.
annotate_text(merge_spans(tokens))

["Hankendi ' nde kalan",
 ('sonuncu', ''),
 ('ermeniler', ''),
 'bir kez daha sokaklara döküldü. Aslında Ermeni medyası bunu Azerbaycan karşıtı olarak paylaşır. Öyle deyil. Onlar Azerbaycan ordusun karşılamaya hazırlasır😂 Bu',
 ('ermeni', ''),
 'leride aldatdilar. Zavalilara 15 maysa qeder dayanın dediler. 😂 https : / / t. co / 5FWdnEQw9S']

## Inference with the Hugging Face Inference API

In [None]:
import requests
import os 
API_URL = "https://api-inference.huggingface.co/models/gokceuludogan/BERTurk_hate_span_all"

# Send an Authorization header only when HF_TOKEN is set; otherwise the
# request would carry the literal string "Bearer None".
hf_token = os.environ.get('HF_TOKEN')
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
payload = {"inputs": sentence}

# The timeout keeps the cell from hanging indefinitely if the endpoint does not answer.
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
print(response.json())

[{'entity_group': 'LABEL_0', 'score': 0.9579712748527527, 'word': 'Herkes gitsin,', 'start': 0, 'end': 14}, {'entity_group': 'LABEL_1', 'score': 0.4472111463546753, 'word': 'Ermeni', 'start': 15, 'end': 21}, {'entity_group': 'LABEL_2', 'score': 0.7423856854438782, 'word': 'lobisinden', 'start': 22, 'end': 32}, {'entity_group': 'LABEL_0', 'score': 0.9785003662109375, 'word': 'daha büyük lobi kurulmalı. Devlet destek olarak her sene gidiş dönüş uçak bileti ve 1000 $ vermeli gidenlere. Gidenler kazançlarının büyük bölümünü Türkiye de yatırım yapınca vergi ile verilen para geri alınır', 'start': 33, 'end': 243}]
