In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('dataset.csv')
df = df.drop(columns=['annotaters'], errors='ignore')

In [2]:
def preprocess_data(df):
    df_clean = df.copy()
    
    df_clean['hate'] = df_clean['label'].map({'h': 1, 'nh': 0}) # binary hate, non-hate
    
    target_map = {'p': 0, 'e': 1, 'r': 2} # numeric mapping of target categories
    df_clean['target'] = df_clean['target'].str.lower().str.strip()
    
    # non-hate labels have no target
    df_clean['target'] = (
        df_clean['target']
        .map(target_map)
        .where(df_clean['target'].isin(target_map.keys()))
    )
    df_clean['target'] = df_clean['target'].fillna(-100).astype(int)
    
    invalid_hate_mask = (df_clean['hate'] == 1) & (df_clean['target'] == -100)
    df_clean.loc[invalid_hate_mask, 'hate'] = 0
    
    return df_clean

In [3]:
def validate_dataset(df):
    assert set(df['hate'].unique()).issubset({0, 1}), f"Invalid hate labels: {df['hate'].unique()}"
    
    valid_targets = {-100, 0, 1, 2}
    invalid_targets = set(df['target'].unique()) - valid_targets
    assert not invalid_targets, f"Invalid targets detected: {invalid_targets}"
    
    nh_mask = df['hate'] == 0
    assert (df.loc[nh_mask, 'target'] == -100).all(), "Non-hate samples have invalid targets"
    
    assert not df['text'].isna().any(), "NaN in sentence column"
    assert not df['hate'].isna().any(), "NaN in hate column"
    assert not df['target'].isna().any(), "NaN in target column"
    
    print("All dataset validation checks passed!")

In [4]:
df_clean = preprocess_data(df)

validate_dataset(df_clean)

All dataset validation checks passed!


In [5]:
len(df_clean)

3597

In [6]:
import torch
from transformers import AutoTokenizer, AutoModel, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import numpy as np

class TurkishHateSpeechDataset(Dataset):
    def __init__(self, texts, hate_labels, target_labels, tokenizer, max_len=128):
        self.texts = texts
        self.hate_labels = hate_labels
        self.target_labels = target_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'hate_labels': torch.tensor(self.hate_labels[idx], dtype=torch.float),
            'target_labels': torch.tensor(self.target_labels[idx], dtype=torch.long)
        }

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class TurkishHateBERT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-cased")
        self.hate_head = torch.nn.Linear(768, 1)
        self.target_head = torch.nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.hate_head(pooled_output), self.target_head(pooled_output)

In [8]:
def prepare_loaders(df, tokenizer, batch_size=16):
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['hate'])
    
    train_dataset = TurkishHateSpeechDataset(
        train_df['text'].values,
        train_df['hate'].values,
        train_df['target'].values,
        tokenizer
    )
    
    test_dataset = TurkishHateSpeechDataset(
        test_df['text'].values,
        test_df['hate'].values,
        test_df['target'].values,
        tokenizer
    )

    return DataLoader(train_dataset, batch_size=batch_size, shuffle=True), \
           DataLoader(test_dataset, batch_size=batch_size)

In [9]:
def evaluate_model(model, dataloader, device):
    model.eval()
    hate_preds = []
    hate_probs = []
    true_hate = []
    target_preds = []
    true_target = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            hate_logits, target_logits = model(input_ids, attention_mask)
            
            # Hate predictions
            batch_probs = torch.sigmoid(hate_logits.squeeze()).cpu().numpy()
            batch_preds = (batch_probs > 0.5).astype(int)
            
            hate_probs.extend(batch_probs)
            hate_preds.extend(batch_preds)
            true_hate.extend(batch['hate_labels'].cpu().numpy())
            
            # Target predictions
            target_probs = torch.softmax(target_logits, dim=1).cpu().numpy()
            batch_target_preds = np.argmax(target_probs, axis=1)
            
            target_preds.extend(batch_target_preds)
            true_target.extend(batch['target_labels'].cpu().numpy())

    # Filter target predictions for valid labels
    target_mask = np.array(true_target) != -100
    filtered_target_pred = np.array(target_preds)[target_mask]
    filtered_true_target = np.array(true_target)[target_mask]

    return {
        'true_hate': true_hate,
        'pred_hate': hate_preds,
        'true_target': filtered_true_target,
        'pred_target': filtered_target_pred
    }

In [13]:
import nlpaug.augmenter.word as naw
import random

turkish_augmenter = naw.ContextualWordEmbsAug(
    model_path='bert-base-multilingual-cased',
    action="substitute",
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [14]:
def augment_sentences(df, augmenter, aug_p=0.3, max_aug_per_sample=1):
    augmented_rows = []

    for i, row in df.iterrows():
        if row['hate'] == 1 and random.random() < aug_p:
            try:
                for _ in range(max_aug_per_sample):
                    aug_text = augmenter.augment(row['text'])
                    if isinstance(aug_text, list):
                        aug_text = aug_text[0]  # grab first if list returned
                    new_row = row.copy()
                    new_row['text'] = aug_text
                    augmented_rows.append(new_row)
            except Exception as e:
                print(f"Augmentation failed for row {i}: {e}")
                continue

    if augmented_rows:
        aug_df = pd.DataFrame(augmented_rows)
        print(f"Added {len(aug_df)} augmented samples.")
        return pd.concat([df, aug_df], ignore_index=True)
    else:
        return df

In [15]:
df_augmented = augment_sentences(df_clean, turkish_augmenter)

Added 513 augmented samples.


In [16]:
df_augmented[['text', 'hate']].sample(5)

Unnamed: 0,text,hate
440,Gerçek bu kabullensenizde etmesenizde Akpnn es...,1
2255,Ya ne saçmalıyorsun amk üşüşmüş buraya sizin g...,1
2314,"> KK'yı dinleye dinleye beyinleriniz uyuşmuş ,...",1
3929,"in. As as the way back, ben no gibi gözükmüyor...",1
842,Bir de hep gördüğüm şey Kürdistan nErEsİ yA bE...,1


In [17]:
len(df_augmented)

4110

In [18]:
df_augmented.to_csv("augmented_dataset.csv", index=False)

In [27]:
train_df, val_df = train_test_split(
    df_augmented, 
    test_size=0.2, 
    stratify=df_augmented['hate'], 
    random_state=42
)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")

Train size: 3288, Validation size: 822


In [28]:
def prepare_loaders_augmented(train_df, val_df, tokenizer, batch_size=16):
    train_dataset = TurkishHateSpeechDataset(
        train_df['text'].values,
        train_df['hate'].values,
        train_df['target'].values,
        tokenizer
    )
    
    val_dataset = TurkishHateSpeechDataset(
        val_df['text'].values,
        val_df['hate'].values,
        val_df['target'].values,
        tokenizer
    )
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    return train_loader, val_loader

In [32]:
def add_hard_negatives(model, df_train, tokenizer, device, threshold=0.3, max_add=100):
    model.eval()
    non_hate_df = df_train[df_train['hate'] == 0].copy()

    texts = non_hate_df['text'].tolist()
    hate_preds = []
    
    with torch.no_grad():
        for i in range(0, len(texts), 32):
            batch_texts = texts[i:i+32]
            encodings = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            hate_logits, _ = model(input_ids, attention_mask)
            probs = torch.sigmoid(hate_logits).detach().cpu().numpy().flatten()
            hate_preds.extend(probs)
    
    non_hate_df['hate_prob'] = hate_preds
    hard_negatives = non_hate_df[non_hate_df['hate_prob'] > threshold].copy()

    hard_negatives = hard_negatives.sample(min(len(hard_negatives), max_add), random_state=42)
    
    print(f"Adding {len(hard_negatives)} hard negatives to training data")

    df_new_train = pd.concat([df_train, hard_negatives], ignore_index=True)
    return df_new_train

In [33]:
def train_model_with_hard_neg(model, train_df, val_df, tokenizer, device, epochs=4, batch_size=16, lr=2e-5, threshold=0.3):
    optimizer = AdamW(model.parameters(), lr=lr)
    hate_criterion = torch.nn.BCEWithLogitsLoss()
    target_criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)
    
    model.to(device)
    
    for epoch in range(epochs):
        train_loader, val_loader = prepare_loaders_augmented(train_df, val_df, tokenizer, batch_size)
        
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            hate_labels = batch['hate_labels'].to(device)
            target_labels = batch['target_labels'].to(device)
            
            hate_logits, target_logits = model(input_ids, attention_mask)
            
            hate_loss = hate_criterion(hate_logits.squeeze(), hate_labels)
            
            target_mask = (target_labels != -100)
            if target_mask.any():
                target_loss = target_criterion(target_logits[target_mask], target_labels[target_mask])
            else:
                target_loss = torch.tensor(0.0).to(device)
            
            loss = hate_loss + target_loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs} - Train loss: {total_loss/len(train_loader):.4f}")
        
        val_metrics = evaluate_model(model, val_loader, device)
        print("\nValidation Metrics:")
        print(classification_report(val_metrics['true_hate'], val_metrics['pred_hate'], target_names=['Non-Hate', 'Hate']))
        print("\nTarget Classification (Hate Cases Only):")
        print(classification_report(val_metrics['true_target'], val_metrics['pred_target'], target_names=['Politics', 'Ethnicity', 'Religion']))
        
        # Add hard negatives for the next epoch training
        train_df = add_hard_negatives(model, train_df, tokenizer, device, threshold=threshold)
    
    return model


In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
model = TurkishHateBERT()

trained_model = train_model_with_hard_neg(
    model,
    train_df,  # (augmented + original)
    val_df,    
    tokenizer,
    device,
    epochs=4,
    batch_size=16,
    lr=2e-5,
    threshold=0.3
)



Epoch 1/4 - Train loss: 1.3963

Validation Metrics:
              precision    recall  f1-score   support

    Non-Hate       0.76      0.59      0.67       392
        Hate       0.69      0.83      0.75       430

    accuracy                           0.72       822
   macro avg       0.73      0.71      0.71       822
weighted avg       0.72      0.72      0.71       822


Target Classification (Hate Cases Only):
              precision    recall  f1-score   support

    Politics       0.88      0.68      0.77       164
   Ethnicity       0.82      0.79      0.80       187
    Religion       0.56      0.87      0.68        79

    accuracy                           0.76       430
   macro avg       0.75      0.78      0.75       430
weighted avg       0.80      0.76      0.77       430

Adding 100 hard negatives to training data
Epoch 2/4 - Train loss: 0.8914

Validation Metrics:
              precision    recall  f1-score   support

    Non-Hate       0.73      0.82      0.77     

In [35]:
_, val_loader = prepare_loaders_augmented(train_df, val_df, tokenizer, batch_size=16)
final_metrics = evaluate_model(trained_model, val_loader, device)

In [36]:
from sklearn.metrics import classification_report

print("Hate Speech Detection (Binary):")
print(classification_report(final_metrics['true_hate'], final_metrics['pred_hate'], target_names=['Non-Hate', 'Hate']))

print("\nHate Target Classification (Multi-class, on hate samples only):")
print(classification_report(final_metrics['true_target'], final_metrics['pred_target'], target_names=['Politics', 'Ethnicity', 'Religion']))

Hate Speech Detection (Binary):
              precision    recall  f1-score   support

    Non-Hate       0.78      0.81      0.80       392
        Hate       0.82      0.79      0.81       430

    accuracy                           0.80       822
   macro avg       0.80      0.80      0.80       822
weighted avg       0.80      0.80      0.80       822


Hate Target Classification (Multi-class, on hate samples only):
              precision    recall  f1-score   support

    Politics       0.82      0.89      0.86       164
   Ethnicity       0.89      0.80      0.84       187
    Religion       0.78      0.84      0.80        79

    accuracy                           0.84       430
   macro avg       0.83      0.84      0.83       430
weighted avg       0.84      0.84      0.84       430



In [37]:
from sklearn.metrics import precision_recall_fscore_support

p, r, f1, _ = precision_recall_fscore_support(final_metrics['true_hate'], final_metrics['pred_hate'], average='macro')
print(f"Macro F1 for Hate Detection: {f1:.4f}")

p_t, r_t, f1_t, _ = precision_recall_fscore_support(final_metrics['true_target'], final_metrics['pred_target'], average='macro')
print(f"Macro F1 for Target Classification: {f1_t:.4f}")

Macro F1 for Hate Detection: 0.8004
Macro F1 for Target Classification: 0.8335


In [40]:
torch.save(trained_model.state_dict(), 'turkish_hate_bert_with_hard_negatives.pth')

In [41]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

In [43]:
model_baseline = TurkishHateBERT()
model_baseline.load_state_dict(torch.load("dilbert_model_vanilla.pth", map_location=torch.device('cpu')))
model_baseline.to(device)

  model_baseline.load_state_dict(torch.load("dilbert_model_vanilla.pth", map_location=torch.device('cpu')))


TurkishHateBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [44]:
final_metrics_baseline = evaluate_model(model_baseline, val_loader, device)

In [45]:
from sklearn.metrics import classification_report, precision_recall_fscore_support

print("Hate Speech Detection (Binary):")
print(classification_report(final_metrics_baseline['true_hate'], final_metrics_baseline['pred_hate'], target_names=['Non-Hate', 'Hate']))

print("\nHate Target Classification (Multi-class, on hate samples only):")
print(classification_report(final_metrics_baseline['true_target'], final_metrics_baseline['pred_target'], target_names=['Politics', 'Ethnicity', 'Religion']))

p, r, f1, _ = precision_recall_fscore_support(final_metrics_baseline['true_hate'], final_metrics_baseline['pred_hate'], average='macro')
print(f"Macro F1 for Hate Detection: {f1:.4f}")

p_t, r_t, f1_t, _ = precision_recall_fscore_support(final_metrics_baseline['true_target'], final_metrics_baseline['pred_target'], average='macro')
print(f"Macro F1 for Target Classification: {f1_t:.4f}")

Hate Speech Detection (Binary):
              precision    recall  f1-score   support

    Non-Hate       0.88      0.97      0.92       392
        Hate       0.97      0.88      0.92       430

    accuracy                           0.92       822
   macro avg       0.92      0.92      0.92       822
weighted avg       0.93      0.92      0.92       822


Hate Target Classification (Multi-class, on hate samples only):
              precision    recall  f1-score   support

    Politics       0.94      0.94      0.94       164
   Ethnicity       0.95      0.95      0.95       187
    Religion       0.92      0.92      0.92        79

    accuracy                           0.94       430
   macro avg       0.94      0.94      0.94       430
weighted avg       0.94      0.94      0.94       430

Macro F1 for Hate Detection: 0.9221
Macro F1 for Target Classification: 0.9384
