In [None]:
!pip install --upgrade transformers datasets accelerate
import transformers
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split


In [None]:
from google.colab import drive
# Mount Google Drive so files under MyDrive are reachable at /content/drive.
drive.mount('/content/drive')

# Read the CSV, keep only the three columns used by later cells, drop rows
# with any missing value in them, and cast the label column to int.
df = (
    pd.read_csv('/content/drive/MyDrive/train_with_topics.csv')
    .loc[:, ['text', 'topic_label', 'hate_label']]
    .dropna()
    .astype({'hate_label': int})
)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # precision_recall_fscore_support returns (precision, recall, f-score, support);
    # the support entry is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores.update(zip(('precision', 'recall', 'f1'), prf[:3]))
    return scores

In [None]:
def combine_text(row):
    """Build the model input string for one row by prefixing the text with its topic.

    NOTE(review): the body of this function was missing in the notebook (the
    ``def`` line was followed directly by a top-level statement, which is a
    syntax error). The ``"<topic_label> [SEP] <text>"`` layout below is a
    reconstruction -- confirm it matches the format intended for training and
    use the same format at inference time.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            ``'topic_label'`` and ``'text'``.

    Returns:
        str: the topic label and the text joined by ``" [SEP] "``.
    """
    return f"{row['topic_label']} [SEP] {row['text']}"
df['input_text'] = df.apply(combine_text, axis=1)

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Convert each split to a Dataset holding only the model input and the label.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[['input_text', 'hate_label']])
    for frame in (train_df, val_df)
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(example):
    """Tokenize the ``input_text`` field of ``example`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        example: mapping with an ``"input_text"`` entry. The notebook calls
            this through ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for ``example["input_text"]``.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(example["input_text"], **encoding_options)

# Training split: tokenize in batches, expose the label column under the
# name "labels", then switch the output format to torch.
train_dataset = train_dataset.map(tokenize, batched=True).rename_column("hate_label", "labels")
train_dataset.set_format("torch")

# Validation split: same three steps.
val_dataset = val_dataset.map(tokenize, batched=True).rename_column("hate_label", "labels")
val_dataset.set_format("torch")

# Load the pretrained "bert-base-uncased" checkpoint as a sequence classifier
# with two output labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


# Training configuration: 3 epochs, batch size 8 per device for both training
# and evaluation, a log entry every 100 steps, and a checkpoint every 500
# steps (only the most recent checkpoint is kept).
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/hate_model_conditional",
    do_eval=True,
    # eval_steps only takes effect when the evaluation strategy is "steps";
    # without it no evaluation runs during training, regardless of do_eval.
    eval_strategy="steps",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="/content/logs",
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    save_total_limit=1
)

# Wire the model, the datasets and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune, then run an evaluation pass on the validation split. As the
# last expression of the cell, the metrics dict is displayed as its output.
trainer.train()
trainer.evaluate()


In [None]:
# Write the model and then the tokenizer into the same Drive folder, so the
# directory holds both.
for artifact in (model, tokenizer):
    artifact.save_pretrained("/content/drive/MyDrive/hate_model_conditional")
