<a href="https://colab.research.google.com/github/amitskb10/amit1/blob/main/Untitled21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load dataset
# pandas is imported in this cell as well because the cell sits above the
# notebook's main import cell; without it a fresh "Run All" stops here with a
# NameError on `pd`.
import pandas as pd

df = pd.read_csv("/content/datac.csv")

In [None]:
# Step 1: Install dependencies
# =============================
!pip install transformers datasets scikit-learn torch --quiet

In [None]:
!pip install -U transformers --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m446.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Step 2: Import Libraries
# =============================
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [None]:
# Step 3: Load Dataset
# =============================
# Read the CSV (adjust the path if needed) and discard every row that has a
# missing value in any column; the last line previews the first rows.
df = pd.read_csv("/content/datac.csv").dropna()
df.head()


Unnamed: 0,Comment,Target
0,Professor from money heist,0
1,Now the true Islamic spirit is released Allah ...,0
2,Nonsense the meaning of the greeting is May go...,0
3,Jaisi dalli police waisa unka liear kamal ka K...,1
4,Democracy on peak,0


In [None]:
# Check dataset info: print how many rows carry each value of the Target column
# (a class-balance check before the data is split).
print(df['Target'].value_counts())

Target
0    2451
1    1879
Name: count, dtype: int64


In [None]:
# Step 4: Dataset Class
# =============================
class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for classification.

    Args:
        comments: Iterable of comment texts (list, tuple, pandas Series, ...).
        targets: Iterable of labels aligned one-to-one with ``comments``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            max_length=max_len, padding="max_length", truncation=True,
            return_tensors='pt')``; its result must be indexable by
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``comments`` and ``targets`` differ in length.
    """

    def __init__(self, comments, targets, tokenizer, max_len=128):
        # Materialise both inputs as plain lists so that __getitem__ always
        # indexes by position. A pandas Series whose index has gaps (for example
        # after dropna()) would otherwise be looked up by label and raise
        # KeyError for positions that are no longer present in the index.
        self.comments = list(comments)
        self.targets = list(targets)
        if len(self.comments) != len(self.targets):
            raise ValueError(
                f"comments and targets must have the same length "
                f"(got {len(self.comments)} and {len(self.targets)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments held by the dataset."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Tokenize the comment at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'labels'`` (the target as a
            ``torch.long`` tensor).
        """
        # str() guards against non-string cells (e.g. numbers) in the text column.
        text = str(self.comments[idx])
        labels = self.targets[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch dimension; flatten() drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# =============================

In [None]:
# Step 5: Train-Test Split
# =============================
# Hold out 20% of the rows as the test split. The split is stratified on Target
# and uses a fixed random_state so it is the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Comment"].tolist(),
    df["Target"].tolist(),
    test_size=0.2,
    stratify=df["Target"],
    random_state=42,
)

# =============================

In [None]:
# =============================
# Step 6: Define Function to Train and Evaluate
# =============================
def train_and_evaluate(model_name):
    """Fine-tune ``model_name`` as a 2-label classifier and print test metrics.

    Builds train/test ``HateSpeechDataset`` objects from the notebook-level
    ``train_texts``/``train_labels`` and ``test_texts``/``test_labels``, trains
    for two epochs with evaluation and checkpointing once per epoch, then prints
    a classification report and a confusion matrix for the test split.

    Note: a later cell in this notebook defines
    ``train_and_evaluate(model_name, alias)``; once that cell has run it
    replaces this one-argument version.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForSequenceClassification.from_pretrained``; also used
            as the sub-directory of ``./results``.

    Returns:
        None. Results are printed.
    """
    import inspect  # local import: only needed for the keyword check below

    print(f"\n===== Training {model_name} =====")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
    test_dataset = HateSpeechDataset(test_texts, test_labels, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # The per-epoch evaluation keyword is called `eval_strategy` in recent
    # transformers releases and `evaluation_strategy` in older ones. Use
    # whichever name the installed TrainingArguments accepts, so the call does
    # not fail with an unexpected-keyword TypeError.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_strategy="epoch",
        logging_dir='./logs',
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        **{eval_key: "epoch"},
    )

    def compute_metrics(p):
        """Return accuracy, precision, recall and F1 (binary average) for one evaluation pass."""
        preds = p.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
        acc = accuracy_score(p.label_ids, preds)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    # NOTE(review): the test split doubles as the evaluation set here, so the
    # "best" checkpoint is selected on the same data that is reported below.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    preds = trainer.predict(test_dataset)
    y_true = test_labels
    y_pred = preds.predictions.argmax(-1)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# =============================

In [None]:
import os
# Switch off the Weights & Biases integration through its environment variable.
# The Trainer output further down in this notebook reports this variable as
# deprecated and points to the `report_to` setting as the replacement.
os.environ.update(WANDB_DISABLED="true")


In [None]:
# NOTE(review): this only binds a notebook-level variable; no cell visible in
# this notebook reads it. To influence a Trainer the value has to be passed as
# the `report_to` keyword of TrainingArguments.
report_to = "none"


In [None]:
# Notebook-level TrainingArguments (train_and_evaluate builds its own copy).
# `alias` is only bound by the model loop in a later cell, so fall back to a
# neutral directory name when this cell runs on a fresh kernel instead of
# stopping with a NameError.
training_args = TrainingArguments(
    output_dir=f'./results/{globals().get("alias", "scratch")}',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=500,
    logging_dir='./logs',
    logging_steps=50,
    report_to="none"   # <- disable wandb
)


In [None]:
def train_and_evaluate(model_name, alias):
    """Fine-tune one pretrained checkpoint and record its test-set metrics.

    Builds train/test ``HateSpeechDataset`` objects from the notebook-level
    ``train_texts``/``train_labels`` and ``test_texts``/``test_labels``, trains
    a 2-label sequence classifier for two epochs, prints a classification
    report and confusion matrix for the test split, and appends one summary
    row to the notebook-level list ``results_summary``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForSequenceClassification.from_pretrained``.
        alias: Short display name; used in the log header, as the
            sub-directory of ``./results`` and as the ``"Model"`` value of the
            summary row.

    Returns:
        dict: The row appended to ``results_summary`` (keys ``Model``,
        ``Accuracy``, ``Precision``, ``Recall``, ``F1``).
    """
    print(f"\n===== Training {alias} ({model_name}) =====")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
    test_dataset = HateSpeechDataset(test_texts, test_labels, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # OLD Transformers compatibility: no evaluation_strategy or save_strategy
    training_args = TrainingArguments(
        output_dir=f'./results/{alias}',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_steps=500,          # instead of save_strategy
        logging_dir='./logs',
        logging_steps=50,
        # Disable external experiment loggers explicitly; the WANDB_DISABLED
        # environment variable is reported as deprecated by the Trainer.
        report_to="none",
    )

    # custom metric calculation
    def compute_metrics(pred):
        """Return accuracy, precision, recall and F1 (binary average) for one prediction pass."""
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(pred.label_ids, preds, average='binary')
        acc = accuracy_score(pred.label_ids, preds)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    preds = trainer.predict(test_dataset)
    y_true = test_labels
    y_pred = preds.predictions.argmax(-1)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    acc = accuracy_score(y_true, y_pred)

    row = {
        "Model": alias,
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }
    # The summary cell reads the notebook-level list `results_summary`. Create
    # it on first use so a finished training run is not lost to a NameError
    # when no earlier cell has defined the list; an existing list is reused.
    globals().setdefault("results_summary", []).append(row)
    return row


In [None]:
# Notebook-level TrainingArguments (train_and_evaluate builds its own copy).
# `alias` is only bound by the model loop in a later cell, so fall back to a
# neutral directory name when this cell runs on a fresh kernel instead of
# stopping with a NameError.
training_args = TrainingArguments(
    output_dir=f'./results/{globals().get("alias", "scratch")}',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=500,          # old style instead of save_strategy
    logging_dir='./logs',
    logging_steps=50,
    # The deprecation message printed for WANDB_DISABLED recommends `report_to`
    # for controlling logging integrations, so set it explicitly.
    report_to="none",
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Step 7: Run All Models
# =============================
# Display alias -> Hugging Face checkpoint identifier
models = {
    "BERT": "bert-base-uncased",
    "DistilBERT": "distilbert-base-uncased",
    "ALBERT": "albert-base-v2",
    "HateBERT": "GroNLP/hateBERT",
    "mBERT": "bert-base-multilingual-uncased",
    "XLM-RoBERTa": "xlm-roberta-base"
}

# Start from an empty list every time this cell runs: train_and_evaluate appends
# one metrics row per model to `results_summary`, and the summary cell builds
# its table from it. Resetting here avoids a NameError on a fresh kernel and
# duplicate rows when the cell is re-run.
results_summary = []

for alias, model_name in models.items():
    train_and_evaluate(model_name, alias)

# =============================


===== Training BERT (bert-base-uncased) =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,0.6624
100,0.6617
150,0.657


In [None]:
# =============================
# Gather the per-model metric rows into one table and print it under a header.
summary_df = pd.DataFrame(results_summary)
print("\n===== Final Results Summary =====", summary_df, sep="\n")

# Optional: keep a CSV copy of the table (row index omitted)
summary_df.to_csv("model_results_summary.csv", index=False)