# In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from transformers import BatchEncoding
from torch.utils.data import Dataset
import numpy as np

# Read the labelled examples into a DataFrame.
CSV_PATH = 'C:/Users/gangi/OneDrive/Desktop/DL/HateSpeechData.csv'
df = pd.read_csv(CSV_PATH)

# Build a two-class sequence classifier and its tokenizer from the same
# pretrained checkpoint so that vocabulary and weights match.
checkpoint = "distilbert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    The text is read from the first column and the label from the second
    column of the frame (both by position). Labels are binarized: 0 stays
    0 and every other value becomes 1.
    """

    def __init__(self, df, tokenizer, max_length):
        """Store the data source and the tokenization settings.

        Args:
            df: DataFrame with the text in column 0 and the label in column 1.
            tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer)
                returning a mapping with "input_ids" and "attention_mask".
            max_length: Token length every example is padded/truncated to.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return it as model-ready tensors.

        Args:
            idx: Positional row index into the DataFrame.

        Returns:
            A dict with 1-D "input_ids" and "attention_mask" tensors and a
            scalar ``torch.long`` "labels" tensor holding 0 or 1.
        """
        # str() guards against non-string cells (numbers, missing values).
        text = str(self.df.iloc[idx, 0])
        raw_label = self.df.iloc[idx, 1]

        # Collapse the label to two classes: 0 stays 0, anything else is 1.
        label = 0 if raw_label == 0 else 1

        # Call the tokenizer directly; `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension; flatten it away
        # so the DataLoader can stack individual examples itself.
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Wrap the DataFrame so each row is tokenized on access; every example is
# padded or truncated to 512 tokens.
dataset = HateSpeechDataset(df=df, tokenizer=tokenizer, max_length=512)

# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the index of its largest logit along the last axis.

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples
        whose predicted class equals the label.
    """
    scores, targets = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    hits = np.equal(predicted_classes, targets).sum()
    return {"accuracy": hits / len(targets)}

# Hold out part of the data for evaluation. Scoring the model on the same
# examples it was trained on reports an inflated accuracy.
EVAL_FRACTION = 0.2
if len(dataset) < 2:
    raise ValueError("Need at least two examples to build train/eval splits")
eval_size = max(1, int(len(dataset) * EVAL_FRACTION))
train_size = len(dataset) - eval_size
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, eval_size],
    generator=torch.Generator().manual_seed(42),  # reproducible split
)

# Train the model
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever keyword the installed version exposes.
strategy_kwarg = (
    "eval_strategy"
    if hasattr(TrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=1,
    **{strategy_kwarg: "epoch"},  # evaluate once at the end of every epoch
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be loaded again with `from_pretrained`.
model.save_pretrained("CustomModels/CustomHamSpam")
tokenizer.save_pretrained("CustomModels/CustomHamSpam")