In [22]:
# Install dependencies if not already installed
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import numpy as np



In [23]:
# ✅ Step 1: Load the Dataset
df = pd.read_csv("MisogynisticAttitudeDetection.csv/MisogynisticAttitudeDetection.csv")  # Change to your dataset file

# Map categorical labels to numeric values
attitude_mapping = {"Optimistic": 0, "Pessimistic": 1, "Neutral": 2}
df['SubTask1'] = df['SubTask1'].map(attitude_mapping)

# Remove NaN values (if any)
df = df.dropna(subset=['Comments', 'SubTask1'])

# ✅ Step 2: Convert Data to Hugging Face Dataset Format
dataset = Dataset.from_pandas(df[['Comments', 'SubTask1']])



In [24]:
# ✅ Step 3: Tokenize the Data
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    tokens = tokenizer(
        examples["Comments"],  
        padding="max_length",
        truncation=True,
        max_length=128
    )
    tokens["labels"] = [int(label) for label in examples["SubTask1"]]  # ✅ Ensure labels are integers
    return tokens
tokenized_datasets = dataset.map(tokenize_function, batched=True)




Map: 100%|██████████| 12698/12698 [00:00<00:00, 14929.96 examples/s]


In [25]:
# ✅ Step 4: Load Pre-trained BERT Model for Sequence Classification
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=3,
    problem_type="single_label_classification"  # ✅ Ensure model computes loss
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# ✅ Step 5: Define Training Parameters
training_args = TrainingArguments(
    output_dir="./misogyny_model",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [27]:


# ✅ Load accuracy metric
metric = evaluate.load("accuracy")



In [28]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)



In [30]:
import torch
print(torch.cuda.is_available())  # Should return True


False


In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move BERT model to GPU


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# ✅ Step 7: Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# ✅ Step 8: Save the Fine-tuned Model
model.save_pretrained("./misogyny_bert_model")
tokenizer.save_pretrained("./misogyny_bert_model")

print("Model training complete! 🎉")

Model training complete! 🎉
