In [20]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report



In [21]:
# Load dataset
file_path = "MisogynisticAttitudeDetection.csv/MisogynisticAttitudeDetection.csv"
df = pd.read_csv(file_path)



In [22]:
# Assuming dataset has 'comment' and 'label' columns
df.head(5)


Unnamed: 0,Identifier,Comments,SubTask1,SubTask2
0,674,Us society me aurat ko izzat kese mil sakti ha...,Pessimistic,Criticism
1,11747,Ye patna ko badnam krne ki kosis n kijiye poli...,Pessimistic,Criticism
2,2811,Dusro ki bahan ke sath wo vyawahar kabhi na kr...,Optimistic,Suggestion
3,587,Ye sab ke sakkar main mat aao. Ladkiyo ne haza...,Optimistic,
4,6473,Yehi hai apna sachai ladai hai ✒,Neutral,


In [23]:
df=df[['Comments', 'SubTask1', 'SubTask2']]

In [24]:
df=df.dropna(subset=['SubTask1'])
df=df.dropna(subset=['SubTask2'])
df.head()


Unnamed: 0,Comments,SubTask1,SubTask2
0,Us society me aurat ko izzat kese mil sakti ha...,Pessimistic,Criticism
1,Ye patna ko badnam krne ki kosis n kijiye poli...,Pessimistic,Criticism
2,Dusro ki bahan ke sath wo vyawahar kabhi na kr...,Optimistic,Suggestion
5,Yes Bhai hun Youth hi Youth ko sudhar sakte ha...,Optimistic,Suggestion
8,"Oh beti , ladkiya bhi r@pe karti hai kisne kah...",Pessimistic,Criticism


In [25]:
attitude_mapping = {"Optimistic": 0, "Pessimistic": 1, "Neutral": 2}
category_mapping = {"Appreciation": 0, "Suggestion": 1, "Criticism": 2, "Offensive": 3, "None": 4}
df['SubTask1'] = df['SubTask1'].map(attitude_mapping)
df['SubTask2'] = df['SubTask2'].map(category_mapping)
df.head()

Unnamed: 0,Comments,SubTask1,SubTask2
0,Us society me aurat ko izzat kese mil sakti ha...,1,2
1,Ye patna ko badnam krne ki kosis n kijiye poli...,1,2
2,Dusro ki bahan ke sath wo vyawahar kabhi na kr...,0,1
5,Yes Bhai hun Youth hi Youth ko sudhar sakte ha...,0,1
8,"Oh beti , ladkiya bhi r@pe karti hai kisne kah...",1,2


In [26]:
# Text preprocessing function
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove punctuation & numbers
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df['cleaned_comment'] = df['Comments'].apply(clean_text)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhakt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Split data for SubTask1 (Optimistic, Pessimistic, Neutral)
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_comment'], df['SubTask1'], test_size=0.2, random_state=42)



In [28]:
# Convert text into numerical vectors (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [29]:
# Train Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Predict
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evaluation
print("Logistic Regression Results (SubTask1 - Attitude Detection):\n", classification_report(y_test, y_pred_lr))


Logistic Regression Results (SubTask1 - Attitude Detection):
               precision    recall  f1-score   support

           0       0.68      0.15      0.24       201
           1       0.82      0.98      0.89       794

    accuracy                           0.81       995
   macro avg       0.75      0.57      0.57       995
weighted avg       0.79      0.81      0.76       995



In [30]:
# Train SVM Model
svm_model = SVC(kernel="linear")
svm_model.fit(X_train_tfidf, y_train)

# Predict
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluation
print("SVM Results (SubTask1 - Attitude Detection):\n", classification_report(y_test, y_pred_svm))


SVM Results (SubTask1 - Attitude Detection):
               precision    recall  f1-score   support

           0       0.70      0.32      0.44       201
           1       0.85      0.96      0.90       794

    accuracy                           0.83       995
   macro avg       0.77      0.64      0.67       995
weighted avg       0.82      0.83      0.81       995



In [31]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Convert dataset into tokenized format
class CommentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts  # Store text comments
        self.labels = labels  # Store labels (0,1,2 for classification)
    
    def __len__(self):
        return len(self.texts)  # Return total dataset size
    
    def __getitem__(self, idx):
        encoded_text = tokenizer(
            str(self.texts[idx]),  # Ensure text is a string
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        return {key: val.squeeze() for key, val in encoded_text.items()}, torch.tensor(self.labels[idx])

# ✅ Ensure `X_train` and `y_train` are in the correct format
if isinstance(X_train, pd.Series):  
    X_train = X_train.astype(str).tolist()  # Convert Series to list of strings
elif isinstance(X_train, list):  
    X_train = [str(text) for text in X_train]  # Ensure all values are strings

if isinstance(y_train, pd.Series):  
    y_train = y_train.tolist()  # Convert Series to list

# ✅ Remove any NaN values (if present)
X_train = [text if text else " " for text in X_train]  # Replace NaN with empty string

# Prepare dataset
train_dataset = CommentDataset(X_train, y_train)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Print a sample batch (For Debugging)
for batch in train_dataloader:
    print(batch)
    break


[{'input_ids': tensor([[  101,  1038, 10932,  ...,     0,     0,     0],
        [  101,  5292,  1046,  ...,     0,     0,     0],
        [  101,  6768, 24547,  ...,     0,     0,     0],
        ...,
        [  101,  5003,  2213,  ...,  2039,  2121,   102],
        [  101,  5378,  4424,  ...,     0,     0,     0],
        [  101,  8670,  4017,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}, tensor([1, 1, 1, 0, 1, 1, 1, 0])]


In [32]:
import torch
from torch.optim import AdamW
from transformers import BertForSequenceClassification

# Load model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.train()  # Set model to training mode

# Define optimizer & loss function
optimizer = AdamW(model.parameters(), lr=5e-5)  # Learning rate = 5e-5
loss_fn = torch.nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
from transformers import BertForSequenceClassification

# Load BERT with a classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from tqdm import tqdm  # Progress bar

epochs = 3  # Number of training epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
model.to(device)

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    loop = tqdm(train_dataloader, leave=True)
    
    for batch in loop:
        optimizer.zero_grad()  # Reset gradients

        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch[1].to(device)  # Labels

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Loss: {loss.item():.4f}")


Epoch 1/3


  0%|          | 0/498 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not str