# **INSTALL AND IMPORT NECESSARY LIBRARIES**

In [None]:
# Install necessary packages
!pip install transformers datasets torch scikit-learn nltk unidecode emoji emoticon_fix

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from unidecode import unidecode
import emoji
from transformers import EarlyStoppingCallback
from emoticon_fix import emoticon_fix
from imblearn.over_sampling import SMOTE
import torch.nn.functional as F

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting emoticon_fix
  Downloading emoticon_fix-0.0.2.tar.gz (3.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3

# **BERT TRAINING WITH PREPROCESSINGS**

In [None]:
# **BERT TRAINING WITH PREPROCESSINGS**

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
snow_stemmer = SnowballStemmer(language='english')

# Function to preprocess text
def preprocess_text(text):
    if pd.isna(text):  # Handle missing text
        return ""
    text = unidecode(text)  # Convert accented characters
    text = emoji.demojize(text)  # Convert emojis
    text = emoticon_fix.emoticon_fix(text)
    words = text.split()
    words = [snow_stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return " ".join(words)

# Load dataset
file_path = "climaconvo.csv"
df = pd.read_csv(file_path)

# Rename columns for easier access
df.rename(columns={'Tweet': 'text', 'Hate Speech': 'label'}, inplace=True)

# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)

# Keep only classes (0 and 1)
df = df[df['label'].isin([0, 1])]

# Train-test split (keep original class proportions)
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define dataset class
class ClimateDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt")
        self.labels = torch.tensor(list(labels), dtype=torch.long)  # FIXED: Convert Series to list

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Load data (Ensure df_train and df_test are defined)
X_train = df_train[['text']]
y_train = df_train['label']

# Apply oversampling to balance the dataset
ros = RandomOverSampler(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = ros.fit_resample(X_train, y_train)

# Convert to lists for dataset creation
X_train_balanced = X_train_balanced['text'].tolist()
y_train_balanced = y_train_balanced.tolist()

# Compute class weights AFTER balancing
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train_balanced), y=y_train_balanced)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Create datasets
train_dataset = ClimateDataset(X_train_balanced, y_train_balanced, tokenizer)
val_dataset = ClimateDataset(df_test['text'], df_test['label'], tokenizer)

# Load pre-trained BERT model with increased dropout
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.config.hidden_dropout_prob = 0.3  # More dropout for generalization

# Define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=3e-4, momentum = 0.9)

# Define custom Trainer with Weighted Loss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,  # Mixed precision for faster training
    report_to="none",
    save_on_each_node=False,
    disable_tqdm=False
)

# Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if no improvement in 3 epochs
)

# Train model
trainer.train()

# Evaluate model
print("\nGenerating Classification Reports...\n" + "-" * 50)

# Train Set Classification Report
train_preds = trainer.predict(train_dataset).predictions
train_preds = np.argmax(train_preds, axis=1)
print("\nTrain Set Classification Report:")
print(classification_report(y_train_balanced, train_preds))

# Validation Set Classification Report
test_preds = trainer.predict(val_dataset).predictions
test_preds = np.argmax(test_preds, axis=1)
print("\nValidation Set Classification Report:")
print(classification_report(df_test['label'].tolist(), test_preds))

print("-" * 50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4168,0.378116,0.869837
2,0.3387,0.307075,0.878002
3,0.3568,0.315128,0.863112
4,0.3497,0.316927,0.85879
5,0.4335,0.306629,0.864553



Generating Classification Reports...
--------------------------------------------------



Train Set Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.92      0.85      7303
         1.0       0.91      0.75      0.82      7303

    accuracy                           0.83     14606
   macro avg       0.84      0.83      0.83     14606
weighted avg       0.84      0.83      0.83     14606




Validation Set Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93      1827
         1.0       0.50      0.64      0.56       255

    accuracy                           0.88      2082
   macro avg       0.72      0.78      0.75      2082
weighted avg       0.89      0.88      0.88      2082

--------------------------------------------------


# **FEATURE EXTRACTION+BiLSTM**

In [None]:
# Extract features from BERT
def extract_bert_features(model, dataloader, device):
    model.eval()
    features = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            features.append(hidden_states[:, 0, :].cpu().numpy())  # CLS token representation
    return np.vstack(features)

# Prepare DataLoader
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Extract features
train_features = extract_bert_features(model, train_dataloader, device)
val_features = extract_bert_features(model, val_dataloader, device)

# Normalize features
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)

# --------------------------------
# 🔹 Define Nested BiLSTM Model
# --------------------------------

class NestedBiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(NestedBiLSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # First BiLSTM layer
        self.bilstm1 = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True, dropout=dropout, bidirectional=True)

        # Subsequent BiLSTM layers
        self.bilstm_layers = nn.ModuleList([
            nn.LSTM(hidden_dim * 2, hidden_dim, num_layers=1, batch_first=True, dropout=dropout, bidirectional=True)
            for _ in range(num_layers - 1)
        ])

        # Fully connected layer (adjusted for bidirectional hidden size)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # Pass through first BiLSTM layer
        out, _ = self.bilstm1(x)

        # Pass through subsequent BiLSTM layers (nested)
        for bilstm in self.bilstm_layers:
            out, _ = bilstm(out)

        # Get the last time-step output
        out = out[:, -1, :]  # Take the last time step

        # Fully connected layer
        output = self.fc(out)
        return output

# --------------------------------
# 🔹 Define Dataset Class for LSTM
# --------------------------------

class LSTMFeatureDataset(Dataset):
    def __init__(self, features, labels):
        # Add an extra dimension for sequence length (1 in this case)
        self.features = torch.tensor(features, dtype=torch.float32).unsqueeze(1)  # Shape: (batch_size, 1, input_dim)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# --------------------------------
# 🔹 Main Pipeline
# --------------------------------
# Prepare Dataset for Nested BiLSTM
train_lstm_dataset = LSTMFeatureDataset(train_features, y_train_balanced)
val_lstm_dataset = LSTMFeatureDataset(val_features, df_test['label'].tolist())

train_loader = DataLoader(train_lstm_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_lstm_dataset, batch_size=batch_size, shuffle=False)

# Define parameters for Nested BiLSTM
bilstm_input_dim = train_features.shape[1]  # Feature size from BERT
bilstm_hidden_dim = 256
bilstm_output_dim = 2  # Multi-class classification (e.g., Support, Denial, Neutral)
bilstm_num_layers = 1  # Number of BiLSTM layers (nested)
bilstm_dropout = 0.5

# Initialize Nested BiLSTM model
nested_bilstm_model = NestedBiLSTM(bilstm_input_dim, bilstm_hidden_dim, bilstm_output_dim, bilstm_num_layers, bilstm_dropout).to(device)

# Define optimizer & loss function for BiLSTM
bilstm_criterion = nn.CrossEntropyLoss()
bilstm_optimizer = torch.optim.Adam(nested_bilstm_model.parameters(), lr=2e-5)

# Train Nested BiLSTM model
for epoch in range(12):
    nested_bilstm_model.train()
    total_loss = 0

    for batch_features, batch_labels in train_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        bilstm_optimizer.zero_grad()
        outputs = nested_bilstm_model(batch_features)
        loss = bilstm_criterion(outputs, batch_labels)
        loss.backward()
        bilstm_optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/12], Loss: {total_loss / len(train_loader):.4f}")

# --------------------------------
# 🔹 Evaluate Nested BiLSTM Model
# --------------------------------

nested_bilstm_model.eval()
all_features, all_labels = [], []

with torch.no_grad():
    for batch_features, batch_labels in val_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        # Get Nested BiLSTM output (features before classification layer)
        outputs = nested_bilstm_model(batch_features)

        # Collect features and labels
        all_features.extend(outputs.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# Print classification report after Nested BiLSTM
print("\nValidation Set Classification Report (After Nested BiLSTM):")
print(classification_report(all_labels, np.argmax(all_features, axis=1)))



Epoch [1/12], Loss: 0.3710
Epoch [2/12], Loss: 0.3101
Epoch [3/12], Loss: 0.2874
Epoch [4/12], Loss: 0.2683
Epoch [5/12], Loss: 0.2507
Epoch [6/12], Loss: 0.2341
Epoch [7/12], Loss: 0.2184
Epoch [8/12], Loss: 0.2036
Epoch [9/12], Loss: 0.1895
Epoch [10/12], Loss: 0.1762
Epoch [11/12], Loss: 0.1639
Epoch [12/12], Loss: 0.1525

Validation Set Classification Report (After Nested BiLSTM):
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1827
           1       0.61      0.71      0.66       255

    accuracy                           0.91      2082
   macro avg       0.78      0.83      0.80      2082
weighted avg       0.92      0.91      0.91      2082



# **MLP**

In [None]:
# --------------------------------
# 🔹 MLP Training
# --------------------------------

# Train MLP on BiLSTM features
mlp_classifier = MLPClassifier(hidden_layer_sizes=(1024,512, 256), max_iter=1000, random_state=42, solver='adam', learning_rate_init=1e-3)
mlp_classifier.fit(all_features, all_labels)  # Train on features, not labels and preds

# Predictions from MLP
y_pred_mlp = mlp_classifier.predict(all_features)  # Predict on the features extracted from BiLSTM

# Classification Report
print("\nValidation Set Classification Report for MLP:")
print(classification_report(all_labels, y_pred_mlp))  # Compare true labels and predicted labels

# Save Predictions to CSV
df_predictions = pd.DataFrame({
    "Actual": all_labels,  # Actual true labels
    "Predicted": y_pred_mlp  # Predicted labels from MLP
})
prediction_file_path = "12HS_predictions_bert.csv"
df_predictions.to_csv(prediction_file_path, index=False)

print(f"Predictions saved as {prediction_file_path}!")


Validation Set Classification Report for MLP:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1827
           1       0.99      0.54      0.70       255

    accuracy                           0.94      2082
   macro avg       0.96      0.77      0.83      2082
weighted avg       0.95      0.94      0.94      2082

Predictions saved as 12HS_predictions_bert.csv!
