<a href="https://colab.research.google.com/github/adnanbaqi/nlp_based_disaster_tweets/blob/master/NLP_BASED_DISASTER_TWEET_CLASSIFICATIONSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model development begins here.

In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install flash-attn
!pip install torch pandas numpy scikit-learn tqdm


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-e_k3mmzz
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-e_k3mmzz
  Resolved https://github.com/huggingface/transformers.git to commit 24c91f095fec4d90fa6901ef17146b4f4c21d0a3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm

# ModernBERT + GloVe Hybrid Model
class ModernBertGloveHybrid(nn.Module):
    """Two-class classifier that fuses a transformer encoding with averaged GloVe vectors.

    The encoder's hidden state at sequence position 0 is concatenated with the
    mean GloVe vector of the raw text, projected back to the encoder hidden
    size by ``fusion`` and classified by a small MLP producing 2 logits.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        freeze_bert: When true, encoder parameters are excluded from gradient
            updates (``requires_grad = False``).
    """

    def __init__(self, model_name='answerdotai/ModernBERT-base', freeze_bert=False):
        super().__init__()

        # ModernBERT components
        self.bert = AutoModel.from_pretrained(model_name)
        self.bert_hidden_size = self.bert.config.hidden_size

        # GloVe components; the lookup table is filled by initialize_glove().
        self.glove_dim = 100
        self.glove_embeddings = {}

        # Fusion and classification layers
        self.fusion = nn.Linear(self.bert_hidden_size + self.glove_dim, self.bert_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert_hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 2)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def initialize_glove(self, glove_path):
        """Populate ``self.glove_embeddings`` from a GloVe text file.

        Each non-blank line is parsed as ``word v1 v2 ...``. Two special
        entries are added: ``[PAD]`` (all zeros) and ``[UNK]`` (mean of the
        loaded word vectors). The table is replaced only after the whole file
        has been parsed, so a failed load leaves the previous table intact.

        Args:
            glove_path: Path to the embeddings file. A falsy value skips the
                file and installs a table holding only ``[PAD]`` and a random
                ``[UNK]`` vector.

        Raises:
            FileNotFoundError: If the file cannot be opened, decoded or
                parsed, or a vector does not have ``self.glove_dim`` values.
                The underlying error is attached as ``__cause__``.
        """
        print("Loading GloVe embeddings...")
        if not glove_path:
            self.glove_embeddings = {
                '[PAD]': np.zeros(self.glove_dim, dtype='float32'),
                '[UNK]': np.random.randn(self.glove_dim).astype('float32'),
            }
            print("Using random GloVe embeddings.")
            return

        table = {}
        try:
            with open(glove_path, 'r', encoding='utf-8') as f:
                for line in f:
                    values = line.split()
                    if not values:
                        # Blank line (e.g. trailing newline): nothing to parse.
                        continue
                    vector = np.asarray(values[1:], dtype='float32')
                    if vector.shape[0] != self.glove_dim:
                        raise ValueError(
                            f"expected {self.glove_dim} values for {values[0]!r}, "
                            f"got {vector.shape[0]}"
                        )
                    table[values[0]] = vector
        except (OSError, ValueError) as e:
            print(f"Error loading GloVe embeddings: {e}")
            # FileNotFoundError is kept as the raised type because callers
            # catch it to fall back to random embeddings.
            raise FileNotFoundError(
                f"Could not load GloVe embeddings from {glove_path!r}; "
                "please provide a valid file path."
            ) from e

        # Compute [UNK] before adding [PAD] so the zero vector does not bias the mean.
        if table:
            unknown = np.mean(list(table.values()), axis=0)
        else:
            unknown = np.zeros(self.glove_dim, dtype='float32')
        table['[PAD]'] = np.zeros(self.glove_dim, dtype='float32')
        table['[UNK]'] = unknown
        self.glove_embeddings = table
        print("GloVe embeddings loaded successfully!")

    def get_glove_embedding(self, text):
        """Return the mean GloVe vector of the whitespace tokens in ``text``.

        The text is lower-cased and split on whitespace; tokens missing from
        the table use the ``[UNK]`` vector (zeros if the table has no
        ``[UNK]`` entry yet).

        Returns:
            A float32 tensor of shape ``(glove_dim,)``; all zeros when the
            text contains no tokens.
        """
        unknown = self.glove_embeddings.get('[UNK]')
        if unknown is None:
            # Table not initialised: fall back to a neutral vector instead of KeyError.
            unknown = np.zeros(self.glove_dim, dtype='float32')

        vectors = [self.glove_embeddings.get(word, unknown) for word in text.lower().split()]
        if not vectors:
            return torch.zeros(self.glove_dim, dtype=torch.float32)

        return torch.tensor(np.mean(vectors, axis=0), dtype=torch.float32)

    def forward(self, input_ids, attention_mask, texts):
        """Compute class logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder
                (presumably ``(batch, seq_len)`` - confirm against the caller).
            attention_mask: Attention mask tensor passed to the encoder.
            texts: Iterable of raw strings, one per batch row, used for the
                GloVe lookup.

        Returns:
            Logits tensor with 2 values per batch row.
        """
        # Encoder output at sequence position 0 serves as the sentence representation.
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        bert_pooled = bert_outputs.last_hidden_state[:, 0, :]

        # GloVe vectors are built on CPU, then matched to the encoder's device and dtype.
        glove_embeddings = torch.stack([self.get_glove_embedding(text) for text in texts])
        glove_embeddings = glove_embeddings.to(device=bert_pooled.device, dtype=bert_pooled.dtype)

        # Feature fusion
        combined_features = torch.cat([bert_pooled, glove_embeddings], dim=1)
        fused_features = self.fusion(combined_features)

        # Classification
        return self.classifier(fused_features)

# Dataset Class
class DisasterTweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: Sequence of tweet strings (list, NumPy array or pandas Series).
        labels: Sequence of numeric class labels, same length as ``texts``.
        tokenizer: Callable tokenizer; invoked as ``tokenizer(text, ...)`` and
            expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as lists so positional indexing works even for a pandas
        # Series whose index is not 0..n-1 (e.g. after a shuffled split).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"({len(self.texts)} != {len(self.labels)})."
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, flattened token tensors and label for item ``idx``.

        Returns:
            dict with keys ``'text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (1-D tensors) and ``'label'`` (0-dim long tensor).
        """
        text = str(self.texts[idx])

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # int() makes the cast explicit for float placeholders such as 0.0.
            'label': torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }

# Main Classifier
class DisasterTweetClassifier:
    """Training and inference wrapper around ``ModernBertGloveHybrid``.

    Args:
        model_name: Identifier used for both the tokenizer and the model.
        max_length: Stored on the instance as ``self.max_length``.
    """

    def __init__(self, model_name='answerdotai/ModernBERT-base', max_length=128):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ModernBertGloveHybrid(model_name).to(self.device)
        self.max_length = max_length

        # Prefer the GloVe file from the working directory; fall back to the
        # random table when it cannot be loaded.
        try:
            self.model.initialize_glove('glove.6B.100d.txt')
        except FileNotFoundError:
            print("GloVe embeddings file not found. Using random embeddings...")
            self.model.initialize_glove(None)

    def _forward_batch(self, batch):
        """Move a batch's token tensors to the device and return the model logits."""
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        return self.model(input_ids, attention_mask, batch['text'])

    def train(self, train_loader, val_loader, epochs=3, lr=2e-5):
        """Fine-tune the model and checkpoint the best validation accuracy.

        After every epoch the model is evaluated on ``val_loader``; whenever
        validation accuracy improves, the state dict is written to
        ``best_model.pt`` in the working directory.

        Args:
            train_loader: Iterable of batches with keys ``'input_ids'``,
                ``'attention_mask'``, ``'label'`` and ``'text'``.
            val_loader: Iterable of batches with the same keys.
            epochs: Number of passes over ``train_loader``.
            lr: Learning rate for AdamW.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        best_val_acc = 0

        for epoch in range(epochs):
            # Re-enter training mode every epoch: the validation phase below
            # switches to eval mode, which would otherwise leave dropout
            # disabled for all epochs after the first.
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} - Training'):
                optimizer.zero_grad()

                labels = batch['label'].to(self.device)
                outputs = self._forward_batch(batch)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                train_total += labels.size(0)
                train_correct += predicted.eq(labels).sum().item()

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                for batch in tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} - Validation'):
                    labels = batch['label'].to(self.device)
                    outputs = self._forward_batch(batch)
                    loss = criterion(outputs, labels)

                    val_loss += loss.item()
                    predicted = outputs.argmax(dim=1)
                    val_total += labels.size(0)
                    val_correct += predicted.eq(labels).sum().item()

            # max(..., 1) keeps the report from dividing by zero on empty loaders.
            train_acc = 100. * train_correct / max(train_total, 1)
            val_acc = 100. * val_correct / max(val_total, 1)
            avg_train_loss = train_loss / max(len(train_loader), 1)
            avg_val_loss = val_loss / max(len(val_loader), 1)

            print(f'\nEpoch {epoch+1}:')
            print(f'Training Loss: {avg_train_loss:.4f}, Accuracy: {train_acc:.2f}%')
            print(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.2f}%')

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save(self.model.state_dict(), 'best_model.pt')
                print(f'New best model saved with validation accuracy: {val_acc:.2f}%')

    def predict(self, test_loader):
        """Return the predicted class index for every example in ``test_loader``.

        Args:
            test_loader: Iterable of batches with keys ``'input_ids'``,
                ``'attention_mask'`` and ``'text'``.

        Returns:
            list of NumPy integer class indices, in loader order.
        """
        self.model.eval()
        predictions = []

        with torch.no_grad():
            for batch in tqdm(test_loader, desc='Generating predictions'):
                outputs = self._forward_batch(batch)
                predictions.extend(outputs.argmax(dim=1).cpu().numpy())

        return predictions

def preprocess_text(text):
    """Normalise a tweet for tokenization.

    Lower-cases the text, strips URLs, @mentions, '#' characters and any
    character that is neither a word character nor whitespace, then collapses
    runs of whitespace to single spaces. Non-string input yields ''.
    """
    if not isinstance(text, str):
        return ''

    # URLs go first so their punctuation does not survive as stray fragments.
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', text.lower(), flags=re.MULTILINE)
    for pattern in (r'@\w+', r'#', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)

    tokens = cleaned.split()
    return ' '.join(tokens)

def main():
    """Run the full pipeline: load data, train, predict, write ``submission.csv``.

    Reads ``nlp_dataset/train.csv`` and ``nlp_dataset/test.csv``. If either
    file is missing a message is printed and the function returns ``None``
    without training.

    Raises:
        ValueError: If the training data lacks ``'text'``/``'target'`` or the
            test data lacks ``'id'``/``'text'``.
    """
    # Load and preprocess data
    print("Loading local dataset files...")
    try:
        train_df = pd.read_csv('nlp_dataset/train.csv')
        test_df = pd.read_csv('nlp_dataset/test.csv')
    except FileNotFoundError:
        print("Dataset files not found. Please check the paths.")
        return

    # Validate the schema before any column is accessed, so a malformed file
    # fails with a clear message instead of a KeyError during preprocessing.
    if 'text' not in train_df or 'target' not in train_df:
        raise ValueError("Dataset missing required columns: 'text' or 'target'.")
    if 'id' not in test_df or 'text' not in test_df:
        raise ValueError("Test dataset missing required columns: 'id' or 'text'.")

    print("Preprocessing texts...")
    train_df['text'] = train_df['text'].apply(preprocess_text)
    test_df['text'] = test_df['text'].apply(preprocess_text)

    # Split training data (stratified so both splits keep the class balance)
    print("Preparing training and validation splits...")
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['text'].values,
        train_df['target'].values,
        test_size=0.2,
        random_state=42,
        stratify=train_df['target']
    )

    # Initialize classifier
    print("Initializing ModernBERT classifier...")
    classifier = DisasterTweetClassifier()

    # Create datasets
    train_dataset = DisasterTweetDataset(X_train, y_train, classifier.tokenizer)
    val_dataset = DisasterTweetDataset(X_val, y_val, classifier.tokenizer)
    test_dataset = DisasterTweetDataset(
        test_df['text'].values,
        # Placeholder labels: the test set has no targets; predict() ignores them.
        np.zeros(len(test_df), dtype=np.int64),
        classifier.tokenizer
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Train model
    print("\nStarting model training...")
    classifier.train(train_loader, val_loader)

    # Generate predictions
    print("\nGenerating predictions...")
    predictions = classifier.predict(test_loader)

    # Save predictions
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'target': predictions
    })
    submission_df.to_csv('submission.csv', index=False)
    print("Predictions saved to: submission.csv")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Loading local dataset files...
Preprocessing texts...
Preparing training and validation splits...
Initializing ModernBERT classifier...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading GloVe embeddings...
GloVe embeddings loaded successfully!

Starting model training...


Epoch 1/3 - Training:   1%|          | 1/191 [01:19<4:11:03, 79.28s/it]