GPU Test

In [1]:
import torch

# Environment check: report the installed PyTorch version and whether a CUDA
# device is visible. The device name is only queried when CUDA is available.
cuda_available = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if cuda_available else "None"

print(torch.__version__)
print("CUDA Available:", cuda_available)
print("Device:", device_name)


ModuleNotFoundError: No module named 'torch'

Requirements

In [None]:
#!pip install transformers pandas numpy scikit-learn torch matplotlib seaborn tqdm

Setting up the Environment

In [None]:
# Import libraries
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from transformers import get_linear_schedule_with_warmup

ModuleNotFoundError: No module named 'torch'

Data Preparation and Preprocessing

In [None]:
# Load the labelled training tweets and the unlabelled test tweets.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Quick look at the data: sizes, first rows, and per-column null counts.
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print("\nSample data:")
print(train_df.head())

print("\nMissing values:")
print(train_df.isnull().sum())

# Replace missing location/keyword entries with the placeholder "unknown"
# in both frames so later string formatting never sees NaN.
for _frame in (train_df, test_df):
    for _column in ('location', 'keyword'):
        _frame[_column] = _frame[_column].fillna("unknown")

# Preprocess text
def preprocess_text(text):
    """Normalize a raw tweet string for tokenization.

    The text is lowercased, then URLs, @mentions and '#' characters are
    removed (the hashtag word itself is kept), and finally runs of
    whitespace are collapsed to single spaces and the ends are trimmed.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (e.g. NaN) is
            treated as missing.

    Returns:
        str: The cleaned text, or the literal ``"empty"`` for non-string input.
    """
    if not isinstance(text, str):
        return "empty"

    cleaned = text.lower()
    # Removal order matters only for readability here: URLs, mentions, '#'.
    for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'#'):
        cleaned = re.sub(pattern, '', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()

# Add the normalized tweet text as a `clean_text` column on both frames.
for _frame in (train_df, test_df):
    _frame['clean_text'] = _frame['text'].apply(preprocess_text)

# Create train and validation sets: an 80/20 split stratified on the label,
# with a fixed random_state so the split is repeatable.
feature_columns = ['clean_text', 'keyword', 'location']
labels = train_df['target']
X_train, X_val, y_train, y_val = train_test_split(
    train_df[feature_columns],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

Model Implementation

In [None]:
class EarlyStopping:
    """Track validation loss across epochs and signal when to stop training.

    Each call compares the new validation loss with the best one seen so far.
    An improving loss checkpoints the model and resets the patience counter;
    otherwise the counter is incremented, and once it reaches ``patience`` the
    ``early_stop`` flag is set.
    """

    def __init__(self, patience=5, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): Number of consecutive non-improving calls tolerated.
            verbose (bool): If True, print counter and checkpoint messages.
            delta (float): Margin the score must gain over the best score.
            path (str): File the model ``state_dict`` is written to.
        """
        # Configuration.
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path
        # Running state.
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stop flag."""
        # Negated so that a larger score means a better (lower) loss.
        score = -val_loss

        first_call = self.best_score is None
        if first_call or not (score < self.best_score + self.delta):
            # First observation or an improvement: checkpoint and reset.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Write the model's state_dict to ``self.path`` and remember the loss."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

# ---------------------- CLASS WEIGHTS ----------------------
# Inverse-frequency class weights address class imbalance by giving more weight to the minority class
def calculate_class_weights(y_train, num_classes=None):
    """Compute inverse-frequency ("balanced") class weights for a loss function.

    Each present class ``c`` gets ``n_samples / (n_classes * count_c)``, where
    ``n_classes`` is the length of the bincount (highest label + 1, or
    ``num_classes`` if that is larger).

    Args:
        y_train: Sequence of non-negative integer class labels (list, NumPy
            array, pandas Series, ...).
        num_classes (int, optional): Minimum number of classes to produce
            weights for. Lets callers get a fixed-length weight vector even
            when the highest class ids are absent from ``y_train``.

    Returns:
        torch.Tensor: float32 tensor with one weight per class id.

    Raises:
        ValueError: If ``y_train`` is empty.
    """
    labels = np.asarray(y_train)
    if labels.size == 0:
        raise ValueError("y_train must contain at least one label")

    class_counts = np.bincount(labels, minlength=num_classes or 0)
    n_classes = len(class_counts)

    # Classes with no samples get weight 0 instead of dividing by zero; an
    # `inf` weight would turn the loss into NaN as soon as that class is seen.
    class_weights = np.zeros(n_classes, dtype=np.float64)
    present = class_counts > 0
    class_weights[present] = labels.size / (n_classes * class_counts[present])

    return torch.tensor(class_weights, dtype=torch.float32)


In [None]:
# ---------------------- DATASET AND MODEL ----------------------
class HybridBERTRoBERTaDataset(Dataset):
    """Dataset that encodes each sample for both a BERT and a RoBERTa tokenizer.

    For every index the tweet text, keyword and location are joined into one
    string and tokenized twice, once per tokenizer, padded/truncated to
    ``max_len``.

    Args:
        texts, keywords, locations: Position-indexable via ``.iloc`` (pandas
            Series in this notebook).
        targets: Optional labels, also indexed via ``.iloc``. When ``None``
            the returned items carry no ``'targets'`` key.
        bert_tokenizer: Tokenizer used for the ``bert_*`` tensors.
        roberta_tokenizer: Tokenizer used for the ``roberta_*`` tensors.
        max_len (int): Fixed sequence length for both encodings.
    """

    def __init__(self, texts, keywords, locations, targets=None, bert_tokenizer=None, roberta_tokenizer=None, max_len=128):
        self.texts = texts
        self.keywords = keywords
        self.locations = locations
        self.targets = targets
        self.bert_tokenizer = bert_tokenizer
        self.roberta_tokenizer = roberta_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _combine(tokenizer, text, keyword, location):
        """Join the three fields using the tokenizer's own separator token.

        A literal "[SEP]" is only a special token for BERT; RoBERTa's separator
        is different, so each tokenizer must get its own ``sep_token``. Falls
        back to "[SEP]" when the tokenizer does not define one.
        """
        sep = getattr(tokenizer, 'sep_token', None) or '[SEP]'
        return f"{text} {sep} keyword: {keyword} {sep} location: {location}"

    def _encode(self, tokenizer, combined_text, return_token_type_ids):
        """Tokenize one string to fixed-length ``(1, max_len)`` tensors."""
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`.
        return tokenizer(
            combined_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=return_token_type_ids,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return a dict of 1-D tensors for sample ``idx`` (plus 'targets' if labelled)."""
        text = self.texts.iloc[idx]
        keyword = self.keywords.iloc[idx]
        location = self.locations.iloc[idx]

        bert_encoding = self._encode(
            self.bert_tokenizer,
            self._combine(self.bert_tokenizer, text, keyword, location),
            return_token_type_ids=True,
        )
        # RoBERTa doesn't use token_type_ids.
        roberta_encoding = self._encode(
            self.roberta_tokenizer,
            self._combine(self.roberta_tokenizer, text, keyword, location),
            return_token_type_ids=False,
        )

        # The tokenizers return a leading batch dimension of 1; flatten it away
        # so the DataLoader can stack samples into (batch, max_len).
        item = {
            'bert_input_ids': bert_encoding['input_ids'].flatten(),
            'bert_attention_mask': bert_encoding['attention_mask'].flatten(),
            'bert_token_type_ids': bert_encoding['token_type_ids'].flatten(),
            'roberta_input_ids': roberta_encoding['input_ids'].flatten(),
            'roberta_attention_mask': roberta_encoding['attention_mask'].flatten(),
        }

        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets.iloc[idx], dtype=torch.long)

        return item

class HybridBERTRoBERTaModel(nn.Module):
    """Classifier that fuses mean-pooled BERT and RoBERTa sentence vectors.

    Both encoders process their own tokenization of the same sample. Their
    masked mean-pooled outputs are concatenated, scaled by a learned scalar
    gate in (0, 1) (the ``attention`` sub-network ends in a sigmoid), and fed
    to an MLP that produces ``num_classes`` logits.
    """

    def __init__(self, bert_model_name='bert-base-uncased', roberta_model_name='roberta-base', num_classes=2, dropout_rate=0.3):
        super().__init__()

        # Pretrained encoders, each followed by dropout on its pooled vector.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert_dropout = nn.Dropout(dropout_rate)

        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.roberta_dropout = nn.Dropout(dropout_rate)

        # Width of the concatenated [BERT ; RoBERTa] representation.
        fused_dim = self.bert.config.hidden_size + self.roberta.config.hidden_size

        classifier_layers = [
            nn.Linear(fused_dim, 768),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(768, 384),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(384, num_classes),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

        gate_layers = [
            nn.Linear(fused_dim, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        self.attention = nn.Sequential(*gate_layers)

    def mean_pooling(self, last_hidden_state, attention_mask):
        """Average token embeddings over the positions where the mask is 1.

        Args:
            last_hidden_state: (batch_size, seq_len, hidden_size) tensor.
            attention_mask: (batch_size, seq_len) tensor of 0/1 flags.

        Returns:
            (batch_size, hidden_size) tensor. The divisor is clamped to at
            least 1e-9, so a fully masked row yields zeros rather than NaN.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count

    def forward(self, bert_input_ids, bert_attention_mask, bert_token_type_ids,
                roberta_input_ids, roberta_attention_mask):
        """Return classification logits of shape (batch_size, num_classes)."""
        # BERT branch: encode, mean-pool, dropout.
        bert_hidden = self.bert(
            input_ids=bert_input_ids,
            attention_mask=bert_attention_mask,
            token_type_ids=bert_token_type_ids,
            return_dict=True,
        ).last_hidden_state
        bert_vec = self.bert_dropout(self.mean_pooling(bert_hidden, bert_attention_mask))

        # RoBERTa branch: encode (no token type ids), mean-pool, dropout.
        roberta_hidden = self.roberta(
            input_ids=roberta_input_ids,
            attention_mask=roberta_attention_mask,
            return_dict=True,
        ).last_hidden_state
        roberta_vec = self.roberta_dropout(self.mean_pooling(roberta_hidden, roberta_attention_mask))

        # Fuse both views and scale by the per-sample gate before classifying.
        fused = torch.cat((bert_vec, roberta_vec), dim=1)
        gate = self.attention(fused)
        return self.classifier(fused * gate)


Training Functions

In [None]:
def _labels_for_class_weights(train_dataloader):
    """Return all training labels, avoiding a tokenizing pass when possible.

    If the underlying dataset exposes a ``targets`` attribute it is read
    directly; otherwise the loader is iterated once and the ``'targets'``
    entries of every batch are collected.
    """
    dataset_targets = getattr(getattr(train_dataloader, 'dataset', None), 'targets', None)
    if dataset_targets is not None:
        return np.asarray(dataset_targets).astype(np.int64)

    collected = []
    for batch in train_dataloader:
        collected.extend(batch['targets'].numpy())
    return collected


def _batch_to_device(batch, device):
    """Split a batch dict into model keyword inputs and targets, both on ``device``."""
    input_keys = (
        'bert_input_ids',
        'bert_attention_mask',
        'bert_token_type_ids',
        'roberta_input_ids',
        'roberta_attention_mask',
    )
    inputs = {key: batch[key].to(device) for key in input_keys}
    return inputs, batch['targets'].to(device)


def _run_validation(model, val_dataloader, criterion, device):
    """Evaluate on the validation loader; return (mean loss, predictions, targets)."""
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in val_dataloader:
            inputs, targets = _batch_to_device(batch, device)
            outputs = model(**inputs)
            total_loss += criterion(outputs, targets).item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return total_loss / max(len(val_dataloader), 1), all_preds, all_targets


def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs=3):
    """Fine-tune ``model`` with class-weighted cross-entropy and early stopping.

    Every epoch runs one pass over ``train_dataloader`` (gradient norm clipped
    to 1.0, ``scheduler.step()`` after every batch) followed by a validation
    pass. Validation loss drives ``EarlyStopping`` with a patience of 5, which
    writes the best weights to ``'best_hybrid_model.pt'``.

    Args:
        model: Module accepting the five ``bert_*`` / ``roberta_*`` keyword inputs.
        train_dataloader: Yields dicts with those inputs plus ``'targets'``.
        val_dataloader: Same batch format, used for validation.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batches are moved to.
        epochs (int): Maximum number of epochs.

    Returns:
        tuple: ``(model, history)``. ``model`` holds the weights of the last
        epoch that ran (the best checkpoint is on disk); ``history`` maps
        metric names to per-epoch lists.
    """
    early_stopping = EarlyStopping(patience=5, verbose=True, path='best_hybrid_model.pt')

    # Weighted loss to counter class imbalance.
    class_weights = calculate_class_weights(_labels_for_class_weights(train_dataloader))
    class_weights = class_weights.to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    history = {
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_precision': [],
        'val_recall': [],
        'val_f1': [],
        'val_f1_disaster': []  # F1 for the disaster class (class 1) only
    }

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 10)

        # ---- Training phase ----
        model.train()
        running_loss = 0.0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            inputs, targets = _batch_to_device(batch, device)

            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = criterion(outputs, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        epoch_train_loss = running_loss / max(len(train_dataloader), 1)
        history['train_loss'].append(epoch_train_loss)
        print(f"Training Loss: {epoch_train_loss:.4f}")

        # ---- Validation phase ----
        epoch_val_loss, all_preds, all_targets = _run_validation(
            model, val_dataloader, criterion, device
        )
        epoch_val_accuracy = accuracy_score(all_targets, all_preds)

        # zero_division=0 yields the same value scikit-learn would use after
        # warning (0) but without flooding the output in early epochs.
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_targets, all_preds, average='weighted', zero_division=0
        )

        # Pin labels so index 1 is always class 1, even when a class is
        # missing from both the targets and the predictions.
        _, _, per_class_f1, _ = precision_recall_fscore_support(
            all_targets, all_preds, average=None, labels=[0, 1], zero_division=0
        )
        f1_disaster = per_class_f1[1]

        history['val_loss'].append(epoch_val_loss)
        history['val_accuracy'].append(epoch_val_accuracy)
        history['val_precision'].append(precision)
        history['val_recall'].append(recall)
        history['val_f1'].append(f1)
        history['val_f1_disaster'].append(f1_disaster)

        print(f"Validation Loss: {epoch_val_loss:.4f}")
        print(f"Validation Accuracy: {epoch_val_accuracy:.4f}")
        print(f"Validation Precision: {precision:.4f}")
        print(f"Validation Recall: {recall:.4f}")
        print(f"Validation F1: {f1:.4f}")
        print(f"Validation F1 (Disaster class): {f1_disaster:.4f}")

        # Checkpoints on improvement; sets early_stop once patience runs out.
        early_stopping(epoch_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered!")
            break

    return model, history

def evaluate_model(model, test_dataloader, device):
    """Predict a class index for every sample in ``test_dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches need the five ``bert_*`` / ``roberta_*`` inputs; no targets are read.

    Args:
        model: Callable accepting the five inputs as keyword arguments and
            returning per-class scores of shape (batch, num_classes).
        test_dataloader: Iterable of batch dicts.
        device: Device the input tensors are moved to.

    Returns:
        list: Predicted class indices (NumPy integer scalars) in loader order.
    """
    input_keys = (
        'bert_input_ids',
        'bert_attention_mask',
        'bert_token_type_ids',
        'roberta_input_ids',
        'roberta_attention_mask',
    )

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_dataloader:
            inputs = {key: batch[key].to(device) for key in input_keys}
            outputs = model(**inputs)
            # Index of the highest score along the class dimension.
            preds = torch.max(outputs, 1)[1]
            predictions.extend(preds.cpu().numpy())

    return predictions

def plot_training_history(history):
    """Plot per-epoch training metrics on a 3x2 grid, save it, and show it.

    Panels, in order: loss, validation accuracy, validation precision/recall,
    validation F1 (plus the disaster-class F1 when ``'val_f1_disaster'`` is
    present), and a "Learning Curves" panel that plots the same train/val
    loss series as the first panel. The figure is written to
    ``'training_history.png'``.

    Args:
        history (dict): Metric name -> list of per-epoch values.
    """
    loss_series = [
        ('train_loss', 'Training Loss', {}),
        ('val_loss', 'Validation Loss', {}),
    ]

    f1_series = [('val_f1', 'Validation F1 (Overall)', {})]
    if 'val_f1_disaster' in history:
        f1_series.append(
            ('val_f1_disaster', 'Validation F1 (Disaster Class)', {'linestyle': '--'})
        )

    # (title, y-axis label, series); the list position gives the subplot slot.
    panels = [
        ('Loss', 'Loss', loss_series),
        ('Accuracy', 'Accuracy', [('val_accuracy', 'Validation Accuracy', {})]),
        ('Precision and Recall', 'Score', [
            ('val_precision', 'Validation Precision', {}),
            ('val_recall', 'Validation Recall', {}),
        ]),
        ('F1 Score', 'Score', f1_series),
        ('Learning Curves', 'Loss', loss_series),
    ]

    plt.figure(figsize=(15, 12))

    for slot, (title, y_label, series) in enumerate(panels, start=1):
        plt.subplot(3, 2, slot)
        for key, label, style in series:
            plt.plot(history[key], label=label, **style)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.savefig('training_history.png')
    plt.show()

Main

In [None]:
def main():
    """Train the hybrid model, evaluate it on validation data, and write a submission.

    Reads the notebook-level ``X_train``, ``X_val``, ``y_train``, ``y_val`` and
    ``test_df``. Side effects: writes ``best_hybrid_model.pt`` (via training),
    ``training_history.png``, ``confusion_matrix.png`` and ``submission.csv``.
    """
    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cpu":
        print("GPU not available, using CPU.")
    print(f"Using device: {device}")

    # Upper bound on epochs; early stopping inside train_model may end sooner.
    EPOCHS = 25

    # Initialize tokenizers
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def build_dataset(frame, targets=None):
        """Wrap one split in a dataset that shares both tokenizers."""
        return HybridBERTRoBERTaDataset(
            texts=frame['clean_text'],
            keywords=frame['keyword'],
            locations=frame['location'],
            targets=targets,
            bert_tokenizer=bert_tokenizer,
            roberta_tokenizer=roberta_tokenizer
        )

    # Create datasets
    train_dataset = build_dataset(X_train, y_train)
    val_dataset = build_dataset(X_val, y_val)
    test_dataset = build_dataset(test_df)

    # Only the training loader shuffles; evaluation order must match the frames.
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Initialize model
    model = HybridBERTRoBERTaModel()
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-8)

    # Linear warmup over the first 10% of steps, then linear decay. The
    # scheduler is stepped once per batch, so steps = batches * epochs.
    total_steps = len(train_dataloader) * EPOCHS
    warmup_steps = int(0.1 * total_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Train model
    model, history = train_model(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=EPOCHS
    )

    # Plot training history
    plot_training_history(history)

    # Load the best checkpoint written during training. map_location makes
    # the tensors land on the active device regardless of where they were saved.
    model.load_state_dict(torch.load('best_hybrid_model.pt', map_location=device))

    # Get predictions on validation set for detailed evaluation
    val_predictions = evaluate_model(model, val_dataloader, device)
    print("\nDetailed Validation Metrics:")
    print(classification_report(y_val, val_predictions))

    # Create confusion matrix
    cm = confusion_matrix(y_val, val_predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig('confusion_matrix.png')
    plt.show()

    # Show up to 10 randomly chosen misclassified validation tweets.
    misclassified_indices = np.where(np.array(val_predictions) != np.array(y_val))[0]
    if len(misclassified_indices) > 0:
        print("\nSample of misclassified tweets:")
        sample_size = min(10, len(misclassified_indices))
        sampled_indices = np.random.choice(misclassified_indices, sample_size, replace=False)

        for idx in sampled_indices:
            text = X_val['clean_text'].iloc[idx]
            true_label = y_val.iloc[idx]
            pred_label = val_predictions[idx]
            print(f"Text: {text}")
            print(f"True: {true_label}, Predicted: {pred_label}")
            print("-" * 50)

    # Get predictions on test set
    test_predictions = evaluate_model(model, test_dataloader, device)

    # Create submission file
    submission = pd.DataFrame({
        'id': test_df['id'],
        'target': test_predictions
    })
    submission.to_csv('submission.csv', index=False)
    print("\nTest predictions saved to 'submission.csv'")

if __name__ == "__main__":
    main()