In [None]:
!pip install transformers torch scikit-learn pandas



In [None]:
import json
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import gc
import matplotlib.pyplot as plt

In [None]:
# torch is already imported in the notebook's import cell, so it is not
# re-imported here; this cell only reports whether PyTorch can see a CUDA GPU.
print(torch.cuda.is_available())

True


In [None]:
def clear_cuda_memory():
    """Release as much GPU memory as possible between training runs.

    The Python garbage collector runs first so that tensors kept alive only
    by reference cycles are actually freed; only then is PyTorch's CUDA
    caching allocator asked to hand its unused blocks back to the driver.
    Calling ``empty_cache()`` before ``gc.collect()`` would leave the memory
    freed by the collector sitting in the allocator's cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [None]:

def preprocess_data(path='/content/Sarcasm_Headlines_Dataset.json'):
    """Load the sarcasm-headlines dataset and return shuffled texts and labels.

    The file is read as JSON Lines: one JSON object per line. Blank lines
    (for example a trailing newline at the end of the file) are skipped.

    Args:
        path: Location of the JSON Lines file. Defaults to the Colab path the
            notebook originally hard-coded.

    Returns:
        A ``(headlines, labels)`` tuple of pandas Series taken from the
        ``headline`` and ``is_sarcastic`` columns, shuffled with a fixed
        ``random_state`` of 42 and re-indexed from 0. Returns
        ``(None, None)`` when the file does not exist.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            # Skip whitespace-only lines so json.loads never sees an empty string.
            data = [json.loads(line) for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: dataset file not found: {path}")
        return None, None

    df = pd.DataFrame(data)
    # Keep only the text and the target; any other field in the records is dropped.
    df = df[['headline', 'is_sarcastic']]
    # Deterministic shuffle so downstream splits are reproducible.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df['headline'], df['is_sarcastic']


In [None]:
def compute_metrics(pred):
    """Compute the accuracy metric for a batch of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        A dict with a single ``'accuracy'`` entry.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [None]:


# Load and preprocess data
headline_texts, y = preprocess_data()
if headline_texts is None:
    # In a Jupyter/IPython session `exit()` is aimed at the kernel itself rather
    # than at the current cell, so raise instead: this stops the cell (and a
    # "Run All") with a clear message while keeping the session usable.
    raise RuntimeError(
        "Dataset could not be loaded; see the message printed by preprocess_data()."
    )

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Hold out 20% of the headlines for testing; stratify so both splits keep the
# same sarcastic / non-sarcastic ratio, and fix the seed for reproducibility.
train_texts, test_texts, y_train, y_test = train_test_split(
    headline_texts, y, test_size=0.2, random_state=42, stratify=y
)


def _encode_headlines(texts):
    """Tokenize an iterable of headlines with the settings shared by both splits.

    Sequences are truncated to at most 128 tokens, padded within the call,
    and returned as PyTorch tensors.
    """
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )


# Tokenize data
train_encodings = _encode_headlines(train_texts)
test_encodings = _encode_headlines(test_texts)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
class SarcasmDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping of field name to an indexable collection with one
            entry per example (e.g. ``input_ids``, ``attention_mask``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull row `idx` out of every encoded field, then attach the label as
        # a long tensor under the 'labels' key.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Wrap each split's encodings together with its labels (materialised as a
# plain list so they can be indexed by position) in a SarcasmDataset.
train_dataset = SarcasmDataset(encodings=train_encodings, labels=list(y_train))
test_dataset = SarcasmDataset(encodings=test_encodings, labels=list(y_test))


In [None]:
def create_training_args(output_dir, logging_dir, seed=42):
    """Build the ``TrainingArguments`` used to fine-tune one ensemble member.

    Args:
        output_dir: Directory where checkpoints are written.
        logging_dir: Directory forwarded as ``logging_dir``.
        seed: Random seed forwarded to ``TrainingArguments``. Exposed so that
            callers training several models can give each one a different
            seed; the default of 42 keeps the previous behaviour.

    Returns:
        A ``TrainingArguments`` instance configured for 3 epochs with
        per-epoch evaluation and checkpointing, keeping the best checkpoint
        (by ``accuracy``) at the end of training.
    """
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    return TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=logging_dir,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        save_total_limit=2,
        lr_scheduler_type="linear",
        logging_steps=50,
        seed=seed,
    )



In [None]:
def train_models(num_models=3, base_seed=42):
    """Fine-tune several DistilBERT classifiers to be used as an ensemble.

    Each member starts from the same pretrained checkpoint, but its freshly
    initialised classification head is drawn under a different random seed.
    Without that, successive members can start from the same RNG state and
    end up as exact duplicates, which adds nothing to the ensemble.

    Args:
        num_models: Number of models to train.
        base_seed: Seed for the first model; model ``i`` (0-based) uses
            ``base_seed + i``.

    Returns:
        List of the trained models, in training order.
    """
    trained_models = []

    for i in range(num_models):
        # Free whatever GPU memory the previous iteration left behind.
        clear_cuda_memory()

        # Seed PyTorch before the model is built so the randomly initialised
        # classifier weights differ from one ensemble member to the next.
        torch.manual_seed(base_seed + i)

        # Fresh model for every member: pretrained encoder plus a new 2-class head.
        model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=2
        ).to(device)

        # Separate output / logging directories so members do not overwrite
        # each other's checkpoints.
        training_args = create_training_args(
            f'/content/model_{i+1}',
            f'/content/logs_{i+1}'
        )

        # NOTE(review): the test split is also used as the evaluation set
        # during training; if checkpoints are selected on it, the reported
        # test accuracy is optimistic - consider a separate validation split.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics,
        )

        # Train model
        print(f"Training Model {i+1}")
        trainer.train()

        # Evaluate and print results
        eval_results = trainer.evaluate()
        print(f"Evaluation Results for Model {i+1}: {eval_results}")

        # Save model
        trainer.save_model(f'/content/model_{i+1}')

        # Keep the trained model for ensembling.
        trained_models.append(model)

    return trained_models





In [None]:
def ensemble_predict(models, test_encodings, device, batch_size=32):
    """Predict class labels by averaging the logits of several models.

    Args:
        models: Iterable of models. Each one is switched to eval mode, called
            as ``model(**batch)`` and must return an object with a ``logits``
            attribute.
        test_encodings: Mapping of input name to a tensor whose first
            dimension indexes examples; must contain ``'input_ids'``.
        device: Device each batch is moved to before the forward pass.
            Assumes every model already lives on this device - TODO confirm
            at call sites.
        batch_size: Number of examples per forward pass.

    Returns:
        NumPy array with one predicted class index per example.

    Raises:
        ValueError: If ``models`` is empty (there would be nothing to average).
    """
    models = list(models)
    if not models:
        raise ValueError("ensemble_predict requires at least one model")

    # Loop-invariant: number of examples to score.
    num_examples = len(test_encodings['input_ids'])
    predictions = []

    for model in models:
        model.eval()
        model_preds = []

        # Inference only: no gradients needed.
        with torch.no_grad():
            for start in range(0, num_examples, batch_size):
                batch = {
                    key: val[start:start + batch_size].to(device)
                    for key, val in test_encodings.items()
                }
                model_preds.append(model(**batch).logits)

        # One (num_examples, num_classes) tensor per model.
        predictions.append(torch.cat(model_preds, dim=0))

    # Average the logits across models, then pick the highest-scoring class.
    avg_predictions = torch.mean(torch.stack(predictions), dim=0)
    final_preds = torch.argmax(avg_predictions, dim=1).cpu().numpy()

    return final_preds

In [None]:
# Main execution
def main():
    """Train the ensemble, predict on the test split and print the evaluation.

    Prints the overall accuracy followed by a per-class classification
    report for the averaged-logit ensemble predictions.
    """
    separator = "=" * 50

    # Train the ensemble members, then combine their predictions.
    ensemble_members = train_models()
    y_pred = ensemble_predict(ensemble_members, test_encodings, device)

    # Headline figure: overall accuracy on the held-out split.
    print("Ensemble Model Evaluation:")
    print(separator)
    print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # Per-class precision / recall / F1.
    print("\nDetailed Classification Report:")
    print(separator)
    report = classification_report(
        y_test,
        y_pred,
        target_names=['Non-Sarcastic', 'Sarcastic'],
    )
    print(report)


# Run the main function
if __name__ == "__main__":
    main()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Model 1


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2441,0.237481,0.905835
2,0.1492,0.259853,0.924528
3,0.0884,0.344327,0.927498


Evaluation Results for Model 1: {'eval_loss': 0.34432679414749146, 'eval_accuracy': 0.927498252969951, 'eval_runtime': 19.2919, 'eval_samples_per_second': 296.705, 'eval_steps_per_second': 4.665, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Model 2


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2326,0.232669,0.910727
2,0.1345,0.242268,0.924528
3,0.0839,0.352253,0.925052


Evaluation Results for Model 2: {'eval_loss': 0.3522526025772095, 'eval_accuracy': 0.9250524109014675, 'eval_runtime': 19.2632, 'eval_samples_per_second': 297.147, 'eval_steps_per_second': 4.672, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Model 3


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2326,0.232669,0.910727
2,0.1345,0.242268,0.924528
3,0.0839,0.352253,0.925052


Evaluation Results for Model 3: {'eval_loss': 0.3522526025772095, 'eval_accuracy': 0.9250524109014675, 'eval_runtime': 19.0406, 'eval_samples_per_second': 300.621, 'eval_steps_per_second': 4.727, 'epoch': 3.0}
Ensemble Model Evaluation:
Overall Accuracy: 0.9258

Detailed Classification Report:
               precision    recall  f1-score   support

Non-Sarcastic       0.92      0.94      0.93      2997
    Sarcastic       0.93      0.91      0.92      2727

     accuracy                           0.93      5724
    macro avg       0.93      0.93      0.93      5724
 weighted avg       0.93      0.93      0.93      5724

