In [5]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import numpy as np
import torch
import os
import wandb

from covid_voices.data.datasets.corona_dataset import CoronaTweetDataset


%load_ext autoreload
%autoreload 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def preprocess_tweet(text):
    """Lowercase a tweet and spell out its hashtag and mention markers.

    Every ``#`` is replaced by ``hashtag_`` and every ``@`` by ``mention_``.

    Args:
        text: The raw tweet string.

    Returns:
        The lowercased string with both markers replaced.
    """
    # Minimal example pipeline - extend with further cleaning steps as needed.
    marker_words = str.maketrans({'#': 'hashtag_', '@': 'mention_'})
    return text.lower().translate(marker_words)
    
# Build the datasets through the class factory, passing preprocess_tweet as the
# text preprocessing hook; the displayed result holds a 'train' and a 'test' entry.
# NOTE(review): the variable name `datasets` is the same as the package that
# `load_dataset` is imported from above - keep that in mind when adding imports.
datasets = CoronaTweetDataset.load_datasets(preprocessing=preprocess_tweet)
datasets


{'train': <covid_voices.data.datasets.corona_dataset.CoronaTweetDataset at 0x706df5561f20>,
 'test': <covid_voices.data.datasets.corona_dataset.CoronaTweetDataset at 0x706df49559d0>}

In [9]:
# Display the first training example to check what a single item looks like.
datasets['train'][0]

{'text': 'mention_menyrbie mention_phil_gahan mention_chrisitv https://t.co/ifz9fan2pa and https://t.co/xx6ghgfzcc and https://t.co/i2nlzdxno8',
 'label': tensor(2)}

In [None]:

# Configuration: checkpoints to fine-tune plus the shared settings used below
model_names = ["distilbert-base-uncased", "huawei-noah/TinyBERT_General_4L_312D"]
num_labels = 5
max_length = 128
batch_size = 128
output_base_dir = "./models/"
project_name = "corona-nlp-ensemble"

# Label mapping: sentiment names are numbered in the order they are listed here
label2id = {
    sentiment: index
    for index, sentiment in enumerate(
        ("Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive")
    )
}
# Reverse lookup (class id -> sentiment name)
id2label = {index: sentiment for sentiment, index in label2id.items()}

# Load the dataset from local CSV

dataset = load_dataset(
    "csv",
    data_files={
        "train": "data/Corona_NLP_train.csv",
        "test": "data/Corona_NLP_test.csv",
    },
    encoding="latin1",
)


def encode_labels(example):
    """Store the integer class id of the row's ``Sentiment`` under ``label``.

    The id is looked up in ``label2id``; the same ``example`` object is
    updated and returned.
    """
    sentiment_name = example["Sentiment"]
    example["label"] = label2id[sentiment_name]
    return example


# Add encoded label column
dataset = dataset.map(encode_labels)

# Preprocessing function
def preprocess(example, tokenizer):
    """Tokenize the ``OriginalTweet`` field with fixed-length padding.

    Args:
        example: Mapping that provides the tweet text under ``OriginalTweet``.
        tokenizer: Callable tokenizer applied to that text.

    Returns:
        Whatever ``tokenizer`` returns when called with ``padding="max_length"``,
        ``truncation=True`` and the module-level ``max_length``.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=max_length)
    return tokenizer(example["OriginalTweet"], **tokenizer_options)
def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions against the reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            index of the largest logit along the last axis.

    Returns:
        ``{"accuracy": value}`` where ``value`` is a Python float.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    is_correct = (predicted == gold).astype(float)
    return {"accuracy": is_correct.mean().item()}


# Hold out 20% of the training CSV for the per-epoch evaluation; the fixed seed
# keeps the split identical across re-runs.
split = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Train each model separately
for i, model_name in enumerate(model_names):
    print(f"Training model {i+1}/{len(model_names)}: {model_name}")

    # Tokenize the split with the tokenizer that belongs to this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized = split.map(lambda x: preprocess(x, tokenizer), batched=True)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Final weights and tokenizer are saved under output_base_dir. Intermediate
    # Trainer checkpoints get a per-model folder so that one model's checkpoints
    # never land in the same directory as another's.
    model_dir = os.path.join(output_base_dir, f"model_{i}")
    checkpoint_dir = os.path.join("./test_output", f"model_{i}")

    wandb.init(project=project_name, name=f"model_{i}_{model_name.replace('/', '_')}", reinit=True)
    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
        )

        training_args = TrainingArguments(
            output_dir=checkpoint_dir,
            eval_strategy="epoch",
            # Use the shared config value instead of repeating the literal.
            per_device_train_batch_size=batch_size,
            num_train_epochs=5,
            do_train=True,
            do_eval=True,
        )

        # NOTE(review): recent transformers releases prefer `processing_class`
        # over `tokenizer` here - confirm against the installed version.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized["train"],
            eval_dataset=tokenized["test"],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        trainer.save_model(model_dir)
        tokenizer.save_pretrained(model_dir)
    except BaseException:
        # Close the run as failed (also on a manual interrupt) so it is not
        # left open in the kernel, then let the original error propagate.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()

print("All ensemble models trained and saved.")

In [None]:
import torch
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from torch.nn import functional as F
from torch import nn
import numpy as np

# Read both CSV files (decoded as latin1) and load the DistilBERT tokenizer.
raw_datasets = load_dataset(
    "csv",
    data_files={
        "train": "data/Corona_NLP_train.csv",
        "test": "data/Corona_NLP_test.csv",
    },
    encoding="latin1",
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    """Tokenize the ``OriginalTweet`` entries of ``batch``.

    Uses the module-level ``tokenizer`` with ``padding="max_length"``,
    ``truncation=True`` and the module-level ``max_length``, and returns the
    tokenizer's output unchanged.
    """
    tweets = batch["OriginalTweet"]
    return tokenizer(tweets, padding="max_length", truncation=True, max_length=max_length)

# Tokenize, then add the integer "label" column with the same encode_labels
# helper used for the ensemble data above. The raw "Sentiment" column holds
# strings, which cannot serve as training labels.
tokenized = raw_datasets.map(tokenize, batched=True).map(encode_labels)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# id2label must map class id -> name; passing label2id for both arguments
# hands the config a name -> id mapping where ids are expected as keys.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    The predicted class is the arg-max over the last axis of ``logits``; the
    result is ``{"accuracy": fraction_correct}`` with a Python float value.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    matches = predicted_ids == labels
    return {"accuracy": matches.astype(float).mean().item()}

# Single-epoch run: evaluate at the end of each epoch, log every 10 steps,
# and write no checkpoints.
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",
)

# Wire the model, both data splits and the accuracy metric into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    compute_metrics=compute_metrics,
)

# Best Practices for Handling Train and Test Datasets

There are three main approaches to handling train and test datasets in NLP projects:

1. **Creating separate dataset objects** - Using separate instances for train and test
2. **Using a dataset class with split parameters** - A single class that loads different data depending on a `split` parameter
3. **Using dataset splitting methods** - Loading all data and then splitting programmatically

Our improved `CoronaTweetDataset` class supports all three approaches for maximum flexibility.

In [None]:
# Example of using the recommended approach in a complete training pipeline

try:
    # Same import path as the notebook's first cell. A bare
    # `datasets.corona_dataset` would most likely be resolved against the
    # installed `datasets` package and fail - TODO confirm package layout.
    from covid_voices.data.datasets.corona_dataset import CoronaTweetDataset
    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    # 1. Load the tokenizer (not used by the data loaders below)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    # 2. Define a preprocessing function (optional)
    def preprocess_tweet(text):
        """Clean and normalize tweet text"""
        # Example preprocessing - you can expand this
        text = text.lower()
        text = text.replace('#', 'hashtag_')
        text = text.replace('@', 'mention_')
        return text

    # 3. Use factory method to load both datasets with consistent parameters.
    #    Loading here keeps the cell from depending on a `datasets` variable
    #    left over from an earlier cell.
    datasets = CoronaTweetDataset.load_datasets(preprocessing=preprocess_tweet)

    # 4. Access individual datasets
    train_dataset = datasets["train"]
    test_dataset = datasets["test"]

    # 5. Create data loaders (only the training loader shuffles)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=4
    )

    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=64,
        shuffle=False,
        num_workers=4
    )

    # 6. Display dataset information
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")

    # 7. Show a batch example: tensor fields print their shape, others their value
    print("\nExample batch from training data:")
    for batch in train_loader:
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                print(f"{key} shape: {value.shape}")
            else:
                print(f"{key}: {value}")
        break  # Just show the first batch

except (ImportError, FileNotFoundError) as e:
    # Only a missing module or missing data file is reported here; any other
    # error propagates.
    print(f"Error: {e}")
    print("Make sure the datasets module is accessible and data files exist.")