In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader

# Load and preprocess the dataset
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Prepare train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data["text"].fillna(""), train_data["target"], test_size=0.2, random_state=42
)

# Dataset class
class DisasterTweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = str(self.labels.iloc[idx]) if self.labels is not None else None
        inputs = self.tokenizer(
            text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        if label is not None:
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(
                    label, max_length=2, padding="max_length", truncation=True, return_tensors="pt"
                )
            inputs["labels"] = labels["input_ids"].squeeze()
        return {key: val.squeeze() for key, val in inputs.items()}


In [12]:
# T5-Small Model Implementation

def train_t5_small():
    # Load model and tokenizer
    model_name = "t5-small"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Prepare datasets
    train_dataset = DisasterTweetDataset(train_texts, train_labels, tokenizer)
    val_dataset = DisasterTweetDataset(val_texts, val_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Set up optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Training loop
    for epoch in range(3):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Validation loop
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            outputs = model.generate(**inputs)

            # Decode predictions and labels
            batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            batch_true_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # Filter out empty or invalid predictions/labels
            batch_predictions = [pred.strip() for pred in batch_predictions if pred.strip().isdigit()]
            batch_true_labels = [true.strip() for true in batch_true_labels if true.strip().isdigit()]

            predictions.extend(batch_predictions)
            true_labels.extend(batch_true_labels)

    # Convert to integers
    predictions = list(map(int, predictions))
    true_labels = list(map(int, true_labels))

    # Compute F1 score
    if len(predictions) > 0 and len(true_labels) > 0:  # Ensure there are valid entries
        f1 = f1_score(true_labels, predictions)
        print(f"Epoch {epoch + 1}: F1 Score = {f1}")
    else:
        print("No valid predictions or labels were generated for evaluation.")


train_t5_small()


ValueError: Found input variables with inconsistent numbers of samples: [649, 663]

In [None]:
# T5-Base Model Implementation

def train_t5_base():
    model_name = "t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    # The rest of the code is the same as T5-small implementation

train_t5_base()


In [None]:
# T5-Large Model Implementation

def train_t5_large():
    model_name = "t5-large"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    # The rest of the code is the same as T5-small implementation

Train_t5_large()


In [None]:
test_dataset = DisasterTweetDataset(test_data["text"].fillna(""), None, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        outputs = model.generate(**inputs)
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

submission = pd.DataFrame({"id": test_data["id"], "target": predictions})
submission["target"] = submission["target"].astype(int)
submission.to_csv("submission.csv_T5_small", index=False)
