Assume only the EN train/val sets are available. The model will be evaluated on the FR test set after it has been machine-translated into English.

In [None]:
# Optional one-time setup: uncomment both lines to install wandb with the
# interpreter that runs this kernel.
# import sys
# !{sys.executable} -m pip install wandb

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm, trange
import seaborn as sns; sns.set()

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import init
from torch import optim

from sklearn.metrics import f1_score, precision_recall_fscore_support

In [None]:
# Single source of truth for the seed so NumPy and PyTorch cannot drift apart
# when the value is changed.
SEED = 630

np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7d4ab3c63fd0>

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [None]:
# Weights & Biases is used below (wandb.init / wandb.log) to track the
# training run; log in before any run is created.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33manupath[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Dataset

## Train/val (EN)

In [None]:
# Only the English text is used for training/validation, so the French column is discarded.
train_df = pd.read_csv("../data/mtob_domain_en_fr_train.csv").drop(columns="text_fr")
val_df = pd.read_csv("../data/mtob_domain_en_fr_val.csv").drop(columns="text_fr")
tuple(map(len, (train_df, val_df)))

(11814, 1577)

In [None]:
# Map each integer label id to its text name, ordered by label id.
# Dict keys are unique, so sorting the (id, name) pairs orders them by id alone.
id2label = dict(
    sorted(
        train_df[["label", "label_text"]]
        .drop_duplicates()
        .set_index("label")["label_text"]
        .to_dict()
        .items()
    )
)
class_names = [*id2label.values()]
print(f"{len(class_names)} classes:", class_names)

11 classes: ['messaging', 'calling', 'event', 'timer', 'music', 'weather', 'alarm', 'people', 'reminder', 'recipes', 'news']


## Test (MT FR)

The final model will be evaluated on the French test set after it has been machine-translated into English with [NLLB-200](https://huggingface.co/docs/transformers/en/model_doc/nllb).

In [None]:
# Keep only the identifier, the labels and the FR->EN machine-translated text.
test_df = pd.read_csv("../data/mtob_domain_en2fr_nllb_test.csv").loc[
    :, ["id", "label", "label_text", "text_fr2en"]
]
len(test_df)

3193

# Preprocessing

In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it on whitespace, returning the list of tokens."""
    lowered = text.lower()
    return lowered.split()

In [None]:
# Count how often every token occurs in the English training texts.
vocab_counts = defaultdict(int)
for term in (tok for text in train_df["text_en"].values for tok in tokenize(text)):
    vocab_counts[term] += 1

# Re-order from most to least frequent; the sort is stable, so ties keep
# their first-seen order.
vocab_counts = dict(sorted(vocab_counts.items(), key=lambda pair: -pair[1]))
print(f"No. of words in the training corpus: {len(vocab_counts)}")

No. of words in the training corpus: 8142


In [None]:
# Column index of every vocabulary term (terms in sorted order), plus the inverse lookup.
word2index = {term: position for position, term in enumerate(sorted(vocab_counts))}
index2word = {position: term for term, position in word2index.items()}

In [None]:
def build_term_document_matrix(documents, vocabulary, tokenize=tokenize):
    """
    Build a bag-of-words count matrix.

    documents: list[str] -- raw texts, one per output row
    vocabulary: dict[str, int] -- maps a term to its column index
    tokenize: callable turning one document into an iterable of terms

    Returns an integer array of shape (len(documents), len(vocabulary)) whose
    entry [i, j] is the number of times the term with index j occurs in
    document i. Terms missing from ``vocabulary`` are ignored.
    """
    counts = np.zeros((len(documents), len(vocabulary)), dtype=int)
    for row, text in enumerate(documents):
        for token in tokenize(text):
            col = vocabulary.get(token, -1)
            if col < 0:
                # Out-of-vocabulary term: no column to count it in.
                continue
            counts[row, col] += 1

    return counts


def prepare_dataset(documents, vocabulary, tokenize=tokenize, labels=None):
    """
    Convert raw documents (and optional labels) into tensors.

    Returns ``(X, y)`` where ``X`` is the float term-document count matrix and
    ``y`` is an int64 tensor built from ``labels`` (a NumPy array), or ``None``
    when no labels are given.
    """
    counts = build_term_document_matrix(documents, vocabulary, tokenize)
    X = torch.from_numpy(counts).to(dtype=torch.float)
    if labels is None:
        return X, None
    y = torch.from_numpy(labels).to(dtype=torch.int64)
    return X, y

In [None]:
# Vectorise both English splits with the vocabulary learned from the training set.
(X_train, y_train), (X_val, y_val) = (
    prepare_dataset(
        documents=split["text_en"].values,
        vocabulary=word2index,
        labels=split["label"].values,
    )
    for split in (train_df, val_df)
)

In [None]:
# Sanity check: feature/label shapes for the training and validation splits.
tuple(tensor.shape for tensor in (X_train, y_train, X_val, y_val))

(torch.Size([11814, 8142]),
 torch.Size([11814]),
 torch.Size([1577, 8142]),
 torch.Size([1577]))

In [None]:
# Pair features with labels so a DataLoader can batch them together.
train_ds, val_ds = TensorDataset(X_train, y_train), TensorDataset(X_val, y_val)

In [None]:
train_batch_size, val_batch_size = 128, 256

# Training batches are reshuffled every epoch; validation order stays fixed.
train_dataloader = DataLoader(dataset=train_ds, batch_size=train_batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_ds, batch_size=val_batch_size, shuffle=False)

# Model

In [None]:
class MultiLayerPerceptron(nn.Module):
    """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per example.
        hidden_size: width of the hidden layer.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    # Implement `forward` rather than overriding `__call__`: nn.Module.__call__
    # dispatches to `forward` and also runs registered hooks, which an
    # overridden `__call__` would bypass. `model(x)` keeps working unchanged.
    def forward(self, x):
        """Return the unnormalised class logits for the input batch ``x``."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [None]:
hidden_layer_size = 512

# Input width is the vocabulary size; there is one output logit per domain class.
model = MultiLayerPerceptron(X_train.shape[1], hidden_layer_size, len(class_names))

print(model)

MultiLayerPerceptron(
  (fc1): Linear(in_features=8142, out_features=512, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=512, out_features=11, bias=True)
)


# Training

In [None]:
learning_rate = 1e-3

# Multi-class classification on raw logits, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
max_epochs = 20
# Both intervals are measured in optimisation steps (batches).
logging_interval_loss = 100
logging_interval_score = 100

# Hyper-parameters and run metadata recorded alongside the W&B run.
training_config = dict(
    model="MTOBDomainClassifier",
    dataset=dict(
        train="mtob_domain_en_train",
        val="mtob_domain_en_val",
    ),
    vocabulary_size=X_train.shape[1],
    hidden_layer_size=hidden_layer_size,
    learning_rate=learning_rate,
    train_batch_size=train_batch_size,
    val_batch_size=val_batch_size,
    epochs=max_epochs,
    logging_intervals=dict(
        train_loss=logging_interval_loss,
        dev_score=logging_interval_score,
    ),
    device=device,
)

run_name = "mlp"
run = wandb.init(project="SI630-Project", name=run_name, config=training_config)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect labels, predictions and scores.

    Args:
        model: callable mapping a batch of inputs to class logits.
        dataloader: iterable yielding ``(inputs, labels)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        ``(true_labels, pred_labels, pred_scores)`` as NumPy arrays:
        the labels yielded by the dataloader, the arg-max class per example,
        and the softmax probability of that class.

    Note:
        Gradients are disabled here, but the model's train/eval mode is not
        changed; the caller is expected to call ``model.eval()`` first.
    """
    true_labels = []
    pred_labels = []
    pred_scores = []

    with torch.no_grad():
        with tqdm(dataloader, unit="batches") as itr:
            for (inputs, labels) in itr:
                inputs = inputs.to(device)
                logits = model(inputs)
                probabilities = F.softmax(logits, 1)
                # torch.max over the class dimension yields (best probability, best class index).
                scores_, labels_ = torch.max(probabilities, 1)
                true_labels.append(labels.cpu().numpy())
                pred_labels.append(labels_.cpu().numpy())
                pred_scores.append(scores_.cpu().numpy())

    # `squeeze()` alone would turn a single-example result into a 0-d array
    # that cannot be indexed or iterated; `atleast_1d` keeps it one-dimensional
    # while leaving every other shape exactly as before.
    true_labels = np.atleast_1d(np.concatenate(true_labels).squeeze())
    pred_labels = np.atleast_1d(np.concatenate(pred_labels).squeeze())
    pred_scores = np.atleast_1d(np.concatenate(pred_scores).squeeze())
    return true_labels, pred_labels, pred_scores

def evaluate(true_labels, pred_labels):
    """Compute classification metrics for the predictions.

    Returns a dict with micro-averaged ``f1``, ``precision`` and ``recall``,
    plus ``macro_f1`` and ``weighted_f1``.
    """
    micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
        true_labels, pred_labels, average="micro"
    )
    metrics = {"f1": micro_f1, "precision": micro_precision, "recall": micro_recall}
    metrics["macro_f1"] = f1_score(true_labels, pred_labels, average="macro")
    metrics["weighted_f1"] = f1_score(true_labels, pred_labels, average="weighted")
    return metrics

In [None]:
# All artefacts of this run (best checkpoint, predictions) live under one directory.
run_name = "mlp"
run_dir = os.path.join("../outputs/en_only", run_name)
checkpoint_path = os.path.join(run_dir, "best_model.pth")
os.makedirs(run_dir, exist_ok=True)

In [None]:
# Train for `max_epochs`, periodically logging the running training loss and
# validation metrics to W&B, and checkpointing the weights with the best
# validation micro-F1 seen so far.
global_step = 0
if logging_interval_score:
    best_val_f1 = 0

# Records whether THIS run wrote a checkpoint. Without it, the final load
# would either crash (no file) or silently restore a stale checkpoint left
# behind by an earlier run whenever validation never triggered a save.
checkpoint_saved = False

model.to(device)
model.train(True)

for epoch in trange(max_epochs, unit="epochs"):
    # Losses accumulated since the last loss log (also reset at every epoch start).
    losses = []

    with tqdm(train_dataloader, unit="batches") as itr:
        # `global_step` keeps counting across epochs: each epoch resumes from
        # the last step of the previous one.
        for global_step, (inputs, labels) in enumerate(itr, start=global_step + 1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            if logging_interval_loss and global_step % logging_interval_loss == 0:
                avg_loss = sum(losses) / len(losses)
                wandb.log({"train/global_step": global_step, "train/loss": avg_loss})
                losses = []

            if logging_interval_score and global_step % logging_interval_score == 0:
                # Switch to eval mode for validation, then back to train mode below.
                model.eval()
                val_true, val_preds, _ = predict(model, val_dataloader, device)
                val_metrics = evaluate(val_true, val_preds)
                val_metrics = {f"eval/{k}": v for k, v in val_metrics.items()}
                wandb.log({"eval/global_step": global_step, **val_metrics})

                if (val_f1 := val_metrics["eval/f1"]) > best_val_f1:
                    best_val_f1 = val_f1
                    torch.save(model.state_dict(), checkpoint_path)
                    checkpoint_saved = True

                model.train(True)

if checkpoint_saved:
    # Restore the best weights; `map_location` keeps the load valid even if
    # the checkpoint tensors were saved from a different device.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    print("No checkpoint was saved during this run; keeping the final model weights.")
model.eval()

# Predict

In [None]:
# Vectorise the machine-translated test texts with the training vocabulary.
# The labels passed here are -1 placeholders: only the predictions and scores
# returned by `predict` are used below.
X_test, y_test = prepare_dataset(documents=test_df["text_fr2en"].values, vocabulary=word2index, labels=np.full(len(test_df), -1))
test_ds = TensorDataset(X_test, y_test)
test_dataloader = DataLoader(test_ds, batch_size=val_batch_size, shuffle=False)

_, test_preds, test_scores = predict(model, test_dataloader, device)
# `assign` builds a new frame instead of writing columns onto a slice of
# `test_df`, avoiding pandas' chained-assignment (SettingWithCopy) pitfall.
test_preds_df = test_df[["id"]].assign(label=test_preds, score=test_scores)
test_preds_df.to_csv(os.path.join(run_dir, "translate_test_preds.csv"), index=False)

100%|██████████| 13/13 [00:04<00:00,  3.04batches/s]
