In [1]:
!pip install -U -q mlflow datasets>=2.14.5 nlp 2>/dev/null

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


In [3]:
##### Import Dependencies #####
import os
import torch
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, ClassLabel, load_metric
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

2024-03-07 16:31:31.611936: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 16:31:31.612028: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 16:31:31.734676: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
##### Read Data #####

# Read csv
df = pd.read_csv('/kaggle/input/wiki-detection/wiki-detection.csv')

# Print stats
print(f"df.shape: {df.shape}")
print(f"df.columns: {df.columns}")
print(df.head())


df.shape: (150000, 2)
df.columns: Index(['text', 'label'], dtype='object')
                                                text  label
0  Josefine Jakobsen (born 17 May 1991) is a Dani...      0
1  A cash crop or profit crop is an agricultural ...      0
2  The Lo Presti 'ndrina of Bardonecchia, known a...      0
3  John Tran (Vietnamese: Trần Diệc Tuyền; born N...      0
4  Euthyphro (; ; c. 399–395 BC), by Plato, is a ...      0


In [26]:
##### Create Dataset #####

# Class type
classes = [0, 1]

# Testing set fraction
test_fraction = 0.9

# Dataset
unsplitted_dataset = Dataset.from_pandas(df)

# Construct class label
ClassLabels = ClassLabel(num_classes=2, names=classes)
unsplitted_dataset = unsplitted_dataset.cast_column('label', ClassLabels)

# Split the dataset
dataset = unsplitted_dataset.train_test_split(test_size=test_fraction, shuffle=True, stratify_by_column='label')

Casting the dataset:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [27]:
##### Language Model related Setup #####

# Language model type
lm_type = 'bert-base-uncased'

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(lm_type)

# Model
lm_encoder = AutoModel.from_pretrained(lm_type)
lm_classifier = AutoModelForSequenceClassification.from_pretrained(lm_type, num_labels=2)

# Encode text data
def encode_data(example):
    encoding = tokenizer(example['text'], truncation=True)
    encoding['label'] = example['label']
    return encoding

dataset = dataset.map(encode_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/135000 [00:00<?, ? examples/s]

In [7]:
##### Defining Metric of Accuracy #####

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return accuracy

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [28]:
##### Setup device #####

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lm_classifier = lm_classifier.to(device)

In [29]:
##### Language Model Training Setup #####

# Training hyperparameters
num_train_epochs = 1
learning_rate = 2e-7
train_batch_size = 8
eval_batch_size = 64
warmup_steps = 50
weight_decay = 0.02
output_dir = f"wiki-generated-intro-detection-{lm_type}"
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Defining training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    evaluation_strategy="epoch",
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir="./logs",
    num_train_epochs=num_train_epochs,
    save_steps=1000,
    save_total_limit=1,
    report_to="mlflow"
)

# Set trainer
trainer = Trainer(
    model=lm_classifier,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [30]:
##### Training and Evaluation #####

# Train
trainer.train()

# Evaluate
# results = trainer.evaluate()
# print(results)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0567,0.134925,0.976215


TrainOutput(global_step=1875, training_loss=0.10609483439127604, metrics={'train_runtime': 2659.4346, 'train_samples_per_second': 5.64, 'train_steps_per_second': 0.705, 'total_flos': 2572153121648160.0, 'train_loss': 0.10609483439127604, 'epoch': 1.0})

## Geberating LM Embeddings

In [12]:
##### Get Different Models #####

from functools import partial
from tqdm.notebook import tqdm

# Language model type

lm_type_list = [
#     'distilbert-base-uncased',
    'bert-base-uncased',
#     'microsoft/deberta-base',
]
    
# Encode text data
def encode_dataset(example, lm_tokenizer):
    encoding = lm_tokenizer(example['text'], truncation=True)
    encoding['label'] = example['label']
    return encoding


In [7]:
##### Setup device #####

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
##### Generating and Saving the Embedding #####

from torch.utils.data import DataLoader

num_samples = len(unsplitted_dataset)

for lm_name in lm_type_list:
    saving_name = lm_name.split('/')[-1]
    print(f"Dealing with {saving_name} ...")
    cur_tokenizer = AutoTokenizer.from_pretrained(lm_name)
    cur_model = AutoModel.from_pretrained(lm_name).to(device)

    encode_dataset_partial = partial(encode_dataset, lm_tokenizer=cur_tokenizer)
    cur_dataset = unsplitted_dataset.map(encode_dataset_partial)

    # Initialize a empty numpy array to save the embedding
    cls_embeddings = np.zeros((num_samples, cur_model.config.hidden_size))

    # Iterate the dataset
    for i, example in tqdm(enumerate(cur_dataset)):
        item = {}
        item['attention_mask'] = torch.IntTensor(np.array(example['attention_mask'])).unsqueeze(0).to(device)
        item['input_ids'] = torch.IntTensor(np.array(example['input_ids']).astype(np.int32)).unsqueeze(0).to(device)
        if saving_name not in ['distilbert-base-uncased']:
            item['token_type_ids'] = torch.IntTensor(np.array(example['token_type_ids'])).unsqueeze(0).to(device)

        with torch.no_grad():
            lm_outputs = cur_model(**item)
            emb = lm_outputs.last_hidden_state
            cls_embeddings[i] = emb.permute(1, 0, 2)[0].cpu().numpy()
    np.save(f'wiki-output/{saving_name}_embeddings.npy', cls_embeddings)

Dealing with deberta-base ...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

In [14]:
##### Save Label #####

np.save('wiki-output/wiki-label.npy', np.array(unsplitted_dataset['label']))

## 4. MLP with fixed LM embedding

In [15]:
##### Read the Data #####

"""
LM emb type:
'roberta-base',
'albert-base-v2',
'distilbert-base-uncased',
'bert-base-uncased',
'deberta-base',
"""
lm_type_list = [
    'distilbert-base-uncased',
    'bert-base-uncased',
    'deberta-base',
]
root_path = 'wiki-output/'
emb_path_list = [root_path + emb_type + '_embeddings.npy' for emb_type in lm_type_list]
label_path = root_path + 'wiki-label.npy'

# Read
emb_list = [np.load(emb_path) for emb_path in emb_path_list]
label = np.load(label_path)


In [18]:
##### Construct Dataset #####

# Import dependencies
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Construct a dataset
class EmbDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Dataset and split
N, D = emb_list[0].shape
emb_dataset_list = [EmbDataset(emb, label) for emb in emb_list]
emb_train_fraction = 0.8
emb_train_size = int(emb_train_fraction * N)
emb_valid_size = N - emb_train_size
emb_train_valid = [random_split(emb_dataset, [emb_train_size, emb_valid_size]) for emb_dataset in emb_dataset_list]



In [19]:
##### MLP model #####

# MLP model
class MLP(nn.Module):
    def __init__(self, input_size, output_size, hidden_layers):
        super(MLP, self).__init__()
        self.input_size = input_size
        self.output_size = output_size

        assert len(hidden_layers) > 0
        self.hidden_layers = nn.ModuleList([])
        for i, layer_size in enumerate(hidden_layers):
            if i == 0:
                self.hidden_layers.append(nn.Linear(input_size, layer_size))
            else:
                self.hidden_layers.append(nn.Linear(hidden_layers[i-1], layer_size))

        self.output_layer = nn.Linear(hidden_layers[-1], output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        for layer in self.hidden_layers:
            x = self.relu(layer(x))
        x = self.output_layer(x)
        return x


In [20]:
##### Initialize Model #####

# Model hyperparameters
input_size = D
output_size = 2
hidden_layers = [256, 128]

# Training hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 1e-3


In [29]:
##### Train and Eval #####

from sklearn.metrics import roc_auc_score

# Define eval
def eval(mlp_model, emb_valid_loader):
    mlp_model.eval()
    correct = 0
    total = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch_emb, batch_label in emb_valid_loader:
            batch_emb = batch_emb.to(device)
            batch_label = batch_label.to(device)
            outputs = mlp_model(batch_emb.float())
            _, predicted = torch.max(outputs.data, 1)
            total += batch_label.size(0)
            correct += (predicted == batch_label).sum().item()
            y_true.extend(batch_label.cpu().numpy())
            y_pred.extend(outputs[:, 1].cpu().numpy())
    print(f"Validation Accuracy: {100 * correct / total}%")
    print(f"Validation AUC: {roc_auc_score(y_true, y_pred)}")

# Define training
def train(mlp_model, emb_train_loader, emb_valid_loader, criterion, optimizer):
    for epoch in range(num_epochs):
        mlp_model.train()
        total_loss = 0
        for batch_emb, batch_label in emb_train_loader:
            batch_emb = batch_emb.to(device)
            batch_label = batch_label.to(device)
            optimizer.zero_grad()
            outputs = mlp_model(batch_emb.float())
            loss = criterion(outputs, batch_label)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Training Loss: {total_loss / len(emb_train_loader)}")
        eval(mlp_model, emb_valid_loader)


In [30]:
##### Training #####

for i in range(3):
    print(f'Now dealing with {lm_type_list[i]}')
    # Model initialization
    mlp_model = MLP(input_size, output_size, hidden_layers).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(mlp_model.parameters(), lr=learning_rate)

    # Dataloader
    emb_train_loader = DataLoader(emb_train_valid[i][0], batch_size=batch_size, shuffle=True)
    emb_valid_loader = DataLoader(emb_train_valid[i][1], batch_size=batch_size, shuffle=False)
    
    # Train
    train(mlp_model, emb_train_loader, emb_valid_loader, criterion, optimizer)

Now dealing with distilbert-base-uncased
Epoch 1, Training Loss: 0.1609278850749135
Validation Accuracy: 95.58333333333333%
Validation AUC: 0.9926686803986947
Epoch 2, Training Loss: 0.10632828851789236
Validation Accuracy: 96.61666666666666%
Validation AUC: 0.995314733190388
Epoch 3, Training Loss: 0.09041621039013067
Validation Accuracy: 96.54%
Validation AUC: 0.9951538233133905
Epoch 4, Training Loss: 0.08168215119813879
Validation Accuracy: 95.77%
Validation AUC: 0.9963725542950944
Epoch 5, Training Loss: 0.07451102772740026
Validation Accuracy: 97.08666666666667%
Validation AUC: 0.9964761874738025
Epoch 6, Training Loss: 0.06651370524714391
Validation Accuracy: 97.09333333333333%
Validation AUC: 0.9967458506316598
Epoch 7, Training Loss: 0.06149057482307156
Validation Accuracy: 96.99%
Validation AUC: 0.9968226877202009
Epoch 8, Training Loss: 0.058821020236611364
Validation Accuracy: 97.16333333333333%
Validation AUC: 0.997116640251547
Epoch 9, Training Loss: 0.054662465528585016
