In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, ElectraForSequenceClassification
import pandas as pd
import rubrix as rb
from rubrix.labeling.text_classification import Rule, WeakLabels
import re
from tqdm.auto import tqdm
from weasel.datamodules.base_datamodule import AbstractWeaselDataset, AbstractDownstreamDataset
from weasel.models.downstream_models.base_model import DownstreamBaseModel
from torch.utils.data import DataLoader
from typing import Union, List
import numpy as np
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

## read data

In [2]:
# Load both splits of the YouTube comments data from their relative CSV paths.
# The train split is later logged without annotations, the test split with them.
train_df, test_df = (
    pd.read_csv('../tutorials/data/yt_comments_train.csv'),
    pd.read_csv('../tutorials/data/yt_comments_test.csv'),
)

## upload records

In [3]:
# Delete the "weak_supervision_yt" dataset before it is logged again below; presumably
# this keeps a notebook re-run from piling records onto an existing dataset.
# TODO confirm how rb.delete behaves when the dataset does not exist yet.
rb.delete("weak_supervision_yt")

In [4]:
# Class names, indexed by the integer `label` column of the test split
labels = ["HAM", "SPAM"]


def _build_record(record_id, row, annotated):
    """Turn one dataframe row into a Rubrix record, optionally carrying its gold label."""
    fields = {
        "inputs": row.text,
        "metadata": {"video": row.video, "author": row.author},
        "id": record_id,
    }
    if annotated:
        # Only test rows get an annotation; train rows are built without the field.
        fields["annotation"] = labels[row.label]
    return rb.TextClassificationRecord(**fields)


# unlabelled data: the train split, with the dataframe index used as record id
records = [_build_record(i, row, annotated=False) for i, row in train_df.iterrows()]

# labelled data for testing: ids are offset by the number of train records
last = len(records)
records.extend(
    _build_record(last + i, row, annotated=True) for i, row in test_df.iterrows()
)

rb.log(records, name="weak_supervision_yt")

BulkResponse(dataset='weak_supervision_yt', processed=1836, failed=0)

## define rules

In [5]:
# Rules defined as Elasticsearch queries: each Rule pairs a query string with the
# label ("SPAM" or "HAM") assigned to that rule.
# Rules labelled SPAM
check_out = Rule(query="check out", label="SPAM")
plz = Rule(query="plz OR please", label="SPAM")
subscribe = Rule(query="subscribe", label="SPAM")
my = Rule(query="my", label="SPAM")
# Rules labelled HAM
song = Rule(query="song", label="HAM")
love = Rule(query="love", label="HAM")

# Rules defined as Python labeling functions
def contains_http(record: rb.TextClassificationRecord):
    """Label a record "SPAM" when its text contains the substring "http".

    The check is a plain, case-sensitive substring test on ``record.inputs["text"]``.
    Returns ``None`` (abstain) when the substring is absent.
    """
    text = record.inputs["text"]
    return "SPAM" if "http" in text else None

def short_comment(record: rb.TextClassificationRecord):
    """Label a record "HAM" when its text has fewer than 5 whitespace-separated tokens.

    Returns ``None`` (abstain) for texts with 5 or more tokens.
    """
    n_tokens = len(record.inputs["text"].split())
    if n_tokens < 5:
        return "HAM"
    return None

def regex_check_out(record: rb.TextClassificationRecord):
    """Label a record "SPAM" when its text matches ``check.*out``, ignoring case.

    The pattern is searched anywhere in ``record.inputs["text"]``; ``.`` does not
    cross line breaks because no DOTALL flag is passed. Returns ``None`` (abstain)
    when nothing matches.
    """
    match = re.search(r"check.*out", record.inputs["text"], flags=re.I)
    if match is None:
        return None
    return "SPAM"

## compute weak labels

In [6]:
# Bundle all rules: the Elasticsearch-query rules first, then the Python labeling functions
rules = [
    check_out, plz, subscribe, my, song, love,
    contains_http, short_comment, regex_check_out,
]

# Apply the rules to the logged dataset to obtain the weak labels
weak_labels = WeakLabels(dataset="weak_supervision_yt", rules=rules)

# Show per-rule statistics; see the `summary()` docstring for the meaning of each column
weak_labels.summary()

Preparing rules:   0%|          | 0/9 [00:00<?, ?it/s]

Applying rules:   0%|          | 0/1836 [00:00<?, ?it/s]

Unnamed: 0,polarity,coverage,overlaps,conflicts,correct,incorrect,precision
check out,{SPAM},0.242919,0.235839,0.029956,45,0,1.0
plz OR please,{SPAM},0.090414,0.081155,0.019608,20,0,1.0
subscribe,{SPAM},0.106754,0.083878,0.028867,30,0,1.0
my,{SPAM},0.190632,0.166667,0.049564,41,6,0.87234
song,{HAM},0.132898,0.079521,0.033769,39,9,0.8125
love,{HAM},0.092048,0.070261,0.03159,28,7,0.8
contains_http,{SPAM},0.106209,0.073529,0.049564,6,0,1.0
short_comment,{HAM},0.245098,0.110566,0.06427,84,8,0.913043
regex_check_out,{SPAM},0.22658,0.226035,0.027778,45,0,1.0
total,"{SPAM, HAM}",0.754902,0.448802,0.120915,338,30,0.918478


## baseline: snorkel's LabelModel

In [8]:
from snorkel.labeling.model import LabelModel

# Fit the label model on the weak labels of the records that have no annotation
L_unlabelled = weak_labels.matrix(has_annotation=False)
label_model = LabelModel()
label_model.fit(L_train=L_unlabelled, n_epochs=500, log_freq=100, seed=123)

# Score it on the annotated records, using their annotations as ground truth
L_labelled = weak_labels.matrix(has_annotation=True)
label_model.score(L=L_labelled, Y=weak_labels.annotation())



{'accuracy': 0.9043062200956937}

## weasel: define datasets and dataloaders

In [9]:
class TrainDataset(AbstractWeaselDataset):
    """Training dataset pairing each weak-label row of ``L`` with one model input.

    Args:
        L: Weak-label matrix, forwarded to the base initializer.
        inputs: Indexable collection of model inputs, one per row of ``L``.

    Raises:
        ValueError: If ``L`` and ``inputs`` do not hold the same number of samples.
    """

    def __init__(self, L: Union[np.ndarray, torch.Tensor], inputs):
        # The base class receives None as its second positional argument; the model
        # inputs are kept on this subclass instead.
        super().__init__(L, None)
        self.inputs = inputs

        # self.L is read back after the base initializer has run.
        n_rows, n_inputs = self.L.shape[0], len(self.inputs)
        if n_rows != n_inputs:
            raise ValueError("L and inputs have different number of samples")

    def __getitem__(self, item):
        """Return the ``(weak labels, input)`` pair stored at position ``item``."""
        weak_votes = self.L[item]
        encoding = self.inputs[item]
        return weak_votes, encoding
    

class TestDataset(AbstractDownstreamDataset):
    """Evaluation dataset pairing each model input with its target from ``Y``.

    Args:
        inputs: Indexable collection of model inputs.
        Y: Targets, forwarded to the base initializer, one per input.

    Raises:
        ValueError: If ``inputs`` and ``Y`` do not hold the same number of samples.
    """

    def __init__(self, inputs, Y: Union[np.ndarray, torch.Tensor]):
        # The base class receives None as its first positional argument; the model
        # inputs are kept on this subclass instead.
        super().__init__(None, Y)
        self.inputs = inputs

        # self.Y is read back after the base initializer has run.
        n_targets, n_inputs = len(self.Y), len(self.inputs)
        if n_targets != n_inputs:
            raise ValueError("inputs and Y have different number of samples")

    def __getitem__(self, item):
        """Return the ``(input, target)`` pair stored at position ``item``."""
        encoding = self.inputs[item]
        target = self.Y[item]
        return encoding, target

In [10]:
# Tokenizer for the "google/electra-small-discriminator" checkpoint, which is also the
# default checkpoint name of TransformerEndModel defined further down.
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")

In [11]:
# Training split: records without annotation. Each text is tokenized on its own
# (truncated, not padded). Assumes the weak-label matrix rows and the records come
# back in the same order; TrainDataset only checks that their counts match.
train_L = weak_labels.matrix(has_annotation=False)
train_encodings = [
    tokenizer(rec.inputs["text"], truncation=True)
    for rec in weak_labels.records(has_annotation=False)
]
train_ds = TrainDataset(L=train_L, inputs=train_encodings)

# Test split: annotated records, with their annotations as targets.
test_encodings = [
    tokenizer(rec.inputs["text"], truncation=True)
    for rec in weak_labels.records(has_annotation=True)
]
test_ds = TestDataset(inputs=test_encodings, Y=weak_labels.annotation())

In [15]:
class TrainCollator:
    """Collate ``(weak-label row, encoding)`` samples into one training batch.

    Args:
        tokenizer: Object exposing ``pad(inputs, return_tensors=...)``, used to pad
            the encodings of a batch.
    """

    def __init__(self, tokenizer):
        self._tokenizer = tokenizer

    def __call__(self, batch):
        """Return ``(L, padded_inputs)`` for a list of ``(weak-label row, encoding)`` samples.

        ``L`` is the stack of all weak-label rows. The encodings are regrouped into one
        dict of lists, keyed like the first sample's encoding, and passed to the
        tokenizer's ``pad`` with ``return_tensors="pt"``.
        """
        label_rows, encodings = [], []
        for sample in batch:
            label_rows.append(sample[0])
            encodings.append(sample[1])

        L = torch.stack(label_rows)
        # The keys of the first encoding decide which fields are collected.
        inputs = {key: [enc[key] for enc in encodings] for key in encodings[0]}
        padded = self._tokenizer.pad(inputs, return_tensors="pt")
        return L, padded

    
class TestCollator:
    """Collate ``(encoding, target)`` samples into one evaluation batch.

    Args:
        tokenizer: Object exposing ``pad(inputs, return_tensors=...)``, used to pad
            the encodings of a batch.
    """

    def __init__(self, tokenizer):
        self._tokenizer = tokenizer

    def __call__(self, batch):
        """Return ``(padded_inputs, Y)`` for a list of ``(encoding, target)`` samples.

        ``Y`` is the stack of all targets. The encodings are regrouped into one dict of
        lists, keyed like the first sample's encoding, and passed to the tokenizer's
        ``pad`` with ``return_tensors="pt"``.
        """
        encodings, targets = [], []
        for sample in batch:
            encodings.append(sample[0])
            targets.append(sample[1])

        Y = torch.stack(targets)
        # The keys of the first encoding decide which fields are collected.
        inputs = {key: [enc[key] for enc in encodings] for key in encodings[0]}
        padded = self._tokenizer.pad(inputs, return_tensors="pt")
        return padded, Y

# Training batches: reshuffle the samples every epoch instead of always presenting them
# in file order. The dedicated, seeded generator keeps the shuffling order reproducible
# whenever this cell is re-run.
train_loader = DataLoader(
    dataset=train_ds,
    collate_fn=TrainCollator(tokenizer),
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(123),
)

# Evaluation batches: kept in dataset order (no shuffling).
test_loader = DataLoader(
    dataset=test_ds,
    collate_fn=TestCollator(tokenizer),
    batch_size=8,
)

## define end model

In [21]:
class TransformerEndModel(DownstreamBaseModel):
    """End model backed by a Hugging Face sequence-classification transformer.

    Args:
        name: Checkpoint name passed to ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: Number of output classes; also stored as ``out_dim``.
    """

    def __init__(self, name: str = "google/electra-small-discriminator", num_labels: int = 2):
        super().__init__()
        classifier = AutoModelForSequenceClassification.from_pretrained(name, num_labels=num_labels)
        self.model = classifier
        self.out_dim = num_labels

    def forward(self, kwargs):
        """Run the wrapped model on ``kwargs`` (a mapping unpacked as keyword arguments) and return its logits."""
        return self.model(**kwargs)["logits"]

In [22]:
# Seed the random number generators before the model is built: the classification head
# is newly initialized (see the warning this cell prints), so without a seed its starting
# weights, and therefore the training results, change from run to run even though the
# Trainer below asks for deterministic=True. Same seed as the LabelModel baseline.
pl.seed_everything(123)
end_model = TransformerEndModel()

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

## join end model and weasel's label model 

In [23]:
from weasel.models import Weasel

# Join the end model with Weasel's encoder over the labeling-function outputs.
# Both optimizers are selected by name ("adam"); the end model gets the smaller
# learning rate (5e-5) and the encoder the larger one (1e-4).
weasel = Weasel(
    end_model=end_model,
    num_LFs=len(weak_labels.rules),  # one labeling function per rule
    n_classes=2,
    encoder=dict(hidden_dims=[32, 10]),
    optim_encoder=dict(name="adam", lr=1e-4),
    optim_end_model=dict(name="adam", lr=5e-5),
)

## train the joint model

In [24]:
# Keep the checkpoint with the highest validation accuracy
checkpoint_callback = ModelCheckpoint(monitor="Val/accuracy", mode="max")

trainer = pl.Trainer(
    # Use one GPU when CUDA is available and fall back to CPU otherwise, so the
    # notebook also runs on machines without a GPU.
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=3,  # kept small, for illustrative purposes only
    logger=False,
    deterministic=True,
    callbacks=[checkpoint_callback]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [25]:
# Train the joint model on the weakly labelled batches and validate on the annotated ones.
# NOTE(review): test_loader doubles as the validation set, and the checkpoint callback
# monitors "Val/accuracy", so the selected checkpoint is chosen on the test data.
# Confirm this is acceptable for the tutorial.
trainer.fit(model=weasel, train_dataloaders=train_loader, val_dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                | Params
------------------------------------------------------
0 | end_model     | TransformerEndModel | 13.5 M
1 | encoder       | MLPEncoder          | 932   
2 | accuracy_func | Softmax             | 0     
------------------------------------------------------
13.6 M    Trainable params
0         Non-trainable params
13.6 M    Total params
54.201    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]