# Preliminaries

The following notebook contains code adapted from the Wrench tutorials (https://github.com/JieyuZ2/wrench.)

In [1]:
%load_ext autoreload
%autoreload 2
import wrench.wrench as wrnch
import spacy
import numpy as np
import nltk

import logging
import torch

from wrench.wrench.dataset import load_dataset
from wrench.wrench.logging import LoggingHandler
from wrench.wrench.endmodel import MLPModel
from wrench.wrench.labelmodel import MajorityVoting, FlyingSquid
from typing import Any, List, Optional, Union, Callable

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)

# Creating Labeling Functions

First, we need to load our data in to the Wrench-provided pipeline. A set of prior labeling features are provided in the dataset to use in lieu of user-provided labeling functions; in order to use them, set extract_feature to True.

In [3]:
# Set up the location from which to load the data.
dataset_home = './data'
data = 'youtube'
#### Extract data features using pre-trained BERT model and cache it
extract_fn = 'bert'
model_name = 'bert-base-cased'

#Note: you can set extract_features to True if you'd like to use pre-set Labeling Function outputs.
train_data, valid_data, test_data = load_dataset(dataset_home, data, extract_feature=True, extract_fn=extract_fn,
                                                 cache_name=extract_fn, model_name=model_name)


2022-03-17 15:15:04 - loading data from data/youtube/train.json


  0%|          | 0/1686 [00:00<?, ?it/s]

2022-03-17 15:15:04 - loading data from data/youtube/valid.json


  0%|          | 0/120 [00:00<?, ?it/s]

2022-03-17 15:15:04 - loading data from data/youtube/test.json


  0%|          | 0/250 [00:00<?, ?it/s]

2022-03-17 15:15:04 - loading features from data/youtube/train_bert.pkl
2022-03-17 15:15:04 - loading features from data/youtube/valid_bert.pkl
2022-03-17 15:15:04 - loading features from data/youtube/test_bert.pkl


Now, if you're going to manually generate and provide Labeling Functions, we're going to consider some ways of doing so:

In [11]:
#### Generate procedural labeling functions, using the LF generation defined by WRENCH.
from wrench.wrench.synthetic import ConditionalIndependentGenerator, NGramLFGenerator

#### Generate procedural labeling functions
# generator = NGramLFGenerator(dataset=train_data, min_acc_gain=0.1, min_support=0.01, ngram_range=(1, 2))
# applier = generator.generate(mode='correlated', n_lfs=10)
# L_test = applier.apply(test_data)
# L_train = applier.apply(train_data)
# print(len(train_data.examples))
# print(L_train.shape)

from labelfunction import LabelFunctionSet, RandomLFGenerator
## Use our API to include and apply custom labeling functions.
LF_genr = RandomLFGenerator(num_functions=10, output_size=2) # 2 is the output size for binary classes
# As a placeholder, we're just going to generate 'random' labeling functions.
# If we want to use our own, we can just replace 'random_lfs' with our list of LFs!
random_lfs = LF_genr.get_random_lfs()
LFSet = LabelFunctionSet(initial_functions=random_lfs)
L_test = LFSet.apply_labels(test_data)
L_train = LFSet.apply_labels(train_data)

#### Evaluate label model on real-world dataset with semi-synthetic labeling functions
label_model = FlyingSquid()
label_model.fit(dataset_train=L_train, dataset_valid=valid_data)
target_value = label_model.test(test_data, metric_fn='auc')


#### Make sure you run a cell to generate the labeled data if you're providing manual functions!

# Downstream Model Training Pipeline

In [12]:
#### Generate soft training label via a label model
soft_label = label_model.predict_proba(train_data)
soft_label

array([[0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       ...,
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5]])

In [13]:
#### Train a MLP classifier with soft label
device = torch.device('cuda:0')
n_steps = 100000
batch_size = 128
test_batch_size = 1000 
patience = 200
evaluation_step = 50
target='acc'

model = MLPModel(n_steps=n_steps, batch_size=batch_size, test_batch_size=test_batch_size)

In [14]:
# Let's actually train the model here.
history = model.fit(dataset_train=train_data, dataset_valid=valid_data, y_train=soft_label, 
                    device=device, metric=target, patience=patience, evaluation_step=evaluation_step)

[TRAIN] MLP Classifier:   0%|                                                                                 …

2022-03-17 15:17:15 - [INFO] early stop @ step 12700!


In [15]:
#### Evaluate the trained model
metric_value = model.test(test_data, target)
print(metric_value)

0.608
