# Preliminaries

The following notebook contains code adapted from the Wrench tutorials (https://github.com/JieyuZ2/wrench).

In [1]:
%load_ext autoreload
#%autoreload 2
import os

import wrench.wrench as wrnch
# import spacy
import numpy as np
import nltk

import logging
import torch

from wrench.wrench.dataset import load_dataset
from wrench.wrench.logging import LoggingHandler
from wrench.wrench.endmodel import MLPModel
from wrench.wrench.labelmodel import MajorityVoting, FlyingSquid
from typing import Any, List, Optional, Union, Callable

#### Configure root logging: timestamped messages at INFO level and above,
#### routed through the imported LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)

In [2]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension



Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


# Creating Labeling Functions

First, we need to load our data into the Wrench-provided pipeline. A set of prior labeling features is provided in the dataset to use in lieu of user-provided labeling functions; to use them, set `extract_feature` to `True`.

In [3]:
# Location of the datasets on disk and the name of the one to load.
dataset_home = './data'
data = 'youtube'

#### Feature extraction: use a pre-trained BERT model and cache the result.
model_name = 'bert-base-cased'
extract_fn = 'bert'

# NOTE(review): the keyword is `extract_feature` (singular). The original note
# says setting it to True enables the pre-set labeling function outputs; the
# log output ("loading features from ..._bert.pkl") suggests it controls
# feature extraction/loading instead -- confirm against load_dataset.
train_data, valid_data, test_data = load_dataset(
    dataset_home,
    data,
    extract_feature=True,
    extract_fn=extract_fn,
    cache_name=extract_fn,
    model_name=model_name,
)


2022-03-29 11:26:24 - loading data from data/youtube/train.json


  0%|          | 0/1686 [00:00<?, ?it/s]

2022-03-29 11:26:24 - loading data from data/youtube/valid.json


  0%|          | 0/120 [00:00<?, ?it/s]

2022-03-29 11:26:24 - loading data from data/youtube/test.json


  0%|          | 0/250 [00:00<?, ?it/s]

2022-03-29 11:26:24 - loading features from data/youtube/train_bert.pkl
2022-03-29 11:26:24 - loading features from data/youtube/valid_bert.pkl
2022-03-29 11:26:24 - loading features from data/youtube/test_bert.pkl


In [4]:
from synthesis import TEXT_SANDBOX_GRAMMAR, TextLabelFunction, enumerate_from_grammar
# defining lfs
def lf1(data):
    """Keyword-based labeling function that votes for class 1.

    Args:
        data: The item to inspect (presumably the comment text as a string --
            TODO confirm). Each keyword is tested with ``in``, so for strings
            the match is a case-sensitive substring test and may hit inside
            longer words.

    Returns:
        ``np.array(1)`` if any keyword occurs in ``data``, otherwise
        ``np.array(-1)`` (presumably the "abstain" value -- TODO confirm
        against the label model's convention).
    """
    # A missing comma between "SUBSCRIBE" and "Check" previously fused them
    # into the single keyword "SUBSCRIBECheck" (implicit string-literal
    # concatenation), so neither word could ever match on its own.
    keywords = [
        "<br", "amp", "Follow", "check", "channel", "plz", "PLEASE", "OUT",
        "fuck", "out", "SUBSCRIBE", "Check", "guys", "my", "My", "subscribe",
        "http", "https", "please", "href", "money", "$", "making", "per",
    ]
    if any(word in data for word in keywords):
        return np.array(1)
    return np.array(-1)
# def lf2(data):
#     if "my channel" in data or "leave some" in data  or "EXTRAORDINARY website" in data or "You can make" in data or "make money online" in data:
#         return np.array(1)
#     else:
#         return np.array(-1)
# def lf3(data):
#     if "like this" in data :
#         return np.array(0)
#     else:
#         return np.array(-1)
    
def lf2(data):
    """Keyword-based labeling function that votes for class 0.

    Scans ``data`` for a fixed set of keywords using ``in`` and returns
    ``np.array(0)`` on the first hit; returns ``np.array(-1)`` when none of
    the keywords is present.
    """
    keywords = ("!!!!", "PSY", "like", "song", "video", "music")
    for keyword in keywords:
        if keyword in data:
            return np.array(0)
    return np.array(-1)

# def lf5(data):
#     if any(word in data for word in ["PSY","like", "song" ]):
#         return np.array(0)
#     else:
#         return np.array(-1)

# def lf6(data):
#     if any(word in data for word in ["video", "music"]):
#         return np.array(0)
#     else:
#         return np.array(-1)
# Build an iterator over functions enumerated from the sandbox grammar and pull
# the first two items from it. Only synth_fxn2 is registered with the label
# function set later in this notebook; synth_fxn1 is not used in the cells shown.
fxn_gnr = enumerate_from_grammar(TEXT_SANDBOX_GRAMMAR, TextLabelFunction)
synth_fxn1 = next(fxn_gnr)
synth_fxn2 = next(fxn_gnr)

If you want to generate and provide labeling functions manually, here are some ways of doing so:

In [7]:
#### Generate procedural labeling functions, using the LF generation defined by WRENCH.
from wrench.wrench.synthetic import ConditionalIndependentGenerator, NGramLFGenerator

#### Generate procedural labeling functions
# generator = NGramLFGenerator(dataset=train_data, min_acc_gain=0.1, min_support=0.01, ngram_range=(1, 2))
# applier = generator.generate(mode='correlated', n_lfs=10)
# L_test = applier.apply(test_data)
# L_train = applier.apply(train_data)
# print(len(train_data.examples))
# print(L_train.shape)

from labelfunction import LabelFunctionSet, RandomLFGenerator
# ## Use our API to include and apply custom labeling functions.
# LF_genr = RandomLFGenerator(num_functions=10, output_size=2) # 2 is the output size for binary classes
# # As a placeholder, we're just going to generate 'random' labeling functions.
# # If we want to use our own, we can just replace 'random_lfs' with our list of LFs!
# random_lfs = LF_genr.get_random_lfs()

# Collect the labeling functions to apply; only the second synthesized
# function is registered here (the hand-written lf1 is left disabled).
LFSet = LabelFunctionSet()
#LFSet.add_function(lf1)
LFSet.add_function(synth_fxn2.generate_synthesized_function)


# Apply the registered labeling functions to the test and train splits.
L_test = LFSet.apply_labels(test_data)
L_train = LFSet.apply_labels(train_data)

#### Evaluate label model on real-world dataset with semi-synthetic labeling functions
# NOTE(review): L_test is never used below -- label_model.test receives
# test_data directly. Confirm whether the evaluation is meant to use the labels
# produced by LFSet (L_test) rather than whatever weak labels test_data carries.
label_model = MajorityVoting()
label_model.fit(dataset_train=L_train, dataset_valid=valid_data)
target_value = label_model.test(test_data, metric_fn='acc')


In [8]:
# Display the 'acc' metric returned by label_model.test(...) in the previous cell.
target_value

0.84

#### Make sure you run a cell to generate the labeled data if you're providing manual functions!

# Downstream Model Training Pipeline

In [9]:
#### Generate soft training label via a label model
# NOTE(review): predict_proba is given train_data here, not the L_train matrix
# built from the custom labeling functions -- confirm which weak labels are
# intended to drive the soft labels.
soft_label = label_model.predict_proba(train_data)
# Display the resulting array (the output below shows two columns per row).
soft_label

array([[0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.5       , 0.5       ],
       ...,
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [10]:
#### Set up an MLP classifier to be trained with the soft labels

# Settings consumed by the MLPModel constructor.
n_steps = 100000
batch_size = 128
test_batch_size = 1000

# Settings passed to model.fit / model.test in the following cells.
device = torch.device('cpu')
target = 'acc'
patience = 200
evaluation_step = 50

model = MLPModel(
    n_steps=n_steps,
    batch_size=batch_size,
    test_batch_size=test_batch_size,
)





In [None]:
# Fit the MLP on the training split, supplying the label model's soft labels
# as y_train; the validation split, metric, patience and evaluation_step are
# forwarded to model.fit as configured above.
history = model.fit(
    dataset_train=train_data,
    dataset_valid=valid_data,
    y_train=soft_label,
    device=device,
    metric=target,
    patience=patience,
    evaluation_step=evaluation_step,
)



[TRAIN] MLP Classifier:   0%|                                                                                 …

In [10]:
#### Evaluate the trained model
# Scores the model on the test split using the metric named by `target` ('acc').
metric_value = model.test(test_data, target)
print(metric_value)

0.916


In [11]:
#### Evaluate the trained model
# NOTE(review): this cell repeats the preceding evaluation cell verbatim and
# overwrites metric_value with the same computation -- consider removing one.
metric_value = model.test(test_data, target)
print(metric_value)

0.916
