# Binary-class Experiments

In [None]:
import pandas as pd

from witan_experiments import (is_cached,
                               save_to_cache,
                               load_from_cache,
                               run_experiments)
from witan_experiments.evaluation import (summarise_experiments,
                                          metric_line_grid,
                                          build_metric_df,
                                          display_metric_table,
                                          display_friedman_test,
                                          median_stds_df)
from witan_experiments.config import prepare_experiment_configs
from witan_experiments.rule_seeders import BlankRS, AccRS
from witan_experiments.rule_generators import (TrueRG,
                                               IWSBinaryRG,
                                               WitanRG,
                                               SnubaRG,
                                               SemiSupervisedRG,
                                               ActiveLearningRG,
                                               RandomLabellingRG,
                                               CbiRG,
                                               HdcRG)
from witan_experiments.labellers import SnorkelLblr
from witan_experiments.models import AnnClf
from witan_experiments.utils import inverse_dict

In [None]:
# Display label -> ruleset generator instance for every method compared in
# this notebook. The labels are reused further down as friendly method names
# (through inverse_dict) and their insertion order drives the legend order of
# the line plots, so keep new entries in the position they should appear.
ruleset_generators = {
    'Full supervision': TrueRG(),
    'Wɪᴛᴀɴ': WitanRG(),
    'Wɪᴛᴀɴ-Core': WitanRG(a=False, o=1),
    'IWS-AS': IWSBinaryRG(acq='AS'),
    'IWS-LSE-AC': IWSBinaryRG(acq='LSE'),
    'Snuba': SnubaRG(),
    'HDC': HdcRG(),
    'CBI': CbiRG(clf=AnnClf()),
    'Semi-supervised': SemiSupervisedRG(),
    'Active learning': ActiveLearningRG(clf=AnnClf(), init_count=0),
    'Random labelling': RandomLabellingRG(),
}

# Settings shared by every experiment group below. Each value is a list;
# individual groups override entries (e.g. rule_seeder, rngseed) as needed
# before the settings are passed to prepare_experiment_configs.
base_config = {
    'rule_seeder': [BlankRS()],
    'rngseed': [1, 2, 3, 4, 5],
    'labeller': [SnorkelLblr()],
    'classifier': [AnnClf()],
}

# Experiment groups that are run with the higher rule-worker parallelism.
parallel_configs = [
    # Unseeded ruleset_generators that are not affected by rngseed
    # (so executing them for the first rngseed only is sufficient)
    dict(
        base_config,
        rngseed=base_config['rngseed'][:1],
        ruleset_generator=[
            ruleset_generators[label]
            for label in ('Full supervision', 'Wɪᴛᴀɴ', 'Wɪᴛᴀɴ-Core', 'HDC')
        ],
    ),
    # Unseeded ruleset_generators (run for every rngseed)
    dict(
        base_config,
        ruleset_generator=[
            ruleset_generators[label]
            for label in ('Snuba', 'Semi-supervised', 'Active learning', 'Random labelling')
        ],
    ),
    # Seeded ruleset_generators (AccRS replaces the blank rule seeder)
    dict(
        base_config,
        rule_seeder=[AccRS()],
        ruleset_generator=[
            ruleset_generators[label]
            for label in ('Wɪᴛᴀɴ', 'Wɪᴛᴀɴ-Core', 'CBI')
        ],
    ),
]
# Experiment groups for the IWS generators, which are run with fewer
# rule workers than the groups in parallel_configs.
serial_configs = [
    # Unseeded ruleset_generators
    dict(
        base_config,
        ruleset_generator=[
            ruleset_generators[label] for label in ('IWS-AS', 'IWS-LSE-AC')
        ],
    ),
    # Seeded ruleset_generators (AccRS replaces the blank rule seeder)
    dict(
        base_config,
        rule_seeder=[AccRS()],
        ruleset_generator=[
            ruleset_generators[label] for label in ('IWS-AS', 'IWS-LSE-AC')
        ],
    ),
]

# Interaction counts to evaluate: the full grid is used for the datasets
# shown in the line plots, the reduced grid for all remaining datasets.
full_ic = [10, 25, 50, 100, 150, 200]
min_ic = [25, 100]

# Dataset name -> interaction counts to run. Insertion order is the order
# in which datasets are processed.
dataset_to_ic = dict(
    imdb=full_ic,
    imdb_genre=min_ic,
    bias_pa=full_ic,
    bias_pt=min_ic,
    bias_jp=min_ic,
    bias_pp=min_ic,
    amazon=min_ic,
    yelp=min_ic,
    plots=min_ic,
    fakenews=full_ic,
    binary_dbpedia=min_ic,
    binary_agnews=full_ic,
    airline_tweets=min_ic,
    damage=min_ic,
    spam=min_ic,
)

def _expand_configs(config_templates, dataset, ic):
    """Flatten the prepare_experiment_configs output of each template for one dataset."""
    expanded = []
    for template in config_templates:
        expanded.extend(
            prepare_experiment_configs(
                **template,
                dataset_name=[dataset],
                interaction_count=ic,
            )
        )
    return expanded


# Dataset name -> its expanded experiment configs, kept in two lists
# ('parallel_configs' and 'serial_configs') so they can be run separately.
dataset_configs = {
    dataset: {
        'parallel_configs': _expand_configs(parallel_configs, dataset, ic),
        'serial_configs': _expand_configs(serial_configs, dataset, ic),
    }
    for dataset, ic in dataset_to_ic.items()
}

In [None]:
CACHE_KEY = 'binary-experiments'
CONTINUE_ON_FAILURE = False

# Only run the experiments when no summary has been cached under CACHE_KEY;
# either way the summary is read back from the cache afterwards.
if not is_cached(CACHE_KEY):
    summaries = []
    for dataset, configs in dataset_configs.items():
        print(f'\nRunning experiments for: {dataset}')
        uses_full_ic = dataset_to_ic[dataset] == full_ic
        stages = (
            # Resource-heavy ruleset_generators: serial for full_ic,
            # otherwise a little parallelism is allowed
            ('serial_configs', 1 if uses_full_ic else 2),
            # Lightweight ruleset_generators: more parallelism
            ('parallel_configs', 4),
        )
        for config_key, rule_workers in stages:
            stage_results = run_experiments(
                configs[config_key],
                default_workers=2,
                rule_workers=rule_workers,
                continue_on_failure=CONTINUE_ON_FAILURE,
            )
            summaries.append(summarise_experiments(stage_results, workers=8))
    save_to_cache(CACHE_KEY, pd.concat(summaries))

df = load_from_cache(CACHE_KEY)

In [None]:
# The result tables only cover the 25- and 100-interaction runs.
table_ic_mask = df['interaction_count'].isin([25, 100])
table_df = df.loc[table_ic_mask].copy()

# Reverse lookup used to turn a ruleset_generator value back into its
# friendly label when naming seeded/unseeded methods.
rg_label_map = inverse_dict(ruleset_generators)

def method_label(row):
    """Return the friendly method name for a results row.

    The name is the ruleset generator's label from ``rg_label_map``,
    prefixed with ``'Seeded '`` when the row's rule seeder equals ``AccRS()``
    and left unprefixed when it equals ``BlankRS()``.

    Raises:
        ValueError: if the row's rule seeder is neither of those two.
    """
    seeder = row['rule_seeder']
    if seeder == BlankRS():
        prefix = ''
    elif seeder == AccRS():
        prefix = 'Seeded '
    else:
        raise ValueError('Unknown rule_seeder')
    return prefix + rg_label_map[row['ruleset_generator']]

table_df['method'] = table_df.apply(method_label, axis=1)

# Split the results: full supervision is kept in its own frame and removed
# from table_df, which is then divided by rule seeder.
is_full_supervision = table_df['ruleset_generator'].isin(
    [ruleset_generators['Full supervision']]
)
full_supervision_table_df = table_df[is_full_supervision]
table_df = table_df[~is_full_supervision]

unseeded_table_df = table_df[table_df['rule_seeder'] == BlankRS()]
seeded_table_df = table_df[table_df['rule_seeder'] == AccRS()]

## F1 Score Results

In [None]:
legend_label_suffix = '  '

# Line plots are drawn only for these four datasets.
line_plot_datasets = ['imdb', 'bias_pa', 'fakenews', 'binary_agnews']
line_plot_df = df[df['dataset_name'].isin(line_plot_datasets)]

# Legend entries follow the insertion order of ruleset_generators.
legend_order = [f'{label}{legend_label_suffix}' for label in ruleset_generators]

fig = metric_line_grid(
    line_plot_df,
    metric='test_macro_f1',
    facet_row='dataset_name',
    facet_col='rule_seeder',
    ruleset_generators=ruleset_generators,
    legend_label_suffix=legend_label_suffix,
    category_orders={
        'dataset_name': ['IMD', 'BPA', 'FNS', 'BAG'],
        'rule_seeder': ['Unseeded', 'Seeded'],
        'ruleset_generator': legend_order,
    },
)
fig.write_image('plots/binary-f1-lines.svg')
fig.show()

### Unseeded F1 Scores

In [None]:
# Full supervision rows are included in the unseeded table but are
# excluded from the ranking.
unseeded_with_full_supervision_df = pd.concat(
    [full_supervision_table_df, unseeded_table_df]
)
unseeded_f1_df = build_metric_df(
    unseeded_with_full_supervision_df,
    method='ruleset_generator',
    metric='test_macro_f1',
    labelled_methods=ruleset_generators,
)
table = display_metric_table(
    unseeded_f1_df,
    rank_excluded_methods=['Full supervision'],
)
display(table)
latex_source = table.to_latex(multirow_align='t', convert_css=True)
print(latex_source)

### Seeded F1 Scores

In [None]:
# Same table as for the unseeded methods, restricted to the seeded rows.
seeded_f1_df = build_metric_df(
    seeded_table_df,
    method='ruleset_generator',
    metric='test_macro_f1',
    labelled_methods=ruleset_generators,
)
table = display_metric_table(seeded_f1_df)
display(table)
latex_source = table.to_latex(multirow_align='t', convert_css=True)
print(latex_source)

### F1 Score Friedman Test

We compare the overall performance of unseeded and seeded methods across all datasets at low and high interaction counts using the Friedman test with Nemenyi post-hoc tests; the results are presented below.

In [None]:
# Group by the combined 'method' label so that seeded and unseeded
# variants are compared together in a single test.
full_f1_df = build_metric_df(
    table_df,
    method='method',
    metric='test_macro_f1',
)
display_friedman_test(
    full_f1_df,
    svg_file_prefix='plots/binary-',
)

### Standard Deviations

#### Unseeded Methods

In [None]:
# Standard deviation over rngseed for all unseeded rows of df
# (not restricted to the interaction counts used in the tables).
unseeded_rows = df[df['rule_seeder'] == BlankRS()]
unseeded_f1_std_df = build_metric_df(
    unseeded_rows,
    method='ruleset_generator',
    metric='test_macro_f1',
    rngseed_agg='std',
    labelled_methods=ruleset_generators,
)
unseeded_std_table = display_metric_table(unseeded_f1_std_df)
display(unseeded_std_table)

#### Seeded Methods

In [None]:
# Standard deviation over rngseed for all seeded rows of df
# (not restricted to the interaction counts used in the tables).
seeded_rows = df[df['rule_seeder'] == AccRS()]
seeded_f1_std_df = build_metric_df(
    seeded_rows,
    method='ruleset_generator',
    metric='test_macro_f1',
    rngseed_agg='std',
    labelled_methods=ruleset_generators,
)
seeded_std_table = display_metric_table(seeded_f1_std_df)
display(seeded_std_table)

#### Median Standard Deviations

In [None]:
def _seeded_index_entry(idx):
    """Prefix the first element of an index entry with 'Seeded '."""
    return (f'Seeded {idx[0]}', idx[1])


# Relabel the seeded rows so they stay distinguishable from the unseeded
# ones once both frames are stacked.
seeded_relabelled_std_df = seeded_f1_std_df.set_index(
    seeded_f1_std_df.index.map(_seeded_index_entry)
)
full_f1_std_df = pd.concat([unseeded_f1_std_df, seeded_relabelled_std_df])

print('Binary classification table')
table_median_stds = median_stds_df(
    full_f1_std_df,
    datasets=dataset_to_ic.keys(),
    ics=min_ic,
)
display(table_median_stds)

print('Binary classification line plot')
line_plot_median_stds = median_stds_df(
    full_f1_std_df,
    datasets=['imdb', 'bias_pa', 'fakenews', 'binary_agnews'],
    ics=full_ic,
)
display(line_plot_median_stds)