# Binary-class Experiments

In [None]:
import pandas as pd

from witan_experiments import (is_cached,
                               save_to_cache,
                               load_from_cache,
                               run_experiments)
from witan_experiments.evaluation import (summarise_experiments,
                                          metric_line_grid,
                                          build_metric_df,
                                          display_metric_table,
                                          display_friedman_test)
from witan_experiments.config import prepare_experiment_configs
from witan_experiments.rule_seeders import BlankRS, AccRS
from witan_experiments.rule_generators import (TrueRG,
                                               IWSBinaryRG,
                                               WitanRG,
                                               SnubaRG,
                                               SemiSupervisedRG,
                                               ActiveLearningRG)
from witan_experiments.labellers import SnorkelLblr
from witan_experiments.models import AnnClf
from witan_experiments.utils import inverse_dict

In [None]:
# Rule generation methods under comparison, keyed by the label used when
# reporting results.
ruleset_generators = {
    'Full supervision': TrueRG(),
    'Wɪᴛᴀɴ': WitanRG(),
    'Wɪᴛᴀɴ-Core': WitanRG(a=False, o=1),
    'IWS-AS': IWSBinaryRG(acq='AS'),
    'IWS-LSE-AC': IWSBinaryRG(acq='LSE'),
    'Snuba': SnubaRG(),
    'Semi-supervised': SemiSupervisedRG(),
    'Active learning': ActiveLearningRG(clf=AnnClf(), init_count=0),
}

# Unseeded grid: every method above, combined with a blank rule seeder.
base_config = {
    'rule_seeder': [BlankRS()],
    'rngseed': [1],
    'ruleset_generator': list(ruleset_generators.values()),
    'labeller': [SnorkelLblr()],
    'classifier': [AnnClf()],
}

# Seeded grid: identical to the unseeded grid except that it uses the AccRS
# seeder and is restricted to the Witan and IWS generators.
seeded_config = {
    **base_config,
    'rule_seeder': [AccRS()],
    'ruleset_generator': [
        ruleset_generators[label]
        for label in ('Wɪᴛᴀɴ', 'Wɪᴛᴀɴ-Core', 'IWS-AS', 'IWS-LSE-AC')
    ],
}

# Interaction-count schedules: a full sweep, and a reduced low/high pair.
full_ic = [10, 25, 50, 100, 150, 200]
min_ic = [25, 100]

# Schedule assigned to each dataset. Only four datasets get the full sweep;
# every other dataset is run at the reduced low/high pair. The tuple below
# fixes the iteration order of the resulting dict.
dataset_to_ic = {
    name: full_ic if name in ('imdb', 'bias_pa', 'fakenews', 'binary_agnews') else min_ic
    for name in (
        'imdb',
        'bias_pa',
        'bias_pt',
        'bias_jp',
        'bias_pp',
        'amazon',
        'yelp',
        'plots',
        'fakenews',
        'binary_dbpedia',
        'binary_agnews',
        'airline_tweets',
        'damage',
        'spam',
    )
}
# For each dataset, collect the experiment configs of the unseeded grid
# followed by those of the seeded grid, using that dataset's interaction counts.
dataset_configs = {
    name: [
        config
        for grid in (base_config, seeded_config)
        for config in prepare_experiment_configs(
            **grid,
            dataset_name=[name],
            interaction_count=counts,
        )
    ]
    for name, counts in dataset_to_ic.items()
}

In [None]:
CACHE_KEY = 'binary-experiments'


def _run_all_datasets():
    """Run each dataset's experiment configs and return the concatenated summaries."""
    summaries = []
    for dataset, configs in dataset_configs.items():
        print(f'\nRunning experiments for: {dataset}')
        results = run_experiments(
            configs,
            default_workers=2,
            rule_workers=1,
            continue_on_failure=False,
        )
        summaries.append(summarise_experiments(results, workers=8))
    return pd.concat(summaries)


# Experiments are only run when no summary is cached under CACHE_KEY.
if not is_cached(CACHE_KEY):
    save_to_cache(CACHE_KEY, _run_all_datasets())

# The results are always read back from the cache, whether or not they were
# just computed.
df = load_from_cache(CACHE_KEY)

In [None]:
# Tables only report the 25 and 100 interaction counts. Work on a copy so
# that columns can be added without modifying `df`.
low_high_rows = df['interaction_count'].isin([25, 100])
table_df = df[low_high_rows].copy()

# Use friendly seeded/unseeded ruleset_generator names: this map is indexed
# by a ruleset generator to obtain its label.
rg_label_map = inverse_dict(ruleset_generators)

def method_label(row):
    """Return the display name of the method used for a results row.

    The name is the label looked up in `rg_label_map` for the row's
    `ruleset_generator`, prefixed with 'Seeded ' when the row's `rule_seeder`
    equals `AccRS()` and left unprefixed when it equals `BlankRS()`.

    Raises:
        ValueError: if the row's `rule_seeder` matches neither seeder.
    """
    seeder = row['rule_seeder']
    if seeder == BlankRS():
        prefix = ''
    elif seeder == AccRS():
        prefix = 'Seeded '
    else:
        raise ValueError('Unknown rule_seeder')
    label = rg_label_map[row['ruleset_generator']]
    return f'{prefix}{label}'

# Label every row with its (possibly seeded) method name.
table_df['method'] = table_df.apply(method_label, axis=1)

# Apply different filters to results.
# Full-supervision rows are split off first; the seeded/unseeded subsets are
# then taken from the remaining rows only.
is_full_supervision = table_df['ruleset_generator'].isin(
    [ruleset_generators['Full supervision']]
)
full_supervision_table_df = table_df[is_full_supervision]
table_df = table_df[~is_full_supervision]

unseeded_table_df = table_df[table_df['rule_seeder'] == BlankRS()]
seeded_table_df = table_df[table_df['rule_seeder'] == AccRS()]

## F1 Score Results

In [None]:
# Restrict the line plots to these four datasets.
line_plot_datasets = ['imdb', 'bias_pa', 'fakenews', 'binary_agnews']
line_plot_df = df[df['dataset_name'].isin(line_plot_datasets)]

# One facet row per dataset and one facet column per rule seeder.
fig = metric_line_grid(
    line_plot_df,
    metric='test_macro_f1',
    facet_row='dataset_name',
    facet_col='rule_seeder',
    ruleset_generators=ruleset_generators,
)

# Save the figure as SVG and also render it inline.
fig.write_image('plots/binary-f1-lines.svg')
fig.show()

### Unseeded F1 Scores

In [None]:
# Full-supervision rows are included alongside the unseeded methods.
unseeded_with_full_supervision_df = pd.concat([full_supervision_table_df, unseeded_table_df])
unseeded_f1_df = build_metric_df(
    unseeded_with_full_supervision_df,
    method='ruleset_generator',
    metric='test_macro_f1',
    labelled_methods=ruleset_generators,
)

# 'Full supervision' is passed as a rank-excluded method.
table = display_metric_table(
    unseeded_f1_df,
    rank_excluded_methods=['Full supervision'],
)
display(table)

# Also emit the table as LaTeX source.
unseeded_f1_latex = table.to_latex(multirow_align='t', convert_css=True)
print(unseeded_f1_latex)

### Seeded F1 Scores

In [None]:
# Macro-F1 on the test set for the seeded runs, per ruleset generator.
seeded_f1_df = build_metric_df(
    seeded_table_df,
    method='ruleset_generator',
    metric='test_macro_f1',
    labelled_methods=ruleset_generators,
)

table = display_metric_table(seeded_f1_df)
display(table)

# Also emit the table as LaTeX source.
seeded_f1_latex = table.to_latex(multirow_align='t', convert_css=True)
print(seeded_f1_latex)

### F1 Score Friedman Test

We compare the overall performance of unseeded and seeded methods across all datasets at low and high interaction counts using Friedman and Nemenyi post-hoc tests, presented below.

In [None]:
# Seeded and unseeded runs are compared together here, so rows are grouped by
# the 'method' column rather than by ruleset generator.
full_f1_df = build_metric_df(
    table_df,
    method='method',
    metric='test_macro_f1',
)
display_friedman_test(
    full_f1_df,
    svg_file_prefix='plots/binary-',
)

## Runtime

We compare the runtimes of rule generation methods after 25 and 100 user interactions. We highlight differences in runtime seconds compared to the baseline of `Wɪᴛᴀɴ`.

In [None]:
# Rule generation wall-clock seconds for the unseeded runs, per ruleset generator.
runtime_df = build_metric_df(
    unseeded_table_df,
    method='ruleset_generator',
    metric='rule_gen_wall_secs',
    labelled_methods=ruleset_generators,
)

# Differences are shown relative to the Wɪᴛᴀɴ baseline; for runtimes a
# smaller value is the better one.
table = display_metric_table(
    runtime_df,
    baseline_label='Wɪᴛᴀɴ',
    small_margin=10,
    big_margin=60,
    larger_is_better=False,
    formatter='{:.1f}',
)
display(table)

# Also emit the table as LaTeX source.
runtime_latex = table.to_latex(multirow_align='t', convert_css=True)
print(runtime_latex)