In [1]:
#%pip install sklearn
#%pip install torch

from math import sqrt
import os
from time import time
import random
import numpy as np
from scipy import stats
import pandas as pd
import sklearn
from sklearn.base import clone
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import display, HTML

## project structure
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory
DATA_NPZ = DATA_DIR + "data.npz"

## load files
data = np.load(DATA_NPZ)

X_2D = data['X_2D']
X_3D = data['X_3D']
y_crowd = data['y_crowd']
y_experts = data['y_experts']
y_combined = data['y_combined']

# retrieve indices of labeled samples
experts_idx = np.where(y_experts > -1)[0]
crowd_idx = np.where(y_crowd > -1)[0]

_crowd_unique_idx = np.setdiff1d(crowd_idx, experts_idx,
                                 assume_unique=True)
combined_idx = np.concatenate([_crowd_unique_idx, experts_idx])

In [2]:
def set_seed(seed=-1):
    if seed < 0:
        seed = np.random.randint(0, 2**32-1)

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    return seed
    
print(set_seed())  # make reproducable

2810509032


In [3]:
def create_splits(y, test_ratio=.5):
    train_idx = list()
    test_idx = list()
    
    strats = [np.where(y == lab)[0] for lab in np.unique(y) if lab > -1]
    for strat in strats:
        n = strat.shape[0]
        train_idx.append(strat[:int(n*(1-test_ratio))])
        test_idx.append(strat[int(n*(1-test_ratio)):])
        
    train_idx = np.concatenate(train_idx)
    test_idx = np.concatenate(test_idx)
    
    np.random.shuffle(train_idx)
    np.random.shuffle(test_idx)
    
    return (train_idx, test_idx)

def create_splits_one_hot(y):
    vec = -np.ones(y.shape[0])
    nonzero = y.nonzero()
    vec[nonzero[:,0]] = nonzero[:,1].float()
    
    return create_splits(vec)

def alpaydin_F_test(c1_acc_lst, c2_acc_lst):
    # acc_list := [np.array([acc_ij, acc_i(j+1)]) for i in 5, j in 2]
    assert len(c1_acc_lst) == len(c2_acc_lst)
    diff_acc_lst = [c1_acc_lst[i] - c2_acc_lst[i] for i in range(len(c1_acc_lst))]
    
    mean_lst = [np.mean(a) for a in diff_acc_lst] 
    var_lst = [ (diff_acc_lst[i][0] - mean_lst[i])**2
               +(diff_acc_lst[i][1] - mean_lst[i])**2 for i in range(len(diff_acc_lst))]
    
    numerator = sum([sum(a**2) for a in diff_acc_lst])
    denumerator = 2 * sum(var_lst)
    f = numerator / denumerator
    p_value = stats.f.sf(f, 10, 5)
    
    return (f, p_value, np.mean(mean_lst), np.mean(var_lst))    

In [4]:
# ensure same datasets per model
crowd_splits = [create_splits(y_crowd[crowd_idx]) for i in range(5)]
experts_splits = [create_splits(y_experts[experts_idx]) for i in range(5)]
combined_splits = [create_splits(y_combined[combined_idx]) for i in range(5)]

crowd_on_experts_splits = [create_splits(y_crowd[experts_idx]) for i in range(5)]
crowd_unique_splits = [create_splits(y_crowd[_crowd_unique_idx]) for i in range(5)]

# Majority Class

In [5]:
from collections import Counter

def majority_class(y):
    ct = Counter(y)
    return ct.most_common(1)[0][1] / len(y)

In [6]:
majority_class_acc_crowd = majority_class(y_crowd[crowd_idx])
majority_class_acc_experts = majority_class(y_experts[experts_idx])
majority_class_acc_combined = majority_class(y_combined[combined_idx])

print("\nMajority class accuracy on dominant labels (baseline)")
print(" crowd labels:  {:.4f}".format(majority_class_acc_crowd))
print(" expert labels: {:.4f}".format(majority_class_acc_experts))
print(" combined labels: {:.4f}".format(majority_class_acc_combined))


Majority class accuracy on dominant labels (baseline)
 crowd labels:  0.6355
 expert labels: 0.5345
 combined labels: 0.5741


# Random Forest (supervised)

We start with a traditional, or 'shallow', machine learning model: random forest. Because random forest does not support iterative learning, we test both the crowd and expert sets separately.

We use stratified cross validation to reduce the effects caused by the small size of the data set.

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


N_ESTIMATORS = [100, 250, 500, 750, 1000, 2000]

def random_forest(X, y, index, splits, n_estimators=N_ESTIMATORS):
    n_samples = X[index].shape[0]
    acc_est_lst = list()
    for n_estimators in N_ESTIMATORS:
        print("Training with {} estimators".format(n_estimators))
        acc_lst = list()
        for fold_i in range(5):
            print(" Starting outer fold {} / {}".format(fold_i+1, 5))
            acc_inner = list()
            split_a_idx, split_b_idx  = splits[fold_i]
            for fold_j in range(2):
                print("  Starting inner fold {} / {}".format(fold_j+1, 2), end='')
                if fold_j % 2 == 0:
                    train_fold_idx, test_fold_idx  = split_a_idx, split_b_idx
                else:
                    train_fold_idx, test_fold_idx  = split_b_idx, split_a_idx

                train_idx = index[train_fold_idx]
                test_idx = index[test_fold_idx]

                model = RandomForestClassifier(n_estimators=n_estimators)
                model.fit(X[train_idx], y[train_idx])

                y_pred = model.predict(X[test_idx])
                fold_acc = accuracy_score(y[test_idx], y_pred)

                acc_inner.append(fold_acc)
                print(" (acc: {:.4f})".format(fold_acc))

            acc_lst.append(np.array(acc_inner))
        print(" => mean acc: {:.4f}\n".format(np.mean(np.array([np.mean(inner) for inner in acc_lst]))))
        acc_est_lst.append(acc_lst)
    
    return acc_est_lst

In [8]:
print("=== Results of supervised learning on expert dominant labels ===")
random_forest_acc_experts_dominant = random_forest(X_2D,
                                                   y_experts, 
                                                   experts_idx,
                                                   experts_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_experts_dominant)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_experts_dominant[i],
                                               random_forest_acc_experts_dominant[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on expert dominant labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.5333)
  Starting inner fold 2 / 2 (acc: 0.6429)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.6000)
  Starting inner fold 2 / 2 (acc: 0.6071)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.4000)
  Starting inner fold 2 / 2 (acc: 0.5000)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.6000)
  Starting inner fold 2 / 2 (acc: 0.5357)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.6000)
  Starting inner fold 2 / 2 (acc: 0.6429)
 => mean acc: 0.5662

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.5333)
  Starting inner fold 2 / 2 (acc: 0.6429)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.6071)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.5333)
  Startin

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.271797,,,,,
2,500,0.090884,0.486303,,,,
3,750,0.052031,0.49241,0.415552,,,
4,1000,0.020404,0.414563,0.534881,0.234685,,
5,2000,0.043735,0.663432,0.291258,0.075373,0.534881,


In [9]:
print("=== Results of supervised learning on crowd dominant labels ===")
random_forest_acc_crowd_dominant = random_forest(X_2D,
                                                 y_crowd,
                                                 crowd_idx,
                                                 crowd_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_crowd_dominant)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_crowd_dominant[i],
                                               random_forest_acc_crowd_dominant[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on crowd dominant labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.6111)
  Starting inner fold 2 / 2 (acc: 0.5283)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.5741)
  Starting inner fold 2 / 2 (acc: 0.5849)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.6111)
  Starting inner fold 2 / 2 (acc: 0.5849)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.6296)
  Starting inner fold 2 / 2 (acc: 0.6038)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.6481)
  Starting inner fold 2 / 2 (acc: 0.5472)
 => mean acc: 0.5923

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.6481)
  Starting inner fold 2 / 2 (acc: 0.5094)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.6111)
  Starting inner fold 2 / 2 (acc: 0.5472)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.5926)
  Starting

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.48025,,,,,
2,500,0.431662,0.676063,,,,
3,750,0.44098,0.63372,0.534881,,,
4,1000,0.607047,0.760611,0.782516,0.726688,,
5,2000,0.656283,0.822993,0.6978,0.610524,0.695393,


In [10]:
print("=== Results of supervised learning on combined dominant labels ===")
random_forest_acc_combined_dominant = random_forest(X_2D,
                                                    y_combined,
                                                    combined_idx,
                                                    combined_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_combined_dominant)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_combined_dominant[i],
                                               random_forest_acc_combined_dominant[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on combined dominant labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.5926)
  Starting inner fold 2 / 2 (acc: 0.5926)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.5741)
  Starting inner fold 2 / 2 (acc: 0.5370)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.6481)
  Starting inner fold 2 / 2 (acc: 0.4815)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.5556)
  Starting inner fold 2 / 2 (acc: 0.4630)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.5556)
  Starting inner fold 2 / 2 (acc: 0.5370)
 => mean acc: 0.5537

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.6852)
  Starting inner fold 2 / 2 (acc: 0.6296)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.6296)
  Starting inner fold 2 / 2 (acc: 0.5000)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.5926)
  Start

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.563585,,,,,
2,500,0.385846,0.052149,,,,
3,750,0.713696,0.52526,0.648012,,,
4,1000,0.649448,0.318082,0.246339,0.718145,,
5,2000,0.660143,0.451359,0.775549,0.668008,0.676942,


# SGD

In [11]:
from sklearn.linear_model import PassiveAggressiveClassifier

N_FOLDS = 5
def pac(X, y, index):
    n_samples = X[index].shape[0]
    acc = 0.0
    best_score = -1
    best_model = None
    for fold_i in range(N_FOLDS):
        print(" Starting fold {} / {}".format(fold_i+1, N_FOLDS), end='')
        train_fold_idx, test_fold_idx  = create_splits(y[index], test_ratio=0.2)
        train_idx = index[train_fold_idx]
        test_idx = index[test_fold_idx]
        
        classes = np.unique(y)
        model = PassiveAggressiveClassifier(max_iter=2000, warm_start=False)
        model.partial_fit(X[train_idx], y[train_idx], classes)
        
        y_pred = model.predict(X[test_idx])
        fold_acc = accuracy_score(y[test_idx], y_pred)
        
        if best_score < 0 or best_score < (fold_acc - 0.02):
            best_score = fold_acc
            best_model = model

        acc += fold_acc
        print(" (acc: {:.4f})".format(fold_acc))

    acc /= N_FOLDS
    print("Mean accuracy on test set: {:.4f}\n".format(acc))
    
    return best_model

def pac_test(X, y, index, splits, model_pretrained=None):
    n_samples = X[index].shape[0]
    acc_lst = list()
    for fold_i in range(5):
        print(" Starting outer fold {} / {}".format(fold_i+1, 5), end='')
        acc_inner = list()
        split_a_idx, split_b_idx  = splits[fold_i]
        for fold_j in range(2):
            print("  Starting inner fold {} / {}".format(fold_j+1, 2), end='')
            if fold_j % 2 == 0:
                train_fold_idx, test_fold_idx  = split_a_idx, split_b_idx
            else:
                train_fold_idx, test_fold_idx  = split_b_idx, split_a_idx

            train_idx = index[train_fold_idx]
            test_idx = index[test_fold_idx]
            
            classes = np.unique(y)
            if model_pretrained is None:
                model = PassiveAggressiveClassifier(max_iter=2000, warm_start=False)
            else:
                model = clone(model_pretrained)
            model.partial_fit(X[train_idx], y[train_idx], classes)
 
            y_pred = model.predict(X[test_idx])
            fold_acc = accuracy_score(y[test_idx], y_pred)

            acc_inner.append(fold_acc)
            print(" (acc: {:.4f})".format(fold_acc))
            
        acc_lst.append(np.array(acc_inner))
    print(" => mean acc: {:.4f}\n".format(np.mean(np.array([np.mean(inner) for inner in acc_lst]))))
    
    return acc_lst

In [12]:
print("=== Results of supervised learning on expert dominant labels ===")
model = pac(X_2D,
            y_experts, 
            experts_idx)
pac_acc_experts_dominant = pac_test(X_2D,
                                  y_experts, 
                                  experts_idx,
                                  experts_splits,
                                  model)

print("=== Results of supervised learning on crowd dominant labels ===")
model = pac(X_2D,
            y_crowd, 
            crowd_idx)
pac_acc_crowd_dominant = pac_test(X_2D,
                                y_crowd,
                                crowd_idx,
                                crowd_splits,
                                model)

print("=== Results of supervised learning of crowd labels on expert subset ===")
model = pac(X_2D,
            y_crowd, 
            experts_idx)
pac_acc_crowd_expert_dominant = pac_test(X_2D,
                                y_crowd,
                                experts_idx,
                                crowd_on_experts_splits,
                                model)

print("=== Results of supervised learning on crowd unique dominant labels ===")
model = pac(X_2D,
            y_crowd, 
            _crowd_unique_idx)
pac_acc_crowd_unique_dominant = pac_test(X_2D,
                                y_crowd,
                                _crowd_unique_idx,
                                crowd_unique_splits,
                                model)

print("=== Results of supervised learning on combined dominant labels ===")
model = pac(X_2D,
            y_combined, 
            combined_idx)
pac_acc_combined_dominant = pac_test(X_2D,
                                   y_combined,
                                   combined_idx,
                                   combined_splits,
                                   model)

pac_acc = [pac_acc_experts_dominant, pac_acc_crowd_dominant, pac_acc_crowd_expert_dominant, pac_acc_crowd_unique_dominant, pac_acc_combined_dominant]
labels = ['pac_acc_experts_dominant', 'pac_acc_crowd_dominant', 'pac_acc_crowd_expert_dominant', 'pac_acc_crowd_unique_dominant', 'pac_acc_combined_dominant']
print("= p-values =")
table = {'p-values': labels}
table.update({lab: list() for lab in labels})
nlabels = len(labels)
for i in range(nlabels):
    for e in range(i+1):
        table[labels[i]].append(np.nan)
    for j in range(i+1, nlabels):
        f, p, mean, variance = alpaydin_F_test(pac_acc[i],
                                               pac_acc[j])
        table[labels[i]].append(p)
        #print("RF {} vs {} estimators".format(pac_acc[i], pac_acc[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on expert dominant labels ===
 Starting fold 1 / 5 (acc: 0.4615)
 Starting fold 2 / 5 (acc: 0.3846)
 Starting fold 3 / 5 (acc: 0.6154)
 Starting fold 4 / 5 (acc: 0.7692)
 Starting fold 5 / 5 (acc: 0.6154)
Mean accuracy on test set: 0.5692

 Starting outer fold 1 / 5  Starting inner fold 1 / 2 (acc: 0.6333)
  Starting inner fold 2 / 2 (acc: 0.5714)
 Starting outer fold 2 / 5  Starting inner fold 1 / 2 (acc: 0.5667)
  Starting inner fold 2 / 2 (acc: 0.5357)
 Starting outer fold 3 / 5  Starting inner fold 1 / 2 (acc: 0.4667)
  Starting inner fold 2 / 2 (acc: 0.6071)
 Starting outer fold 4 / 5  Starting inner fold 1 / 2 (acc: 0.5333)
  Starting inner fold 2 / 2 (acc: 0.6071)
 Starting outer fold 5 / 5  Starting inner fold 1 / 2 (acc: 0.6000)
  Starting inner fold 2 / 2 (acc: 0.4643)
 => mean acc: 0.5586

=== Results of supervised learning on crowd dominant labels ===
 Starting fold 1 / 5 (acc: 0.4545)
 Starting fold 2 / 5 (acc: 0.5455)
 Starting fold 3 / 

Unnamed: 0,p-values,pac_acc_experts_dominant,pac_acc_crowd_dominant,pac_acc_crowd_expert_dominant,pac_acc_crowd_unique_dominant,pac_acc_combined_dominant
0,pac_acc_experts_dominant,,,,,
1,pac_acc_crowd_dominant,0.483699,,,,
2,pac_acc_crowd_expert_dominant,0.576215,0.568319,,,
3,pac_acc_crowd_unique_dominant,0.455372,0.675712,0.491883,,
4,pac_acc_combined_dominant,0.708626,0.545506,0.692186,0.361084,


# Incremental Learning

In [13]:
print("=== Pretrain expert labels on expert subset and further train crowd labels on crowd subset ===")
model = pac(X_2D,
                 y_experts, 
                 experts_idx)
pacv_acc_experts_dominant = pac_test(X_2D,
                                  y_crowd, 
                                  crowd_idx,
                                  crowd_splits,
                                  model)

print("=== Pretrain expert labels on expert subset and further train crowd labels on expert subset ===")
model = pac(X_2D,
                 y_experts, 
                 experts_idx)
pacv_acc_crowd_dominant = pac_test(X_2D,
                                y_crowd,
                                experts_idx,
                                crowd_on_experts_splits,
                                model)

print("=== Pretrain expert labels on expert subset and further train crowd labels on unique subset ===")
model = pac(X_2D,
                 y_experts, 
                 experts_idx)
pacv_acc_crowd_unique_dominant = pac_test(X_2D,
                                y_crowd,
                                _crowd_unique_idx,
                                crowd_unique_splits,
                                model)

print("=== Pretrain expert labels on expert subset and further train combined labels on all ===")
model = pac(X_2D,
                 y_experts, 
                 experts_idx)
pacv_acc_combined_dominant = pac_test(X_2D,
                                   y_combined,
                                   combined_idx,
                                   combined_splits,
                                   model)

pac_acc = [pacv_acc_experts_dominant, pacv_acc_crowd_dominant, pacv_acc_crowd_unique_dominant, pacv_acc_combined_dominant]
labels = ['pacv_acc_experts_dominant', 'pacv_acc_crowd_dominant', 'pacv_acc_crowd_unique_dominant', 'pacv_acc_combined_dominant']

print("= p-values =")
table = {'p-values': labels}
table.update({lab: list() for lab in labels})
nlabels = len(labels)
for i in range(nlabels):
    for e in range(i+1):
        table[labels[i]].append(np.nan)
    for j in range(i+1, nlabels):
        f, p, mean, variance = alpaydin_F_test(pac_acc[i],
                                               pac_acc[j])
        table[labels[i]].append(p)
        #print("RF {} vs {} estimators".format(pac_acc[i], pac_acc[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Pretrain expert labels on expert subset and further train crowd labels on crowd subset ===
 Starting fold 1 / 5 (acc: 0.6154)
 Starting fold 2 / 5 (acc: 0.7692)
 Starting fold 3 / 5 (acc: 0.5385)
 Starting fold 4 / 5 (acc: 0.6154)
 Starting fold 5 / 5 (acc: 0.6154)
Mean accuracy on test set: 0.6308

 Starting outer fold 1 / 5  Starting inner fold 1 / 2 (acc: 0.5926)
  Starting inner fold 2 / 2 (acc: 0.5472)
 Starting outer fold 2 / 5  Starting inner fold 1 / 2 (acc: 0.5185)
  Starting inner fold 2 / 2 (acc: 0.4906)
 Starting outer fold 3 / 5  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.4717)
 Starting outer fold 4 / 5  Starting inner fold 1 / 2 (acc: 0.5556)
  Starting inner fold 2 / 2 (acc: 0.5094)
 Starting outer fold 5 / 5  Starting inner fold 1 / 2 (acc: 0.5185)
  Starting inner fold 2 / 2 (acc: 0.5660)
 => mean acc: 0.5270

=== Pretrain expert labels on expert subset and further train crowd labels on expert subset ===
 Starting fold 1 / 5 (acc: 

Unnamed: 0,p-values,pacv_acc_experts_dominant,pacv_acc_crowd_dominant,pacv_acc_crowd_unique_dominant,pacv_acc_combined_dominant
0,pacv_acc_experts_dominant,,,,
1,pacv_acc_crowd_dominant,0.559912,,,
2,pacv_acc_crowd_unique_dominant,0.059888,0.223629,,
3,pacv_acc_combined_dominant,0.233069,0.512902,0.013538,


# PyTorch Preparations

In [14]:
## convert numpy arrays to PyTorch tensors
X_2D = torch.from_numpy(X_2D)
X_3D = torch.from_numpy(X_3D)

y_crowd = torch.from_numpy(y_crowd)
y_experts = torch.from_numpy(y_experts)
y_combined = torch.from_numpy(y_combined)

In [15]:
def categorical_accuracy(y_hat, y):
    # y := 1D array of class labels
    # y_hat := 2D array of one-hot class labels
    _, labels = y_hat.max(dim=1)
    return torch.mean(torch.eq(labels, y).float())

def fit(model, X, y, index, lr=0.01, l2norm=0.01, n_folds=10, n_epoch=250, patience=-1, state=None, finetune=False):
    n_samples = X[index].shape[0]

    loss = 0
    acc = 0
    best_state = None
    best_state_opt = None
    best_score = -1
    for fold_i in range(n_folds):
        print("Starting fold {} / {}".format(fold_i+1, n_folds), end='')
        if state is None:
            model.init()
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
        else:
            model.load_state_dict(state[0])
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
            optimizer.load_state_dict(state[1])
            if finetune:
                for layer in model.layers[:-1]:
                    layer.requires_grad = False
            
        criterion = nn.CrossEntropyLoss()
        
        # early stopping
        patience_left = patience
        best_fold_score = -1
        delta = 1e-4
        best_fold_state = None
        best_fold_state_opt = None
        
        train_fold_idx, test_fold_idx  = create_splits(y[index])
        train_idx = index[train_fold_idx]
        test_idx = index[test_fold_idx]
        for epoch in range(n_epoch):
            model.train()
            
            y_hat = model(X[train_idx].float())
            train_acc = categorical_accuracy(y_hat, y[train_idx])
            train_loss = criterion(y_hat, y[train_idx].long())
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            
            model.eval()
            test_loss = None
            with torch.no_grad():
                y_hat = model(X[test_idx].float())
                test_acc = categorical_accuracy(y_hat, y[test_idx])
                test_loss = criterion(y_hat, y[test_idx].long())
                
            train_loss = float(train_loss.item())
            test_loss = float(test_loss.item())

            if best_fold_score < 0:
                best_fold_score = test_loss
                best_fold_state = model.state_dict()
                best_fold_state_opt = optimizer.state_dict()
                            
            if patience <= 0:
                continue
            if test_loss >= best_fold_score - delta:
                patience_left -= 1
            else:
                best_fold_score = test_loss
                best_fold_state = model.state_dict()
                best_fold_state_opt = optimizer.state_dict()
                patience_left = patience
            if patience_left <= 0:
                model.load_state_dict(best_fold_state)
                optimizer.load_state_dict(best_fold_state_opt)
                break
                
        test_idx = index[create_splits(y[index])[1]]  # get new random test set to validate on
        with torch.no_grad():
            y_hat = model(X[test_idx].float())
            test_acc = categorical_accuracy(y_hat, y[test_idx])
            test_loss = float(criterion(y_hat, y[test_idx].long()).item())
        
        loss += test_loss
        acc += test_acc
        if best_score < 0 or best_score > test_loss:
            best_state = best_fold_state
            best_state_opt = best_fold_state_opt
            best_score = test_loss
        print(" - training accuracy: {:.4f} / loss: {:.4f} - test accuracy: {:.4f} / loss: {:.4f}".format(train_acc,
                                                                                          train_loss,
                                                                                          test_acc,
                                                                                          test_loss))
        
    loss /= n_folds
    acc /= n_folds
    print("average loss on test set: {:.4f}".format(loss))
    print("average accuracy on test set: {:.4f}".format(acc))
    
    return (best_state, best_state_opt)

def fit_test(model, X, y, index, splits, lr=0.01, l2norm=0.01, n_epoch=250, patience=-1, state=None, finetune=False):
    n_samples = X[index].shape[0]
    acc_lst = list()
    for fold_i in range(5):
        print("Starting outer fold {} / {}".format(fold_i+1, 5))
        acc_inner = list()
        split_a_idx, split_b_idx  = splits[fold_i]

        for fold_j in range(2):
            print(" Starting inner fold {} / {}".format(fold_j+1, 2), end='')
            if fold_j % 2 == 0:
                train_fold_idx, test_fold_idx  = split_a_idx, split_b_idx
            else:
                train_fold_idx, test_fold_idx  = split_b_idx, split_a_idx
                
            train_idx = index[train_fold_idx]
            test_idx = index[test_fold_idx]
        
            if state is None:
                model.init()
                optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
            else:
                model.load_state_dict(state[0])
                optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
                optimizer.load_state_dict(state[1])
                if finetune:
                    for layer in model.layers[:-1]:
                        layer.requires_grad = False

            criterion = nn.CrossEntropyLoss()

            # early stopping
            patience_left = patience
            best_fold_score = -1
            delta = 1e-4
            best_fold_state = None
            best_fold_state_opt = None
        
            for epoch in range(n_epoch):
                model.train()

                y_hat = model(X[train_idx].float())
                train_acc = categorical_accuracy(y_hat, y[train_idx])
                train_loss = criterion(y_hat, y[train_idx].long())
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                model.eval()
                test_loss = None
                with torch.no_grad():
                    y_hat = model(X[test_idx].float())
                    test_acc = categorical_accuracy(y_hat, y[test_idx])
                    test_loss = criterion(y_hat, y[test_idx].long())

                train_loss = float(train_loss.item())
                test_loss = float(test_loss.item())
                
                if best_fold_score < 0:
                    best_fold_score = test_loss
                    best_fold_state = model.state_dict()
                    best_fold_state_opt = optimizer.state_dict()

                if patience <= 0:
                    continue
                if test_loss >= best_fold_score - delta:
                    patience_left -= 1
                else:
                    best_fold_score = test_loss
                    best_fold_state = model.state_dict()
                    best_fold_state_opt = optimizer.state_dict()
                    patience_left = patience
                if patience_left <= 0:
                    model.load_state_dict(best_fold_state)
                    optimizer.load_state_dict(best_fold_state_opt)
                    break
            
            # do a final run over the test set after loading a previous state
            with torch.no_grad():
                y_hat = model(X[test_idx].float())
                test_acc = categorical_accuracy(y_hat, y[test_idx])
                test_loss = criterion(y_hat, y[test_idx].long())
            
            test_loss = float(test_loss.item())
            print(" (acc: {:.4f})".format(test_acc))
            acc_inner.append(test_acc)
    
        acc_lst.append(np.array(acc_inner))
        
    print(" => mean acc: {:.4f}\n".format(np.mean(np.array([np.mean(inner) for inner in acc_lst]))))

    return acc_lst

# Neural Network

In [16]:
class ClassifierNN(nn.Module):
    """Simple Neural Network Classifier"""

    def __init__(self, input_dim, output_dim, p_dropout=0.05):
        super().__init__()
        hidden_dim = (input_dim-output_dim)//2
        
        self.layers = nn.ModuleList()
        self.layers.append(nn.Sequential(
                            nn.Linear(input_dim, hidden_dim),
                            nn.ReLU(inplace=True),
                            nn.Dropout(p=p_dropout)))
            
        self.layers.append(nn.Sequential(
                            nn.Linear(hidden_dim, output_dim),
                            nn.ReLU(inplace=True)))
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, X):
        for layer in self.layers:
            X = layer(X)                          
                           
        return self.softmax(X)
        
    def init(self):
        for param in self.parameters():
            nn.init.normal_(param)

In [17]:
## hyperparameters
lr = 0.01
n_epoch = 250
p_dropout = 0.1

## define model
indim = X_2D.shape[1]
outdim = np.unique(y_experts[experts_idx]).shape[0]
assert outdim == np.unique(y_crowd[crowd_idx]).shape[0]

model = ClassifierNN(input_dim=indim,
                     output_dim=outdim,
                     p_dropout=p_dropout)

print("=== Results on expert dominant labels ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
neural_net_acc_dominant_experts = fit_test(model, X_2D, y_experts, experts_idx, experts_splits, lr=lr, n_epoch=n_epoch, state=state)

print("\n=== Results on crowd dominant labels ===")
state = fit(model, X_2D, y_crowd, crowd_idx, lr=lr, n_epoch=n_epoch)
neural_net_acc_dominant_crowd = fit_test(model, X_2D, y_crowd, crowd_idx, crowd_splits, lr=lr, n_epoch=n_epoch, state=state)

print("\n=== Results on combined dominant labels ===")
state = fit(model, X_2D, y_combined, combined_idx, lr=lr, n_epoch=n_epoch)
neural_net_acc_dominant_combined = fit_test(model, X_2D, y_combined, combined_idx, combined_splits, lr=lr, n_epoch=n_epoch, state=state)

=== Results on expert dominant labels ===
Starting fold 1 / 10 - training accuracy: 0.7143 / loss: 0.5112 - test accuracy: 0.6333 / loss: 0.6660
Starting fold 2 / 10 - training accuracy: 0.9286 / loss: 0.3849 - test accuracy: 0.4667 / loss: 0.7976
Starting fold 3 / 10 - training accuracy: 0.9286 / loss: 0.3694 - test accuracy: 0.5333 / loss: 0.7229
Starting fold 4 / 10 - training accuracy: 0.9643 / loss: 0.3463 - test accuracy: 0.5667 / loss: 0.7055
Starting fold 5 / 10 - training accuracy: 1.0000 / loss: 0.3406 - test accuracy: 0.5333 / loss: 0.7357
Starting fold 6 / 10 - training accuracy: 0.8571 / loss: 0.4617 - test accuracy: 0.5000 / loss: 0.7685
Starting fold 7 / 10 - training accuracy: 0.9643 / loss: 0.3770 - test accuracy: 0.7000 / loss: 0.6450
Starting fold 8 / 10 - training accuracy: 1.0000 / loss: 0.3256 - test accuracy: 0.5667 / loss: 0.7209
Starting fold 9 / 10 - training accuracy: 0.9286 / loss: 0.3627 - test accuracy: 0.5667 / loss: 0.7257
Starting fold 10 / 10 - trainin

In [18]:
nn_acc = [neural_net_acc_dominant_experts, neural_net_acc_dominant_crowd, neural_net_acc_dominant_combined]
labels = ['neural_net_acc_dominant_experts', 'neural_net_acc_dominant_crowd', 'neural_net_acc_dominant_combined']

print("= p-values =")
table = {'p-values': labels}
table.update({lab: list() for lab in labels})
nlabels = len(labels)
for i in range(nlabels):
    for e in range(i+1):
        table[labels[i]].append(np.nan)
    for j in range(i+1, nlabels):
        f, p, mean, variance = alpaydin_F_test(nn_acc[i],
                                               nn_acc[j])
        table[labels[i]].append(p)
        #print("RF {} vs {} estimators".format(nn_acc[i], nn_acc[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

= p-values =


Unnamed: 0,p-values,neural_net_acc_dominant_experts,neural_net_acc_dominant_crowd,neural_net_acc_dominant_combined
0,neural_net_acc_dominant_experts,,,
1,neural_net_acc_dominant_crowd,0.681507,,
2,neural_net_acc_dominant_combined,0.738612,0.134837,


# Incremental Learning

In [None]:
## hyperparameters
lr = 0.01
n_epoch = 250
p_dropout = 0.1

## define model
indim = X_2D.shape[1]
outdim = np.unique(y_experts[experts_idx]).shape[0]
assert outdim == np.unique(y_crowd[crowd_idx]).shape[0]

model = ClassifierNN(input_dim=indim,
                     output_dim=outdim,
                     p_dropout=p_dropout)

print("=== Pretrain on expert dominant labels; tune on crowd labels on crowd subset ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_crowd, crowd_idx, lr=lr, n_epoch=n_epoch, state=state)
vneural_net_acc_dominant_experts_crowd = fit_test(model, X_2D, y_crowd, crowd_idx, crowd_splits, lr=lr, n_epoch=n_epoch, state=state)

print("\n=== Pretrain on expert dominant labels; tune on crowd labels on crowd subset; finetune ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_crowd, crowd_idx, lr=lr, n_epoch=n_epoch, state=state, finetune=True)
vneural_net_acc_dominant_experts_crowd_ft = fit_test(model, X_2D, y_crowd, crowd_idx, crowd_splits, lr=lr, n_epoch=n_epoch, state=state, finetune=True)


print("\n\n=== Pretrain on expert dominant labels; tune on crowd labels on crowd unique subset ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_crowd, crowd_idx, lr=lr, n_epoch=n_epoch, state=state)
vneural_net_acc_dominant_experts_crowdu = fit_test(model, X_2D, y_crowd, crowd_idx, crowd_unique_splits, lr=lr, n_epoch=n_epoch, state=state)

print("\n=== Pretrain on expert dominant labels; tune on crowd labels on crowd unique subset; finetune ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_crowd, crowd_idx, lr=lr, n_epoch=n_epoch, state=state, finetune=True)
vneural_net_acc_dominant_experts_crowdu_ft = fit_test(model, X_2D, y_crowd, crowd_idx, crowd_unique_splits, lr=lr, n_epoch=n_epoch, state=state, finetune=True)


print("\n\n=== Pretrain on expert dominant labels; tune on crowd labels on expert subset ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_crowd, experts_idx, lr=lr, n_epoch=n_epoch, state=state)
vneural_net_acc_dominant_experts_crowdexp = fit_test(model, X_2D, y_crowd, experts_idx, crowd_on_experts_splits, lr=lr, n_epoch=n_epoch, state=state)

print("\n=== Pretrain on expert dominant labels; tune on crowd labels on expert subset; finetune ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_crowd, experts_idx, lr=lr, n_epoch=n_epoch, state=state, finetune=True)
vneural_net_acc_dominant_experts_crowdexp_ft = fit_test(model, X_2D, y_crowd, experts_idx, crowd_on_experts_splits, lr=lr, n_epoch=n_epoch, state=state, finetune=True)


print("\n\n=== Pretrain on expert dominant labels; tune on combined labels on all ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_combined, combined_idx, lr=lr, n_epoch=n_epoch, state=state)
vneural_net_acc_dominant_experts_comb = fit_test(model, X_2D, y_combined, combined_idx, combined_splits, lr=lr, n_epoch=n_epoch, state=state)

print("\n=== Pretrain on expert dominant labels; tune on combined labels on all; finetune ===")
state = fit(model, X_2D, y_experts, experts_idx, lr=lr, n_epoch=n_epoch)
state = fit(model, X_2D, y_combined, combined_idx, lr=lr, n_epoch=n_epoch, state=state, finetune=True)
vneural_net_acc_dominant_experts_comb = fit_test(model, X_2D, y_combined, combined_idx, combined_splits, lr=lr, n_epoch=n_epoch, state=state, finetune=True)



=== Pretrain on expert dominant labels; tune on crowd labels on crowd subset ===
Starting fold 1 / 10 - training accuracy: 0.9286 / loss: 0.3653 - test accuracy: 0.6333 / loss: 0.6992
Starting fold 2 / 10 - training accuracy: 0.8571 / loss: 0.4039 - test accuracy: 0.5333 / loss: 0.6851
Starting fold 3 / 10 - training accuracy: 0.9286 / loss: 0.4025 - test accuracy: 0.4667 / loss: 0.7599
Starting fold 4 / 10 - training accuracy: 0.9643 / loss: 0.3605 - test accuracy: 0.5667 / loss: 0.6913
Starting fold 5 / 10 - training accuracy: 0.9643 / loss: 0.3588 - test accuracy: 0.6000 / loss: 0.6727
Starting fold 6 / 10 - training accuracy: 1.0000 / loss: 0.3267 - test accuracy: 0.5333 / loss: 0.7595
Starting fold 7 / 10 - training accuracy: 0.8214 / loss: 0.4798 - test accuracy: 0.6333 / loss: 0.6427
Starting fold 8 / 10 - training accuracy: 0.9643 / loss: 0.3631 - test accuracy: 0.6333 / loss: 0.6786
Starting fold 9 / 10 - training accuracy: 0.8929 / loss: 0.4510 - test accuracy: 0.6000 / loss:

 Starting inner fold 2 / 2 (acc: 0.5417)
Starting outer fold 3 / 5
 Starting inner fold 1 / 2 (acc: 0.5769)
 Starting inner fold 2 / 2 (acc: 0.5417)
Starting outer fold 4 / 5
 Starting inner fold 1 / 2 (acc: 0.5769)
 Starting inner fold 2 / 2 (acc: 0.5833)
Starting outer fold 5 / 5
 Starting inner fold 1 / 2 (acc: 0.5769)
 Starting inner fold 2 / 2 (acc: 0.5417)
 => mean acc: 0.5715


=== Pretrain on expert dominant labels; tune on crowd labels on crowd unique subset; finetune ===
Starting fold 1 / 10 - training accuracy: 0.9643 / loss: 0.3638 - test accuracy: 0.5667 / loss: 0.6958
Starting fold 2 / 10 - training accuracy: 0.8571 / loss: 0.4712 - test accuracy: 0.4667 / loss: 0.8064
Starting fold 3 / 10 - training accuracy: 0.9643 / loss: 0.3578 - test accuracy: 0.4667 / loss: 0.7041
Starting fold 4 / 10 - training accuracy: 0.9286 / loss: 0.3723 - test accuracy: 0.5000 / loss: 0.7506
Starting fold 5 / 10 - training accuracy: 0.9286 / loss: 0.3844 - test accuracy: 0.6333 / loss: 0.6860

In [None]:
rf_acc = [random_forest_acc_experts_dominant, random_forest_acc_crowd_dominant, random_forest_acc_combined_dominant]

nhypotheses = len(N_ESTIMATORS)
for k in range(nhypotheses):
    table = {'p-values - {} estimators'.format(N_ESTIMATORS[k]): labels}
    table.update({lab: list() for lab in labels})

    nlabels = len(labels)
    for i in range(nlabels):
        for e in range(nlabels):
            if e != i:
                table[labels[i]].append(np.nan)
            else:
                f, p, mean, variance = alpaydin_F_test(nn_acc[i],
                                                       rf_acc[j][k])
                table[labels[i]].append(p)
                #print("RF {} vs {} estimators".format(nn_acc[i], nn_acc[j]))
                #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

    significance = pd.DataFrame(table)
    display(significance)