In [1]:
#%pip install sklearn
#%pip install torch

from math import sqrt
import os
from time import time
import random
import numpy as np
from scipy import stats
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import display, HTML

## project structure
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory
DATA_NPZ = DATA_DIR + "data.npz"

## load files
data = np.load(DATA_NPZ)

X_2D = data['X_2D']
X_3D = data['X_3D']
y_likert_crowd = data['y_likert_crowd']
y_likert_experts = data['y_likert_experts']
y_dominant_crowd = data['y_dominant_crowd']
y_dominant_experts = data['y_dominant_experts']
y_likert_combined = data['y_likert_combined']
y_dominant_combined = data['y_dominant_combined']


# likert
likert_expert_idx = np.where(y_likert_experts > -1)[0]
likert_crowd_idx = np.where(y_likert_crowd > -1)[0]

_likert_crowd_unique_idx = np.setdiff1d(likert_crowd_idx,
                                likert_expert_idx,
                                assume_unique=True)
likert_combined_idx = np.concatenate([_likert_crowd_unique_idx,  # favour expert labels
                                      likert_expert_idx])

# dominant
dominant_expert_idx = np.where(y_dominant_experts > -1)[0]
dominant_crowd_idx = np.where(y_dominant_crowd > -1)[0]

_dominant_crowd_unique_idx = np.setdiff1d(dominant_crowd_idx,
                                  dominant_expert_idx,
                                  assume_unique=True)
dominant_combined_idx = np.concatenate([_dominant_crowd_unique_idx,
                                        dominant_expert_idx])

In [2]:
# add signal to input data which tells the model from which set the labels are
def add_bias_2D(args):
    b = np.zeros((X_2D.shape[0], 1))
    for idx, label in args:
        b[idx] = label  # label = -1.0 | 1.0

    return np.hstack([b, X_2D])

def add_bias_3D(args):
    b = np.zeros((X_3D.shape[0], 1, X_3D.shape[2]))
    for idx, label in args:
        b[idx, 0] = label  # label = -1.0 | 1.0
    
    return np.hstack([b.T, X_3D.T]).T

X_2D_likert_expert_bias = add_bias_2D([(likert_expert_idx, 1.0)])
X_2D_likert_crowd_bias = add_bias_2D([(likert_crowd_idx, -1.0)])
X_2D_dominant_expert_bias = add_bias_2D([(dominant_expert_idx, 1.0)])
X_2D_dominant_crowd_bias = add_bias_2D([(dominant_crowd_idx, -1.0)])
X_2D_likert_combined_bias = add_bias_2D([(likert_combined_idx, 1.0), (_likert_crowd_unique_idx, -1.0)])
X_2D_dominant_combined_bias = add_bias_2D([(dominant_combined_idx, 1.0), (_dominant_crowd_unique_idx, -1.0)])


X_3D_likert_expert_bias = add_bias_3D([(likert_expert_idx, 1.0)])
X_3D_likert_crowd_bias = add_bias_3D([(likert_crowd_idx, -1.0)])
X_3D_dominant_expert_bias = add_bias_3D([(dominant_expert_idx, 1.0)])
X_3D_dominant_crowd_bias = add_bias_3D([(dominant_crowd_idx, -1.0)])
X_3D_likert_combined_bias = add_bias_3D([(likert_combined_idx, 1.0), (_likert_crowd_unique_idx, -1.0)])
X_3D_dominant_combined_bias = add_bias_3D([(dominant_combined_idx, 1.0), (_dominant_crowd_unique_idx, -1.0)])

In [3]:
def set_seed(seed=-1):
    if seed < 0:
        seed = np.random.randint(0, 2**32-1)

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    return seed
    
print(set_seed())  # make reproducable

4187098763


In [4]:
def create_splits(y, test_ratio=.5):
    train_idx = list()
    test_idx = list()
    
    strats = [np.where(y == lab)[0] for lab in np.unique(y) if lab > -1]
    for strat in strats:
        n = strat.shape[0]
        train_idx.append(strat[:int(n*(1-test_ratio))])
        test_idx.append(strat[int(n*(1-test_ratio)):])
        
    train_idx = np.concatenate(train_idx)
    test_idx = np.concatenate(test_idx)
    
    np.random.shuffle(train_idx)
    np.random.shuffle(test_idx)
    
    return (train_idx, test_idx)

def create_splits_one_hot(y):
    vec = -np.ones(y.shape[0])
    nonzero = y.nonzero()
    vec[nonzero[:,0]] = nonzero[:,1].float()
    
    return create_splits(vec)

def alpaydin_F_test(c1_acc_lst, c2_acc_lst):
    # acc_list := [np.array([acc_ij, acc_i(j+1)]) for i in 5, j in 2]
    assert len(c1_acc_lst) == len(c2_acc_lst)
    diff_acc_lst = [c1_acc_lst[i] - c2_acc_lst[i] for i in range(len(c1_acc_lst))]
    
    mean_lst = [np.mean(a) for a in diff_acc_lst] 
    var_lst = [ (diff_acc_lst[i][0] - mean_lst[i])**2
               +(diff_acc_lst[i][1] - mean_lst[i])**2 for i in range(len(diff_acc_lst))]
    
    numerator = sum([sum(a**2) for a in diff_acc_lst])
    denumerator = 2 * sum(var_lst)
    f = numerator / denumerator
    p_value = stats.f.sf(f, 10, 5)
    
    return (f, p_value, np.mean(mean_lst), np.mean(var_lst))    

In [5]:
# ensure same datasets per model
likert_crowd_splits = [create_splits(y_likert_crowd[likert_crowd_idx]) for i in range(5)]
likert_expert_splits = [create_splits(y_likert_experts[likert_expert_idx]) for i in range(5)]
likert_combined_splits = [create_splits(y_likert_combined[likert_combined_idx]) for i in range(5)]

dominant_crowd_splits = [create_splits(y_dominant_crowd[dominant_crowd_idx]) for i in range(5)]
dominant_expert_splits = [create_splits(y_dominant_experts[dominant_expert_idx]) for i in range(5)]
dominant_combined_splits = [create_splits(y_dominant_combined[dominant_combined_idx]) for i in range(5)]

labels = ['expert_likert', 'crowd_likert', 'combined_likert',
          'expert_dominant', 'crowd_dominant', 'combined_dominant']


# Majority Class

In [6]:
from collections import Counter

def majority_class(y):
    ct = Counter(y)
    return ct.most_common(1)[0][1] / len(y)

In [7]:
majority_class_acc_crowd_likert = majority_class(y_likert_crowd[likert_crowd_idx])
majority_class_acc_experts_likert = majority_class(y_likert_experts[likert_expert_idx])
majority_class_acc_combined_likert = majority_class(y_likert_combined[likert_combined_idx])

print("Majority class accuracy on Likert labels (baseline)")
print(" crowd labels:  {:.4f}".format(majority_class_acc_crowd_likert))
print(" expert labels: {:.4f}".format(majority_class_acc_experts_likert))
print(" combined labels: {:.4f}".format(majority_class_acc_combined_likert))

majority_class_acc_crowd_dominant = majority_class(y_dominant_crowd[dominant_crowd_idx])
majority_class_acc_experts_dominant = majority_class(y_dominant_experts[dominant_expert_idx])
majority_class_acc_combined_dominant = majority_class(y_dominant_combined[dominant_combined_idx])

print("\nMajority class accuracy on Dominant labels (baseline)")
print(" crowd labels:  {:.4f}".format(majority_class_acc_crowd_dominant))
print(" expert labels: {:.4f}".format(majority_class_acc_experts_dominant))
print(" combined labels: {:.4f}".format(majority_class_acc_combined_dominant))

Majority class accuracy on Likert labels (baseline)
 crowd labels:  0.2437
 expert labels: 0.2241
 combined labels: 0.2521

Majority class accuracy on Dominant labels (baseline)
 crowd labels:  0.5400
 expert labels: 0.6383
 combined labels: 0.5766


# Random Forest (supervised)

We start with a traditional, or 'shallow', machine learning model: random forest. Because random forest does not support iterative learning, we test both the crowd and expert sets separately.

We use stratified cross validation to reduce the effects caused by the small size of the data set.

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


N_ESTIMATORS = [100, 250, 500, 750, 1000, 2000]

def random_forest(X, y, index, splits, n_estimators=N_ESTIMATORS):
    n_samples = X[index].shape[0]
    acc_est_lst = list()
    for n_estimators in N_ESTIMATORS:
        print("Training with {} estimators".format(n_estimators))
        acc_lst = list()
        for fold_i in range(5):
            print(" Starting outer fold {} / {}".format(fold_i+1, 5))
            acc_inner = list()
            split_a_idx, split_b_idx  = splits[fold_i]
            for fold_j in range(2):
                print("  Starting inner fold {} / {}".format(fold_j+1, 2), end='')
                if fold_j % 2 == 0:
                    train_fold_idx, test_fold_idx  = split_a_idx, split_b_idx
                else:
                    train_fold_idx, test_fold_idx  = split_b_idx, split_a_idx

                train_idx = index[train_fold_idx]
                test_idx = index[test_fold_idx]

                model = RandomForestClassifier(n_estimators=n_estimators)
                model.fit(X[train_idx], y[train_idx])

                y_pred = model.predict(X[test_idx])
                fold_acc = accuracy_score(y[test_idx], y_pred)

                acc_inner.append(fold_acc)
                print(" (acc: {:.4f})".format(fold_acc))

            acc_lst.append(np.array(acc_inner))
        print(" => mean acc: {:.4f}\n".format(np.mean(np.array([np.mean(inner) for inner in acc_lst]))))
        acc_est_lst.append(acc_lst)
    
    return acc_est_lst

In [9]:
print("=== Results of supervised learning on expert likert labels ===")
random_forest_acc_experts_likert = random_forest(X_2D_likert_expert_bias,
                                                 y_likert_experts, 
                                                 likert_expert_idx,
                                                 likert_expert_splits)

table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_experts_likert)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_experts_likert[i],
                                               random_forest_acc_experts_likert[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on expert likert labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.0938)
  Starting inner fold 2 / 2 (acc: 0.1923)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.1923)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.1562)
  Starting inner fold 2 / 2 (acc: 0.1538)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.2692)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.2188)
  Starting inner fold 2 / 2 (acc: 0.2308)
 => mean acc: 0.1882

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.1562)
  Starting inner fold 2 / 2 (acc: 0.2308)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.2188)
  Starting inner fold 2 / 2 (acc: 0.2308)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.2188)
  Starting 

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.009518,,,,,
2,500,0.460709,0.287369,,,,
3,750,0.347415,0.64405,0.534881,,,
4,1000,0.318804,0.684134,0.219655,0.476727,,
5,2000,0.439135,0.702782,0.252272,0.647048,0.76681,


In [10]:
print("=== Results of supervised learning on crowd likert labels ===")
random_forest_acc_crowd_likert = random_forest(X_2D_likert_crowd_bias,
                                               y_likert_crowd,
                                               likert_crowd_idx,
                                               likert_crowd_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_crowd_likert)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_crowd_likert[i],
                                               random_forest_acc_crowd_likert[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on crowd likert labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.1774)
  Starting inner fold 2 / 2 (acc: 0.2281)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.1290)
  Starting inner fold 2 / 2 (acc: 0.2105)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.1935)
  Starting inner fold 2 / 2 (acc: 0.1930)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.1774)
  Starting inner fold 2 / 2 (acc: 0.2456)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.1774)
  Starting inner fold 2 / 2 (acc: 0.2456)
 => mean acc: 0.1978

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.1613)
  Starting inner fold 2 / 2 (acc: 0.2105)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.1774)
  Starting inner fold 2 / 2 (acc: 0.2632)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.1935)
  Starting i

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.369726,,,,,
2,500,0.451041,0.291856,,,,
3,750,0.620907,0.484985,0.576704,,,
4,1000,0.136848,0.478471,0.407105,0.656842,,
5,2000,0.584182,0.534881,0.479906,0.534881,0.447021,


In [11]:
print("=== Results of supervised learning on combined likert labels ===")
random_forest_acc_combined_likert = random_forest(X_2D_likert_combined_bias,
                                                  y_likert_combined,
                                                  likert_combined_idx,
                                                  likert_combined_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_combined_likert)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_combined_likert[i],
                                               random_forest_acc_combined_likert[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on combined likert labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.1290)
  Starting inner fold 2 / 2 (acc: 0.1228)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.1290)
  Starting inner fold 2 / 2 (acc: 0.1754)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.1935)
  Starting inner fold 2 / 2 (acc: 0.1754)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.1290)
  Starting inner fold 2 / 2 (acc: 0.1579)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.0645)
  Starting inner fold 2 / 2 (acc: 0.1404)
 => mean acc: 0.1417

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.1129)
  Starting inner fold 2 / 2 (acc: 0.1053)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.1613)
  Starting inner fold 2 / 2 (acc: 0.1404)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.1129)
  Startin

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.592102,,,,,
2,500,0.222193,0.308721,,,,
3,750,0.010821,0.628285,0.534881,,,
4,1000,0.389524,0.214384,0.381913,0.823267,,
5,2000,0.194827,0.255459,0.406362,0.318461,0.165202,


In [12]:
print("=== Results of supervised learning on expert dominant labels ===")
random_forest_acc_experts_dominant = random_forest(X_2D_dominant_expert_bias,
                                                   y_dominant_experts, 
                                                   dominant_expert_idx,
                                                   dominant_expert_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_experts_dominant)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_experts_dominant[i],
                                               random_forest_acc_experts_dominant[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on expert dominant labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.6087)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.5417)
  Starting inner fold 2 / 2 (acc: 0.6957)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.6522)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.5833)
  Starting inner fold 2 / 2 (acc: 0.6522)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.5833)
  Starting inner fold 2 / 2 (acc: 0.6957)
 => mean acc: 0.6013

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.5417)
  Starting inner fold 2 / 2 (acc: 0.6957)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.5833)
  Starting inner fold 2 / 2 (acc: 0.6522)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.5417)
  Startin

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.323699,,,,,
2,500,0.407975,0.687706,,,,
3,750,0.171813,0.263998,0.534881,,,
4,1000,0.451352,0.772292,0.534881,0.30076,,
5,2000,0.263998,0.646934,0.699595,0.108841,0.222823,


In [13]:
print("=== Results of supervised learning on crowd dominant labels ===")
random_forest_acc_crowd_dominant = random_forest(X_2D_dominant_crowd_bias,
                                                 y_dominant_crowd,
                                                 dominant_crowd_idx,
                                                 dominant_crowd_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_crowd_dominant)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_crowd_dominant[i],
                                               random_forest_acc_crowd_dominant[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on crowd dominant labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.4600)
  Starting inner fold 2 / 2 (acc: 0.5200)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.4200)
  Starting inner fold 2 / 2 (acc: 0.5200)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.4800)
  Starting inner fold 2 / 2 (acc: 0.4800)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.4000)
  Starting inner fold 2 / 2 (acc: 0.4000)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.4400)
  Starting inner fold 2 / 2 (acc: 0.5600)
 => mean acc: 0.4680

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.4000)
  Starting inner fold 2 / 2 (acc: 0.5200)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.4000)
  Starting inner fold 2 / 2 (acc: 0.5000)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.3800)
  Starting

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.482836,,,,,
2,500,0.226023,0.429627,,,,
3,750,0.486235,0.414563,0.463102,,,
4,1000,0.24083,0.271786,0.268087,0.401629,,
5,2000,0.074656,0.201367,0.169724,0.466787,0.534881,


In [14]:
print("=== Results of supervised learning on combined dominant labels ===")
random_forest_acc_combined_dominant = random_forest(X_2D_dominant_combined_bias,
                                                    y_dominant_combined,
                                                    dominant_combined_idx,
                                                    dominant_combined_splits)
table = {'p-values': N_ESTIMATORS}
table.update({est: list() for est in N_ESTIMATORS})
nhypotheses = len(random_forest_acc_combined_dominant)
for i in range(nhypotheses):
    for e in range(i+1):
        table[N_ESTIMATORS[i]].append(np.nan)
    for j in range(i+1, nhypotheses):
        f, p, mean, variance = alpaydin_F_test(random_forest_acc_combined_dominant[i],
                                               random_forest_acc_combined_dominant[j])
        table[N_ESTIMATORS[i]].append(p)
        #print("RF {} vs {} estimators".format(N_ESTIMATORS[i], N_ESTIMATORS[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on combined dominant labels ===
Training with 100 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.4107)
  Starting inner fold 2 / 2 (acc: 0.4545)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.4643)
  Starting inner fold 2 / 2 (acc: 0.4909)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.5273)
 Starting outer fold 4 / 5
  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.5091)
 Starting outer fold 5 / 5
  Starting inner fold 1 / 2 (acc: 0.5179)
  Starting inner fold 2 / 2 (acc: 0.5455)
 => mean acc: 0.4920

Training with 250 estimators
 Starting outer fold 1 / 5
  Starting inner fold 1 / 2 (acc: 0.4643)
  Starting inner fold 2 / 2 (acc: 0.4727)
 Starting outer fold 2 / 5
  Starting inner fold 1 / 2 (acc: 0.5000)
  Starting inner fold 2 / 2 (acc: 0.4727)
 Starting outer fold 3 / 5
  Starting inner fold 1 / 2 (acc: 0.4643)
  Start

Unnamed: 0,p-values,100,250,500,750,1000,2000
0,100,,,,,,
1,250,0.25415,,,,,
2,500,0.050211,0.696442,,,,
3,750,0.070223,0.495411,0.410572,,,
4,1000,0.017025,0.609916,0.534881,0.534881,,
5,2000,0.107392,0.65604,0.296335,0.403768,0.657462,


# SGD

In [15]:
from sklearn.linear_model import PassiveAggressiveClassifier

def pac(X, y, index, splits):
    n_samples = X[index].shape[0]
    acc_lst = list()
    for fold_i in range(5):
        print(" Starting outer fold {} / {}".format(fold_i+1, 5), end='')
        acc_inner = list()
        split_a_idx, split_b_idx  = splits[fold_i]
        for fold_j in range(2):
            print("  Starting inner fold {} / {}".format(fold_j+1, 2), end='')
            if fold_j % 2 == 0:
                train_fold_idx, test_fold_idx  = split_a_idx, split_b_idx
            else:
                train_fold_idx, test_fold_idx  = split_b_idx, split_a_idx

            train_idx = index[train_fold_idx]
            test_idx = index[test_fold_idx]

            model = PassiveAggressiveClassifier(max_iter=2000, warm_start=False)
            model.fit(X[train_idx], y[train_idx])
 
            y_pred = model.predict(X[test_idx])
            fold_acc = accuracy_score(y[test_idx], y_pred)

            acc_inner.append(fold_acc)
            print(" (acc: {:.4f})".format(fold_acc))
            
        acc_lst.append(np.array(acc_inner))
    print(" => mean acc: {:.4f}\n".format(np.mean(np.array([np.mean(inner) for inner in acc_lst]))))
    
    return acc_lst

In [16]:
print("=== Results of supervised learning on expert likert labels ===")
pac_acc_experts_likert = pac(X_2D_likert_expert_bias,
                                y_likert_experts, 
                                likert_expert_idx,
                                likert_expert_splits)

print("=== Results of supervised learning on crowd likert labels ===")
pac_acc_crowd_likert = pac(X_2D_likert_crowd_bias,
                              y_likert_crowd,
                              likert_crowd_idx,
                              likert_crowd_splits)

print("=== Results of supervised learning on combined likert labels ===")
pac_acc_combined_likert = pac(X_2D_dominant_combined_bias,
                                 y_likert_combined,
                                 likert_combined_idx,
                                 likert_combined_splits)

print("=== Results of supervised learning on expert dominant labels ===")
pac_acc_experts_dominant = pac(X_2D_dominant_expert_bias,
                                  y_dominant_experts, 
                                  dominant_expert_idx,
                                  dominant_expert_splits)

print("=== Results of supervised learning on crowd dominant labels ===")
pac_acc_crowd_dominant = pac(X_2D_dominant_crowd_bias,
                                y_dominant_crowd,
                                dominant_crowd_idx,
                                dominant_crowd_splits)

print("=== Results of supervised learning on combined dominant labels ===")
pac_acc_combined_dominant = pac(X_2D_dominant_combined_bias,
                                   y_dominant_combined,
                                   dominant_combined_idx,
                                   dominant_combined_splits)

pac_acc = [pac_acc_experts_likert, pac_acc_crowd_likert, pac_acc_combined_likert,
           pac_acc_experts_dominant, pac_acc_crowd_dominant, pac_acc_combined_dominant]

print("= p-values =")
table = {'p-values': labels}
table.update({lab: list() for lab in labels})
nlabels = len(labels)
for i in range(nlabels):
    for e in range(i+1):
        table[labels[i]].append(np.nan)
    for j in range(i+1, nlabels):
        f, p, mean, variance = alpaydin_F_test(pac_acc[i],
                                               pac_acc[j])
        table[labels[i]].append(p)
        #print("RF {} vs {} estimators".format(pac_acc[i], pac_acc[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

=== Results of supervised learning on expert likert labels ===
 Starting outer fold 1 / 5  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.1923)
 Starting outer fold 2 / 5  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.1923)
 Starting outer fold 3 / 5  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.1923)
 Starting outer fold 4 / 5  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.1923)
 Starting outer fold 5 / 5  Starting inner fold 1 / 2 (acc: 0.1875)
  Starting inner fold 2 / 2 (acc: 0.1923)
 => mean acc: 0.1899

=== Results of supervised learning on crowd likert labels ===
 Starting outer fold 1 / 5  Starting inner fold 1 / 2 (acc: 0.2097)
  Starting inner fold 2 / 2 (acc: 0.1579)
 Starting outer fold 2 / 5  Starting inner fold 1 / 2 (acc: 0.2258)
  Starting inner fold 2 / 2 (acc: 0.1579)
 Starting outer fold 3 / 5  Starting inner fold 1 / 2 (acc: 0.1613)
  Starting inne

Unnamed: 0,p-values,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,,,,,,
1,crowd_likert,0.470623,,,,,
2,combined_likert,0.052602,0.582789,,,,
3,expert_dominant,0.017327,0.004232,0.017271,,,
4,crowd_dominant,4.1e-05,0.000347,3e-05,0.609718,,
5,combined_dominant,0.000699,0.000368,0.000815,0.372802,0.646651,


## incremental learning

# PyTorch Preparations

In [17]:
## convert numpy arrays to PyTorch tensors
X_2D_likert_crowd_bias = torch.from_numpy(X_2D_likert_crowd_bias)
X_2D_likert_expert_bias = torch.from_numpy(X_2D_likert_expert_bias)
X_2D_likert_combined_bias = torch.from_numpy(X_2D_likert_combined_bias)
X_2D_dominant_crowd_bias = torch.from_numpy(X_2D_dominant_crowd_bias)
X_2D_dominant_expert_bias = torch.from_numpy(X_2D_dominant_expert_bias)
X_2D_dominant_combined_bias = torch.from_numpy(X_2D_dominant_combined_bias)

X_3D_likert_crowd_bias = torch.from_numpy(X_3D_likert_crowd_bias)
X_3D_likert_expert_bias = torch.from_numpy(X_3D_likert_expert_bias)
X_3D_likert_combined_bias = torch.from_numpy(X_3D_likert_combined_bias)
X_3D_dominant_crowd_bias = torch.from_numpy(X_3D_dominant_crowd_bias)
X_3D_dominant_expert_bias = torch.from_numpy(X_3D_dominant_expert_bias)
X_3D_dominant_combined_bias = torch.from_numpy(X_3D_dominant_combined_bias)

y_likert_crowd = torch.from_numpy(y_likert_crowd)
y_likert_experts = torch.from_numpy(y_likert_experts)
y_likert_combined = torch.from_numpy(y_likert_combined)
y_dominant_crowd = torch.from_numpy(y_dominant_crowd)
y_dominant_experts = torch.from_numpy(y_dominant_experts)
y_dominant_combined = torch.from_numpy(y_dominant_combined)

In [18]:
def categorical_accuracy(y_hat, y):
    # y := 1D array of class labels
    # y_hat := 2D array of one-hot class labels
    _, labels = y_hat.max(dim=1)
    return torch.mean(torch.eq(labels, y).float())

def fit(model, X, y, index, splits, lr=0.01, l2norm=0.01, n_epoch=250, patience=-1):
    n_samples = X[index].shape[0]
    acc_lst = list()
    for fold_i in range(5):
        print("Starting outer fold {} / {}".format(fold_i+1, 5))
        acc_inner = list()
        split_a_idx, split_b_idx  = splits[fold_i]

        for fold_j in range(2):
            print(" Starting inner fold {} / {}".format(fold_j+1, 2), end='')
            if fold_j % 2 == 0:
                train_fold_idx, test_fold_idx  = split_a_idx, split_b_idx
            else:
                train_fold_idx, test_fold_idx  = split_b_idx, split_a_idx
                
            train_idx = index[train_fold_idx]
            test_idx = index[test_fold_idx]
            
            model.init()
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
            criterion = nn.CrossEntropyLoss()

            # early stopping
            patience_left = patience
            best_fold_score = -1
            delta = 1e-4
            best_fold_state = None
            best_fold_state_opt = None
        
            for epoch in range(n_epoch):
                model.train()

                y_hat = model(X[train_idx].float())
                train_acc = categorical_accuracy(y_hat, y[train_idx])
                train_loss = criterion(y_hat, y[train_idx].long())
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                model.eval()
                test_loss = None
                with torch.no_grad():
                    y_hat = model(X[test_idx].float())
                    test_acc = categorical_accuracy(y_hat, y[test_idx])
                    test_loss = criterion(y_hat, y[test_idx].long())

                train_loss = float(train_loss.item())
                test_loss = float(test_loss.item())
                
                if best_fold_score < 0:
                    best_fold_score = test_loss
                    best_fold_state = model.state_dict()
                    best_fold_state_opt = optimizer.state_dict()

                if patience <= 0:
                    continue
                if test_loss >= best_fold_score - delta:
                    patience_left -= 1
                else:
                    best_fold_score = test_loss
                    best_fold_state = model.state_dict()
                    best_fold_state_opt = optimizer.state_dict()
                    patience_left = patience
                if patience_left <= 0:
                    model.load_state_dict(best_fold_state)
                    optimizer.load_state_dict(best_fold_state_opt)
                    break
            
            # do a final run over the test set after loading a previous state
            with torch.no_grad():
                y_hat = model(X[test_idx].float())
                test_acc = categorical_accuracy(y_hat, y[test_idx])
                test_loss = criterion(y_hat, y[test_idx].long())
            
            test_loss = float(test_loss.item())
            print(" (acc: {:.4f})".format(test_acc))
            acc_inner.append(test_acc)
    
        acc_lst.append(np.array(acc_inner))
        
    print(" => mean acc: {:.4f}\n".format(np.mean(np.array([np.mean(inner) for inner in acc_lst]))))

    return acc_lst

# Neural Network

In [19]:
class ClassifierNN(nn.Module):
    """Simple Neural Network Classifier"""

    def __init__(self, input_dim, output_dim, p_dropout=0.05):
        super().__init__()
        hidden_dim = (input_dim-output_dim)//2
        
        self.layers = nn.ModuleList()
        self.layers.append(nn.Sequential(
                            nn.Linear(input_dim, hidden_dim),
                            nn.ReLU(inplace=True),
                            nn.Dropout(p=p_dropout)))
            
        self.layers.append(nn.Sequential(
                            nn.Linear(hidden_dim, output_dim),
                            nn.ReLU(inplace=True)))
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, X):
        for layer in self.layers:
            X = layer(X)                          
                           
        return self.softmax(X)
        
    def init(self):
        for param in self.parameters():
            nn.init.normal_(param)

In [20]:
## hyperparameters
lr = 0.01
n_epoch = 250
p_dropout = 0.05

## define model
indim = X_2D_likert_combined_bias.shape[1]
outdim = np.unique(y_likert_experts[likert_expert_idx]).shape[0]
assert outdim == np.unique(y_likert_crowd[likert_crowd_idx]).shape[0]

model = ClassifierNN(input_dim=indim,
                     output_dim=outdim,
                     p_dropout=p_dropout)

print("=== Results on expert likert labels ===")
neural_net_acc_likert_experts = fit(model, X_2D_likert_expert_bias, y_likert_experts, likert_expert_idx, likert_expert_splits, lr=lr, n_epoch=n_epoch)

print("\n=== Results on crowd likert labels ===")
neural_net_acc_likert_crowd = fit(model, X_2D_likert_crowd_bias, y_likert_crowd, likert_crowd_idx, likert_crowd_splits, lr=lr, n_epoch=n_epoch)

print("\n=== Results on combined likert labels ===")
neural_net_acc_likert_combined = fit(model, X_2D_likert_combined_bias, y_likert_combined, likert_combined_idx, likert_combined_splits, lr=lr, n_epoch=n_epoch)

=== Results on expert likert labels ===
Starting outer fold 1 / 5
 Starting inner fold 1 / 2 (acc: 0.1250)
 Starting inner fold 2 / 2 (acc: 0.2308)
Starting outer fold 2 / 5
 Starting inner fold 1 / 2 (acc: 0.1250)
 Starting inner fold 2 / 2 (acc: 0.1538)
Starting outer fold 3 / 5
 Starting inner fold 1 / 2 (acc: 0.0625)
 Starting inner fold 2 / 2 (acc: 0.2692)
Starting outer fold 4 / 5
 Starting inner fold 1 / 2 (acc: 0.0938)
 Starting inner fold 2 / 2 (acc: 0.2308)
Starting outer fold 5 / 5
 Starting inner fold 1 / 2 (acc: 0.1562)
 Starting inner fold 2 / 2 (acc: 0.1923)
 => mean acc: 0.1639


=== Results on crowd likert labels ===
Starting outer fold 1 / 5
 Starting inner fold 1 / 2 (acc: 0.1613)
 Starting inner fold 2 / 2 (acc: 0.1579)
Starting outer fold 2 / 5
 Starting inner fold 1 / 2 (acc: 0.1613)
 Starting inner fold 2 / 2 (acc: 0.2281)
Starting outer fold 3 / 5
 Starting inner fold 1 / 2 (acc: 0.1452)
 Starting inner fold 2 / 2 (acc: 0.1053)
Starting outer fold 4 / 5
 Startin

In [21]:
## hyperparameters
lr = 0.01
n_epoch = 250
p_dropout = 0.1

## define model
indim = X_2D_dominant_combined_bias.shape[1]
outdim = np.unique(y_dominant_experts[dominant_expert_idx]).shape[0]
assert outdim == np.unique(y_dominant_crowd[dominant_crowd_idx]).shape[0]

model = ClassifierNN(input_dim=indim,
                     output_dim=outdim,
                     p_dropout=p_dropout)

print("=== Results on expert dominant labels ===")
neural_net_acc_dominant_experts = fit(model, X_2D_dominant_expert_bias, y_dominant_experts, dominant_expert_idx, dominant_expert_splits, lr=lr, n_epoch=n_epoch)

print("\n=== Results on crowd dominant labels ===")
neural_net_acc_dominant_crowd = fit(model, X_2D_dominant_crowd_bias, y_dominant_crowd, dominant_crowd_idx, dominant_crowd_splits, lr=lr, n_epoch=n_epoch)

print("\n=== Results on combined dominant labels ===")
neural_net_acc_dominant_combined = fit(model, X_2D_dominant_combined_bias, y_dominant_combined, dominant_combined_idx, dominant_combined_splits, lr=lr, n_epoch=n_epoch)

=== Results on expert dominant labels ===
Starting outer fold 1 / 5
 Starting inner fold 1 / 2 (acc: 0.5417)
 Starting inner fold 2 / 2 (acc: 0.4783)
Starting outer fold 2 / 5
 Starting inner fold 1 / 2 (acc: 0.5417)
 Starting inner fold 2 / 2 (acc: 0.5217)
Starting outer fold 3 / 5
 Starting inner fold 1 / 2 (acc: 0.6250)
 Starting inner fold 2 / 2 (acc: 0.5217)
Starting outer fold 4 / 5
 Starting inner fold 1 / 2 (acc: 0.5417)
 Starting inner fold 2 / 2 (acc: 0.5652)
Starting outer fold 5 / 5
 Starting inner fold 1 / 2 (acc: 0.5417)
 Starting inner fold 2 / 2 (acc: 0.5217)
 => mean acc: 0.5400


=== Results on crowd dominant labels ===
Starting outer fold 1 / 5
 Starting inner fold 1 / 2 (acc: 0.5600)
 Starting inner fold 2 / 2 (acc: 0.4400)
Starting outer fold 2 / 5
 Starting inner fold 1 / 2 (acc: 0.4200)
 Starting inner fold 2 / 2 (acc: 0.4600)
Starting outer fold 3 / 5
 Starting inner fold 1 / 2 (acc: 0.4200)
 Starting inner fold 2 / 2 (acc: 0.4800)
Starting outer fold 4 / 5
 Sta

In [22]:
nn_acc = [neural_net_acc_likert_experts, neural_net_acc_likert_crowd, neural_net_acc_likert_combined,
          neural_net_acc_dominant_experts, neural_net_acc_dominant_crowd, neural_net_acc_dominant_combined]

print("= p-values =")
table = {'p-values': labels}
table.update({lab: list() for lab in labels})
nlabels = len(labels)
for i in range(nlabels):
    for e in range(i+1):
        table[labels[i]].append(np.nan)
    for j in range(i+1, nlabels):
        f, p, mean, variance = alpaydin_F_test(nn_acc[i],
                                               nn_acc[j])
        table[labels[i]].append(p)
        #print("RF {} vs {} estimators".format(nn_acc[i], nn_acc[j]))
        #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

significance = pd.DataFrame(table)
display(significance)

= p-values =


Unnamed: 0,p-values,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,,,,,,
1,crowd_likert,0.761043,,,,,
2,combined_likert,0.802733,0.802535,,,,
3,expert_dominant,0.009186,7e-05,0.000298,,,
4,crowd_dominant,0.009907,0.000866,0.009762,0.165788,,
5,combined_dominant,0.004603,0.000515,0.000724,0.278232,0.324219,


In [23]:
rf_acc = [random_forest_acc_experts_likert, random_forest_acc_crowd_likert, random_forest_acc_combined_likert,
           random_forest_acc_experts_dominant, random_forest_acc_crowd_dominant, random_forest_acc_combined_dominant]

nhypotheses = len(N_ESTIMATORS)
for k in range(nhypotheses):
    table = {'p-values - {} estimators'.format(N_ESTIMATORS[k]): labels}
    table.update({lab: list() for lab in labels})

    nlabels = len(labels)
    for i in range(nlabels):
        for e in range(nlabels):
            if e != i:
                table[labels[i]].append(np.nan)
            else:
                f, p, mean, variance = alpaydin_F_test(nn_acc[i],
                                                       rf_acc[j][k])
                table[labels[i]].append(p)
                #print("RF {} vs {} estimators".format(nn_acc[i], nn_acc[j]))
                #print(" f: {:.4f}, p: {:.4f}, mean: {:.4f}, var: {:.4f}".format(f, p, mean, variance))

    significance = pd.DataFrame(table)
    display(significance)

Unnamed: 0,p-values - 100 estimators,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,0.001755,,,,,
1,crowd_likert,,2.1e-05,,,,
2,combined_likert,,,0.002268,,,
3,expert_dominant,,,,0.374312,,
4,crowd_dominant,,,,,0.360185,
5,combined_dominant,,,,,,0.702479


Unnamed: 0,p-values - 250 estimators,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,0.001774,,,,,
1,crowd_likert,,0.000176,,,,
2,combined_likert,,,0.000826,,,
3,expert_dominant,,,,0.234739,,
4,crowd_dominant,,,,,0.648495,
5,combined_dominant,,,,,,0.23877


Unnamed: 0,p-values - 500 estimators,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,0.003497,,,,,
1,crowd_likert,,1.6e-05,,,,
2,combined_likert,,,0.002808,,,
3,expert_dominant,,,,0.232779,,
4,crowd_dominant,,,,,0.602719,
5,combined_dominant,,,,,,0.517162


Unnamed: 0,p-values - 750 estimators,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,0.003551,,,,,
1,crowd_likert,,5e-06,,,,
2,combined_likert,,,0.001721,,,
3,expert_dominant,,,,0.170302,,
4,crowd_dominant,,,,,0.503998,
5,combined_dominant,,,,,,0.281369


Unnamed: 0,p-values - 1000 estimators,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,0.002145,,,,,
1,crowd_likert,,8.7e-05,,,,
2,combined_likert,,,0.001762,,,
3,expert_dominant,,,,0.213384,,
4,crowd_dominant,,,,,0.701554,
5,combined_dominant,,,,,,0.435686


Unnamed: 0,p-values - 2000 estimators,expert_likert,crowd_likert,combined_likert,expert_dominant,crowd_dominant,combined_dominant
0,expert_likert,0.004477,,,,,
1,crowd_likert,,3.5e-05,,,,
2,combined_likert,,,0.001609,,,
3,expert_dominant,,,,0.158725,,
4,crowd_dominant,,,,,0.595507,
5,combined_dominant,,,,,,0.534881
