In [1]:
#%pip install sklearn
#%pip install torch

import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts anything, does nothing."""
    return None


# Every later call to warnings.warn(...) is routed to the no-op above, so
# warnings raised through that function are silenced for the whole session.
warnings.warn = warn

from collections import namedtuple
from math import sqrt
import os
from time import time
import random
import numpy as np
from scipy import stats
import pandas as pd
import sklearn
from sklearn.base import clone
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import display, HTML
from tqdm import tqdm

## project structure
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"
DATA_NPZ = DATA_DIR + "data2021.npz"
RESULTS_FILE = DATA_DIR + 'results2021full.tsv'
OUTPUT_FILE = DATA_DIR + 'output2021full.npz'

## load files
data = np.load(DATA_NPZ)

# one feature matrix per text source
X_2D_transcriptions = data['X_2D_transcriptions']
X_2D_descriptions = data['X_2D_descriptions']
X_2D_titles = data['X_2D_title']

# concatenate the three feature matrices column-wise: titles | descriptions | transcriptions
X_2D = np.hstack([X_2D_titles, X_2D_descriptions, X_2D_transcriptions])

# label vectors; a value of -1 is treated below as "no label for this sample"
y_crowd = data['y_crowd']
y_experts = data['y_experts']
y_combined = data['y_combined']

# retrieve indices of labeled samples
experts_pilot_idx = np.where(y_experts > -1)[0]  # equal pilot subset

# pilot samples that also carry a crowd label; boolean-mask indexing keeps an
# integer dtype even when nothing matches, so the result is always a valid index
crowd_pilot_idx = experts_pilot_idx[y_crowd[experts_pilot_idx] > -1]
crowd_all_idx = np.where(y_crowd > -1)[0]

# note: combined on pilot idx is same as experts on pilot idx since the experts labels are used for this part
combined_all_idx = np.where(y_combined > -1)[0]

In [2]:
def set_seed(seed=-1):
    """Seed the Python, NumPy and PyTorch random generators.

    Parameters
    ----------
    seed : int, optional
        Seed to apply. A negative value (the default) makes the function
        draw a fresh seed from NumPy's global generator.

    Returns
    -------
    int
        The seed that was applied, as a built-in ``int``, so it can be
        logged and passed back in to repeat a run.
    """
    # Coerce NumPy integer scalars to a built-in int: random.seed() only
    # accepts a restricted set of built-in types on recent Python versions.
    seed = int(seed)
    if seed < 0:
        # Explicit 64-bit dtype: with the platform default integer being 32 bit
        # the upper bound 2**32-1 would be rejected as out of range.
        seed = int(np.random.randint(0, 2**32 - 1, dtype=np.int64))

    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment
    # only reaches processes started from here, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    return seed
    
print(set_seed())  # no seed given: a random one is drawn and printed; pass it to set_seed() to reproduce this run

3684733569


In [3]:
# per-fold accuracy scores, one column per evaluated model / label-set combination
results = pd.DataFrame()
# per-fold prediction matrices, appended in the order the models are evaluated
outputs = []

Note that the expert labels are part of the combined sets and that we also compute the performance on the expert labels. This means that we are partially learning on the same samples that we test on, which gives a better performance but which shouldn't be mistaken for better generalisation.

In [4]:
def create_splits(y, test_ratio=.5):
    """Draw a random, stratified train/test partition of the labelled samples.

    Every distinct value in ``y`` greater than -1 forms one stratum; samples
    labelled -1 are left out of both parts. The members of each stratum are
    randomly permuted and then cut at ``int(n * (1 - test_ratio))``, so that
    repeated calls produce different partitions while both parts keep roughly
    the class proportions of ``y``.

    Parameters
    ----------
    y : 1-D array-like of numeric labels
        Label per sample, with -1 marking "no label".
    test_ratio : float, optional
        Fraction of each stratum assigned to the test part (default 0.5).

    Returns
    -------
    (train_idx, test_idx) : tuple of np.ndarray
        Positions into ``y``; each array is shuffled. Both are empty integer
        arrays when ``y`` holds no labelled sample.
    """
    y = np.asarray(y)

    train_idx = list()
    test_idx = list()
    for lab in np.unique(y):
        if not lab > -1:
            continue

        # Permute BEFORE cutting: slicing the members in their stored order
        # would put the very same samples in the test part on every call.
        strat = np.random.permutation(np.where(y == lab)[0])
        cut = int(strat.shape[0] * (1 - test_ratio))
        train_idx.append(strat[:cut])
        test_idx.append(strat[cut:])

    if not train_idx:
        # nothing labelled: np.concatenate would raise on an empty list
        return (np.array([], dtype=int), np.array([], dtype=int))

    train_idx = np.concatenate(train_idx)
    test_idx = np.concatenate(test_idx)

    # strata were concatenated class by class; mix the classes
    np.random.shuffle(train_idx)
    np.random.shuffle(test_idx)

    return (train_idx, test_idx)

def create_splits_one_hot(y):
    """Stratified train/test split for labels given as a 2-D indicator matrix.

    Each row of ``y`` is reduced to the column index of its non-zero entry;
    rows without any non-zero entry become -1. The resulting label vector is
    handed to ``create_splits`` with its default arguments.

    Parameters
    ----------
    y : 2-D NumPy array or torch tensor
        Indicator matrix, one row per sample.

    Returns
    -------
    Whatever ``create_splits`` returns for the derived label vector.
    """
    if hasattr(y, "detach"):
        # torch tensor: move to host memory and view it as a NumPy array
        y = y.detach().cpu().numpy()
    y = np.asarray(y)

    vec = -np.ones(y.shape[0])
    # row / column positions of the non-zero entries, in row-major order
    rows, cols = y.nonzero()
    vec[rows] = cols

    return create_splits(vec)

# Ridge Classifier

In [5]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

def ridge_classifier(X, y, index):
    """Score a default ``RidgeClassifier`` over 100 train/test rounds.

    Only the rows of ``X`` / ``y`` selected by ``index`` take part. In every
    round a partition is requested from ``create_splits`` with a test ratio of
    0.2, a fresh model is fitted on the training part and evaluated on the
    test part.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    index : integer array with the rows to use

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, X.shape[0]); row r carries the predictions of round r at the
        original positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100
    total_rows = X.shape[0]
    features, labels = X[index], y[index]

    output = -np.ones((N_FOLDS, total_rows))
    scores = np.zeros(N_FOLDS)
    for fold_i in range(N_FOLDS):
        train_split, test_split = create_splits(labels, 0.2)

        model = RidgeClassifier()
        model.fit(features[train_split], labels[train_split])
        y_hat = model.predict(features[test_split])

        scores[fold_i] = accuracy_score(labels[test_split], y_hat)
        # test_split is relative to the subset; map it back through index
        output[fold_i, index[test_split]] = y_hat

    return (scores, output)

In [6]:
# Evaluate the ridge classifier on four label sets. For each one the per-fold
# accuracies become a column of `results` and the per-fold prediction matrix
# is appended to `outputs`.
print("=== Results on expert ===")
ridge_acc_experts_pilot, ridge_out_experts_pilot = ridge_classifier(X_2D, y_experts, experts_pilot_idx)
results['ridge_experts_pilot'] = ridge_acc_experts_pilot
outputs.append(ridge_out_experts_pilot)

# crowd labels, restricted to the samples that also have an expert label
print("=== Results on crowd pilot ===")
ridge_acc_crowd_pilot, ridge_out_crowd_pilot = ridge_classifier(X_2D, y_crowd, crowd_pilot_idx)
results['ridge_crowd_pilot'] = ridge_acc_crowd_pilot
outputs.append(ridge_out_crowd_pilot)

# every sample with a crowd label
print("=== Results on crowd all ===")
ridge_acc_crowd_all, ridge_out_crowd_all = ridge_classifier(X_2D, y_crowd, crowd_all_idx)
results['ridge_crowd_all'] = ridge_acc_crowd_all
outputs.append(ridge_out_crowd_all)

# every sample with a combined label
print("=== Results on combined all ===")
ridge_acc_combined_all, ridge_out_combined_all = ridge_classifier(X_2D, y_combined, combined_all_idx)
results['ridge_combined_all'] = ridge_acc_combined_all
outputs.append(ridge_out_combined_all)

=== Results on expert ===
=== Results on crowd pilot ===
=== Results on crowd all ===
=== Results on combined all ===


# Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

def naive_bayes_classifier(X, y, index):
    """Score a default ``GaussianNB`` over 100 train/test rounds.

    Only the rows of ``X`` / ``y`` selected by ``index`` take part. In every
    round a partition is requested from ``create_splits`` with a test ratio of
    0.2, a fresh model is fitted on the training part and evaluated on the
    test part.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    index : integer array with the rows to use

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, X.shape[0]); row r carries the predictions of round r at the
        original positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100
    total_rows = X.shape[0]
    features, labels = X[index], y[index]

    output = -np.ones((N_FOLDS, total_rows))
    scores = np.zeros(N_FOLDS)
    for fold_i in range(N_FOLDS):
        train_split, test_split = create_splits(labels, 0.2)

        model = GaussianNB()
        model.fit(features[train_split], labels[train_split])
        y_hat = model.predict(features[test_split])

        scores[fold_i] = accuracy_score(labels[test_split], y_hat)
        # test_split is relative to the subset; map it back through index
        output[fold_i, index[test_split]] = y_hat

    return (scores, output)

def naive_bayes_classifier_incremental(X, y, idx_experts, idx_crowd):
    """Train ``GaussianNB`` on expert samples first, then update it sample by
    sample with crowd samples, over 100 random rounds.

    In every round 80% of ``idx_experts`` and 80% of the crowd-only indices
    (``idx_crowd`` without ``idx_experts``) are drawn without replacement for
    training; the remaining indices of both groups form the test set.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    idx_experts : integer array of sample positions used for the initial fit
    idx_crowd : integer array of sample positions used for the updates

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, y.shape[0]); row r carries the predictions of round r at the
        positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100
    n_samples = y.shape[0]
    output = -np.ones((N_FOLDS, n_samples))
    scores = np.zeros(N_FOLDS)

    # None of the following depends on the round, so compute it once.
    idx_crowd_unique = np.setdiff1d(idx_crowd, idx_experts)
    experts_num_train = int(idx_experts.shape[0]*.8)
    crowd_num_train = int(idx_crowd_unique.shape[0]*.8)
    # Classes are declared from the labels passed in (not from a module-level
    # array). They are taken from the expert samples only, so this assumes the
    # crowd samples carry no label that is absent there.
    classes = np.unique(y[idx_experts])

    for fold_i in range(N_FOLDS):
        experts_train_split = np.random.choice(idx_experts,
                                               experts_num_train,
                                               replace=False)
        experts_test_split = np.setdiff1d(idx_experts, experts_train_split)

        crowd_train_split = np.random.choice(idx_crowd_unique,
                                             crowd_num_train,
                                             replace=False)
        crowd_test_split = np.setdiff1d(idx_crowd_unique, crowd_train_split)

        idx_test = np.union1d(experts_test_split, crowd_test_split)

        with np.errstate(divide='ignore'):
            model = GaussianNB().partial_fit(X[experts_train_split],
                                             y[experts_train_split],
                                             classes)
            # one update per crowd sample, in the order they were drawn
            for i in crowd_train_split:
                model.partial_fit(X[i:i + 1], y[i:i + 1])

            y_hat = model.predict(X[idx_test])

        fold_acc = accuracy_score(y[idx_test], y_hat)
        scores[fold_i] = fold_acc
        output[fold_i, idx_test] = y_hat

    return (scores, output)

In [8]:
# Evaluate Gaussian naive Bayes on the same four label sets. For each one the
# per-fold accuracies become a column of `results` and the per-fold prediction
# matrix is appended to `outputs`.
print("=== Results on expert pilot ===")
bayes_acc_experts_pilot, bayes_out_experts_pilot = naive_bayes_classifier(X_2D, y_experts, experts_pilot_idx)
results['bayes_experts_pilot'] = bayes_acc_experts_pilot
outputs.append(bayes_out_experts_pilot)

# crowd labels, restricted to the samples that also have an expert label
print("=== Results on crowd pilot ===")
bayes_acc_crowd_pilot, bayes_out_crowd_pilot = naive_bayes_classifier(X_2D, y_crowd, crowd_pilot_idx)
results['bayes_crowd_pilot'] = bayes_acc_crowd_pilot
outputs.append(bayes_out_crowd_pilot)

# every sample with a crowd label
print("=== Results on crowd all ===")
bayes_acc_crowd_all, bayes_out_crowd_all = naive_bayes_classifier(X_2D, y_crowd, crowd_all_idx)
results['bayes_crowd_all'] = bayes_acc_crowd_all
outputs.append(bayes_out_crowd_all)

# every sample with a combined label
print("=== Results on combined all ===")
bayes_acc_combined_all, bayes_out_combined_all = naive_bayes_classifier(X_2D, y_combined, combined_all_idx)
results['bayes_combined_all'] = bayes_acc_combined_all
outputs.append(bayes_out_combined_all)

=== Results on expert pilot ===
=== Results on crowd pilot ===
=== Results on crowd all ===
=== Results on combined all ===


In [9]:
# incremental learning: initial fit on the expert pilot samples, then updates
# with the samples that only have a crowd label; labels come from y_combined
print("=== Results on incremental learning ===")
bayes_acc_inc, bayes_out_inc = naive_bayes_classifier_incremental(X_2D, y_combined, experts_pilot_idx, crowd_all_idx)
results['bayes_inc'] = bayes_acc_inc
outputs.append(bayes_out_inc)

=== Results on incremental learning ===


# random forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# forest sizes (number of trees) that the evaluation loop below iterates over
N_ESTIMATORS = [100, 250, 500, 750, 1000, 2000]

def random_forest_classifier(X, y, index, n_estimators):
    """Score a ``RandomForestClassifier`` over 100 train/test rounds.

    Only the rows of ``X`` / ``y`` selected by ``index`` take part. In every
    round a partition is requested from ``create_splits`` with a test ratio of
    0.2, a fresh forest is fitted on the training part and evaluated on the
    test part. Progress is shown with a tqdm bar.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    index : integer array with the rows to use
    n_estimators : value forwarded to ``RandomForestClassifier(n_estimators=...)``

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, X.shape[0]); row r carries the predictions of round r at the
        original positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100
    total_rows = X.shape[0]
    features, labels = X[index], y[index]

    scores = np.zeros(N_FOLDS)
    output = -np.ones((N_FOLDS, total_rows))
    for fold_i in tqdm(range(N_FOLDS), total=N_FOLDS):
        train_split, test_split = create_splits(labels, 0.2)

        model = RandomForestClassifier(n_estimators=n_estimators)
        y_hat = model.fit(features[train_split],
                          labels[train_split]).predict(features[test_split])

        scores[fold_i] = accuracy_score(labels[test_split], y_hat)
        # test_split is relative to the subset; map it back through index
        output[fold_i, index[test_split]] = y_hat

    return (scores, output)

In [11]:
# Evaluate the random forest once per forest size and label set. Column names
# in `results` carry the forest size as a suffix; the forest_* variables are
# overwritten on every pass of the loop.
for n_estimators in N_ESTIMATORS:
    print("\n\nnumber of estimators: %d =================================\n" % n_estimators)
    
    # flush=True on the prints below, ahead of the progress bar each call shows
    print("=== Results on expert pilot ===", flush=True)
    forest_acc_experts_pilot, forest_out_experts_pilot = random_forest_classifier(X_2D, y_experts, experts_pilot_idx, n_estimators)
    results['forest_experts_pilot_%d' % n_estimators] = forest_acc_experts_pilot
    outputs.append(forest_out_experts_pilot)

    print("=== Results on crowd pilot ===", flush=True)
    forest_acc_crowd_pilot, forest_out_crowd_pilot = random_forest_classifier(X_2D, y_crowd, crowd_pilot_idx, n_estimators)
    results['forest_crowd_pilot_%d' % n_estimators] = forest_acc_crowd_pilot
    outputs.append(forest_out_crowd_pilot)

    print("=== Results on crowd all ===", flush=True)
    forest_acc_crowd_all, forest_out_crowd_all = random_forest_classifier(X_2D, y_crowd, crowd_all_idx, n_estimators)
    results['forest_crowd_all_%d' % n_estimators] = forest_acc_crowd_all
    outputs.append(forest_out_crowd_all)

    print("=== Results on combined all ===", flush=True)
    forest_acc_combined_all, forest_out_combined_all = random_forest_classifier(X_2D, y_combined, combined_all_idx, n_estimators)
    results['forest_combined_all_%d' % n_estimators] =  forest_acc_combined_all
    outputs.append(forest_out_combined_all)




=== Results on expert pilot ===


100%|██████████| 100/100 [00:30<00:00,  3.24it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [00:30<00:00,  3.27it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [00:32<00:00,  3.12it/s]

=== Results on combined all ===



100%|██████████| 100/100 [00:31<00:00,  3.14it/s]




=== Results on expert pilot ===



100%|██████████| 100/100 [01:15<00:00,  1.33it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [01:14<00:00,  1.35it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [01:18<00:00,  1.27it/s]

=== Results on combined all ===



100%|██████████| 100/100 [01:17<00:00,  1.29it/s]




=== Results on expert pilot ===



100%|██████████| 100/100 [02:27<00:00,  1.48s/it]

=== Results on crowd pilot ===



100%|██████████| 100/100 [01:48<00:00,  1.09s/it]

=== Results on crowd all ===



100%|██████████| 100/100 [01:32<00:00,  1.09it/s]

=== Results on combined all ===



100%|██████████| 100/100 [01:35<00:00,  1.05it/s]




=== Results on expert pilot ===



100%|██████████| 100/100 [03:21<00:00,  2.01s/it]

=== Results on crowd pilot ===



100%|██████████| 100/100 [02:23<00:00,  1.43s/it]

=== Results on crowd all ===



100%|██████████| 100/100 [02:10<00:00,  1.30s/it]

=== Results on combined all ===



100%|██████████| 100/100 [02:04<00:00,  1.24s/it]




=== Results on expert pilot ===



100%|██████████| 100/100 [02:33<00:00,  1.53s/it]

=== Results on crowd pilot ===



100%|██████████| 100/100 [02:33<00:00,  1.53s/it]

=== Results on crowd all ===



100%|██████████| 100/100 [02:44<00:00,  1.65s/it]

=== Results on combined all ===



100%|██████████| 100/100 [02:44<00:00,  1.65s/it]




=== Results on expert pilot ===



100%|██████████| 100/100 [05:08<00:00,  3.08s/it]

=== Results on crowd pilot ===



100%|██████████| 100/100 [05:07<00:00,  3.08s/it]

=== Results on crowd all ===



100%|██████████| 100/100 [05:29<00:00,  3.29s/it]

=== Results on combined all ===



100%|██████████| 100/100 [05:33<00:00,  3.34s/it]


# SVM

In [12]:
from sklearn import svm
from sklearn.metrics import accuracy_score

def svmc(X, y, index, kernel='linear'):
    """Score an ``svm.SVC`` with the given kernel over 100 train/test rounds.

    Only the rows of ``X`` / ``y`` selected by ``index`` take part. In every
    round a partition is requested from ``create_splits`` with a test ratio of
    0.2, a fresh classifier is fitted on the training part and evaluated on
    the test part. Progress is shown with a tqdm bar.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    index : integer array with the rows to use
    kernel : value forwarded to ``svm.SVC(kernel=...)`` (default 'linear')

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, X.shape[0]); row r carries the predictions of round r at the
        original positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100
    total_rows = X.shape[0]
    features, labels = X[index], y[index]

    scores = np.zeros(N_FOLDS)
    output = -np.ones((N_FOLDS, total_rows))
    for fold_i in tqdm(range(N_FOLDS), total=N_FOLDS):
        train_split, test_split = create_splits(labels, 0.2)

        model = svm.SVC(kernel=kernel)
        model.fit(features[train_split], labels[train_split])
        y_hat = model.predict(features[test_split])

        scores[fold_i] = accuracy_score(labels[test_split], y_hat)
        # test_split is relative to the subset; map it back through index
        output[fold_i, index[test_split]] = y_hat

    return (scores, output)

In [13]:
# Evaluate the SVM once per kernel and label set. Column names in `results`
# carry the kernel name as a suffix; the svm_* variables are overwritten on
# every pass of the loop.
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    print("\n\nkernel: %s =================================\n" % kernel, flush=True)
    
    print("=== Results on expert pilot ===", flush=True)
    svm_acc_experts_pilot, svm_out_experts_pilot = svmc(X_2D, y_experts, experts_pilot_idx, kernel)
    results['svm_experts_pilot_%s' % kernel] = svm_acc_experts_pilot
    outputs.append(svm_out_experts_pilot)

    print("=== Results on crowd pilot ===", flush=True)
    svm_acc_crowd_pilot, svm_out_crowd_pilot = svmc(X_2D, y_crowd, crowd_pilot_idx, kernel)
    results['svm_crowd_pilot_%s' % kernel] = svm_acc_crowd_pilot
    outputs.append(svm_out_crowd_pilot)

    print("=== Results on crowd all ===", flush=True)
    svm_acc_crowd_all, svm_out_crowd_all = svmc(X_2D, y_crowd, crowd_all_idx, kernel)
    results['svm_crowd_all_%s' % kernel] = svm_acc_crowd_all
    outputs.append(svm_out_crowd_all)

    print("=== Results on combined all ===", flush=True)
    svm_acc_combined_all, svm_out_combined_all = svmc(X_2D, y_combined, combined_all_idx, kernel)
    results['svm_combined_all_%s' % kernel] = svm_acc_combined_all
    outputs.append(svm_out_combined_all)




=== Results on expert pilot ===


100%|██████████| 100/100 [00:00<00:00, 405.86it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [00:00<00:00, 532.03it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [00:02<00:00, 37.81it/s]

=== Results on combined all ===



100%|██████████| 100/100 [00:01<00:00, 51.53it/s]




=== Results on expert pilot ===



100%|██████████| 100/100 [00:00<00:00, 751.92it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [00:00<00:00, 751.69it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [00:00<00:00, 460.84it/s]

=== Results on combined all ===



100%|██████████| 100/100 [00:00<00:00, 426.23it/s]




=== Results on expert pilot ===



100%|██████████| 100/100 [00:00<00:00, 752.98it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [00:00<00:00, 732.81it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [00:00<00:00, 465.43it/s]

=== Results on combined all ===



100%|██████████| 100/100 [00:00<00:00, 541.97it/s]




=== Results on expert pilot ===



100%|██████████| 100/100 [00:00<00:00, 730.36it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [00:00<00:00, 729.63it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [00:00<00:00, 437.65it/s]

=== Results on combined all ===



100%|██████████| 100/100 [00:00<00:00, 425.29it/s]


# neural network

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# passed as max_iter to MLPClassifier, and used as the number of partial_fit
# passes over the crowd samples in nn_incremental
N_EPOCH = 1000
    
def nn(X, y, index):
    """Score an ``MLPClassifier`` with hidden layers (20, 8) over 100
    train/test rounds.

    Only the rows of ``X`` / ``y`` selected by ``index`` take part. In every
    round a partition is requested from ``create_splits`` with a test ratio of
    0.2, a fresh network is fitted on the training part (at most ``N_EPOCH``
    iterations) and evaluated on the test part. Progress is shown with a tqdm
    bar.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    index : integer array with the rows to use

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, X.shape[0]); row r carries the predictions of round r at the
        original positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100
    total_rows = X.shape[0]
    features, labels = X[index], y[index]

    scores = np.zeros(N_FOLDS)
    output = -np.ones((N_FOLDS, total_rows))
    for fold_i in tqdm(range(N_FOLDS), total=N_FOLDS):
        train_split, test_split = create_splits(labels, 0.2)

        model = MLPClassifier(hidden_layer_sizes=(20, 8),
                              solver='adam',
                              alpha=1e-5,
                              learning_rate='adaptive',
                              max_iter=N_EPOCH)
        y_hat = model.fit(features[train_split],
                          labels[train_split]).predict(features[test_split])

        scores[fold_i] = accuracy_score(labels[test_split], y_hat)
        # test_split is relative to the subset; map it back through index
        output[fold_i, index[test_split]] = y_hat

    return (scores, output)

def nn_incremental(X, y, idx_experts, idx_crowd):
    """Train an ``MLPClassifier`` on expert samples, then continue training on
    crowd samples with ``partial_fit``, over 100 random rounds.

    In every round 80% of ``idx_experts`` and 80% of the crowd-only indices
    (``idx_crowd`` without ``idx_experts``) are drawn without replacement for
    training; the remaining indices of both groups form the test set. The
    network is first fitted on the expert training samples and then receives
    ``N_EPOCH`` ``partial_fit`` calls on the full crowd training split.

    Parameters
    ----------
    X : array whose first axis indexes samples
    y : label array aligned with ``X``
    idx_experts : integer array of sample positions used for the initial fit
    idx_crowd : integer array of sample positions used for the updates

    Returns
    -------
    (scores, output)
        ``scores`` holds the accuracy of each round. ``output`` has shape
        (100, y.shape[0]); row r carries the predictions of round r at the
        positions of that round's test samples and -1 elsewhere.
    """
    N_FOLDS = 100

    n_samples = y.shape[0]
    output = -np.ones((N_FOLDS, n_samples))
    scores = np.zeros(N_FOLDS)

    # None of the following depends on the round (or on the epoch), so it is
    # computed once instead of N_FOLDS (or N_FOLDS * N_EPOCH) times.
    idx_crowd_unique = np.setdiff1d(idx_crowd, idx_experts)
    experts_num_train = int(idx_experts.shape[0]*.8)
    crowd_num_train = int(idx_crowd_unique.shape[0]*.8)
    # NOTE(review): classes come from ALL expert labels while fit() below only
    # sees the expert training split; presumably both hold the same label set.
    classes = np.unique(y[idx_experts])

    for fold_i in tqdm(range(N_FOLDS), total=N_FOLDS):
        experts_train_split = np.random.choice(idx_experts,
                                               experts_num_train,
                                               replace=False)
        experts_test_split = np.setdiff1d(idx_experts, experts_train_split)

        crowd_train_split = np.random.choice(idx_crowd_unique,
                                             crowd_num_train,
                                             replace=False)
        crowd_test_split = np.setdiff1d(idx_crowd_unique, crowd_train_split)

        idx_test = np.union1d(experts_test_split, crowd_test_split)

        # the crowd training data is the same for every partial_fit pass
        X_crowd_train = X[crowd_train_split]
        y_crowd_train = y[crowd_train_split]

        with np.errstate(divide='ignore'):
            model = MLPClassifier(solver='adam', alpha=1e-5,
                                  learning_rate='adaptive',
                                  max_iter=N_EPOCH,
                                  hidden_layer_sizes=(20, 8))
            model.fit(X[experts_train_split], y[experts_train_split])
            for _ in range(N_EPOCH):
                model.partial_fit(X_crowd_train, y_crowd_train, classes)

            y_hat = model.predict(X[idx_test])

        fold_acc = accuracy_score(y[idx_test], y_hat)
        scores[fold_i] = fold_acc
        output[fold_i, idx_test] = y_hat

    return (scores, output)

In [15]:
# Evaluate the neural network on the same four label sets. For each one the
# per-fold accuracies become a column of `results` and the per-fold prediction
# matrix is appended to `outputs`.
print("=== Results on expert pilot ===", flush=True)
nn_experts_pilot, nn_out_experts_pilot = nn(X_2D, y_experts, experts_pilot_idx)
results['nn_experts_pilot'] = nn_experts_pilot
outputs.append(nn_out_experts_pilot)

# crowd labels, restricted to the samples that also have an expert label
print("=== Results on crowd pilot ===", flush=True)
nn_crowd_pilot, nn_out_crowd_pilot = nn(X_2D, y_crowd, crowd_pilot_idx)
results['nn_crowd_pilot'] = nn_crowd_pilot
outputs.append(nn_out_crowd_pilot)

# every sample with a crowd label
print("=== Results on crowd all ===", flush=True)
nn_crowd_all, nn_out_crowd_all = nn(X_2D, y_crowd, crowd_all_idx)
results['nn_crowd_all'] = nn_crowd_all
outputs.append(nn_out_crowd_all)

# every sample with a combined label
print("=== Results on combined all ===", flush=True)
nn_combined_all, nn_out_combined_all = nn(X_2D, y_combined, combined_all_idx)
results['nn_combined_all'] = nn_combined_all
outputs.append(nn_out_combined_all)

=== Results on expert pilot ===


100%|██████████| 100/100 [00:26<00:00,  3.75it/s]

=== Results on crowd pilot ===



100%|██████████| 100/100 [00:26<00:00,  3.72it/s]

=== Results on crowd all ===



100%|██████████| 100/100 [00:35<00:00,  2.81it/s]

=== Results on combined all ===



100%|██████████| 100/100 [00:36<00:00,  2.78it/s]


In [16]:
# incremental learning: initial fit on the expert pilot samples, then further
# training on the samples that only have a crowd label; labels come from y_combined
print("=== Results on incremental learning ===", flush=True)
nn_inc, nn_out_inc = nn_incremental(X_2D, y_combined, experts_pilot_idx, crowd_all_idx)
results['nn_inc'] = nn_inc
outputs.append(nn_out_inc)

=== Results on incremental learning ===


100%|██████████| 100/100 [03:25<00:00,  2.06s/it]


# save results

In [17]:
#results.to_csv(RESULTS_FILE, sep='\t', index=False)  # tsv

In [18]:
# Turn the list of per-model prediction matrices into one array (assumes every
# entry has the same shape) and store it compressed under the key 'predictions'.
# NOTE(review): this rebinds `outputs` from a list to an ndarray, so cells that
# call outputs.append(...) cannot be re-run after this one unless the cell that
# creates the list is run again first.
outputs = np.array(outputs)
np.savez_compressed(OUTPUT_FILE, predictions = outputs)