In [None]:
!pip install sklearn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Cost-based parameters.
# Vr: value credited for a rejected item; Vc: value credited for an accepted
# item that is neither a false positive nor a false negative.
Vr, Vc = 0.0, 1.0

# Candidate values of a wrong prediction, swept from 0 downwards in steps of 0.1
# (one independent list per error type).
Vw_list_fn = [v for v in np.arange(0, -10.1, -0.1)]
Vw_list_fp = [v for v in np.arange(0, -10.1, -0.1)]

# Candidate confidence thresholds searched on the validation set.
confT_list = [c for c in np.arange(0, 1.02, 0.01)]

In [None]:
# Example configuration for the clickbait dataset.
# `path` is the dataset root (placeholder, adjust locally); results go to `res_path`.
path = '.../clickbait/'
res_path = '.../clickbait/res/'

# Base names of the per-model result files.
resFileName1 = "clickbait-model1_vectorized"
resFileName2 = "clickbait-model2_vectorized"
resFileName3 = "clickbait-model3_vectorized"
resFileName4 = "clickbait-model4_vectorized"


def _read_no_header(relative_csv):
    """Read a header-less CSV stored under `path` into a DataFrame."""
    return pd.read_csv(path + relative_csv, header=None)


# NOTE(review): the files are named logits_* while the variables are named
# proba_* and are later compared against confidence thresholds -- confirm the
# CSVs contain normalized probabilities.

# Model 1: validation and test scores.
proba_val_pd_1 = _read_no_header('logits/logits_validation_1.csv')
proba_val_1 = proba_val_pd_1.to_numpy()
proba_test_pd_1 = _read_no_header('logits/logits_test_1.csv')
proba_test_1 = proba_test_pd_1.to_numpy()

# Model 2: validation and test scores.
proba_val_pd_2 = _read_no_header('logits/logits_validation_2.csv')
proba_val_2 = proba_val_pd_2.to_numpy()
proba_test_pd_2 = _read_no_header('logits/logits_test_2.csv')
proba_test_2 = proba_test_pd_2.to_numpy()

# Model 3: validation and test scores.
proba_val_pd_3 = _read_no_header('logits/logits_validation_3.csv')
proba_val_3 = proba_val_pd_3.to_numpy()
proba_test_pd_3 = _read_no_header('logits/logits_test_3.csv')
proba_test_3 = proba_test_pd_3.to_numpy()

# Model 4: validation and test scores.
proba_val_pd_4 = _read_no_header('logits/logits_validation_4.csv')
proba_val_4 = proba_val_pd_4.to_numpy()
proba_test_pd_4 = _read_no_header('logits/logits_test_4.csv')
proba_test_4 = proba_test_pd_4.to_numpy()

# Ground-truth labels: first column of each file, cast to int.
y_val_pd = _read_no_header('groundTruth/groundTruth_validation.csv')
y_val = y_val_pd[0].to_numpy().astype(int)
y_test_pd = _read_no_header('groundTruth/groundTruth_test.csv')
y_test = y_test_pd[0].to_numpy().astype(int)

In [None]:
def cost_based_threshold(k):
    """Return the threshold ``k / (k + 1)`` for the given ratio ``k``."""
    return k / (k + 1)

def calculate_value(y_hat_proba, y, t_fp, V_fp, t_fn, V_fn, Vc, Vr):
    """Average per-item value of a classifier with a reject option.

    Each item is assigned the class with the highest score (ties go to class 0,
    as with ``np.argmax``), but only if that class's score reaches its
    threshold; otherwise the item is rejected (internal prediction ``-1``).

    Parameters
    ----------
    y_hat_proba : array-like, 2-D
        Per-item scores; column 0 is read as the negative-class score and
        column 1 as the positive-class score.
    y : array-like, 1-D
        Ground-truth labels (compared against 0 and 1).
    t_fp : float
        Minimum positive-class score required to accept a positive prediction.
    V_fp : float
        Value assigned to each false positive (predicted 1, label 0).
    t_fn : float
        Minimum negative-class score required to accept a negative prediction.
    V_fn : float
        Value assigned to each false negative (predicted 0, label 1).
    Vc : float
        Value assigned to every accepted item that is neither a false positive
        nor a false negative.
    Vr : float
        Value assigned to each rejected item.

    Returns
    -------
    float
        Sum of the per-item values divided by ``len(y)``.
    """
    # Accept plain lists as well: with a list, ``y == 0`` would be a scalar
    # False and no error would ever be penalised.
    y_hat_proba = np.asarray(y_hat_proba)
    y = np.asarray(y)

    prob_negative = y_hat_proba[:, 0]
    prob_positive = y_hat_proba[:, 1]

    # Class-wise decisions: keep the class if its score clears the threshold,
    # otherwise reject (-1).
    y_pred_neg = np.where(prob_negative >= t_fn, 0, -1)
    y_pred_pos = np.where(prob_positive >= t_fp, 1, -1)

    # Use the negative-class decision only where column 0 holds the maximum.
    negative_wins = np.argmax(y_hat_proba, axis=1) == 0
    y_pred = np.where(negative_wins, y_pred_neg, y_pred_pos)

    # Force a float array: without dtype, an integer Vc would make the array
    # integer-typed and silently truncate fractional V_fp / V_fn / Vr.
    value_vector = np.full(y_pred.shape[0], Vc, dtype=float)

    # Items deferred to a human.
    value_vector[y_pred == -1] = Vr

    # Penalties for the two error types.
    value_vector[(y_pred == 1) & (y == 0)] = V_fp
    value_vector[(y_pred == 0) & (y == 1)] = V_fn

    return np.sum(value_vector) / len(y)

def find_optimum_confidence_threshold(y_hat_proba, y, t_list, Vw_fp, Vw_fn, Vc, Vr):
    """Grid-search the (t_fp, t_fn) threshold pair with the highest value.

    Every combination of two thresholds from ``t_list`` is scored with
    ``calculate_value``. Returns a tuple ``(best_pair, cost_list)`` where
    ``best_pair`` is ``[t_fp, t_fn]`` (as floats) for the first combination,
    in iteration order, that reaches the maximum value, and ``cost_list`` maps
    the key ``"{t_fp}_{t_fn}"`` to the value of that combination.
    """
    cost_list = {
        "{}_{}".format(t_fp, t_fn): calculate_value(y_hat_proba, y, t_fp, Vw_fp, t_fn, Vw_fn, Vc, Vr)
        for t_fp in t_list
        for t_fn in t_list
    }

    # Collect every key that attains the maximum, then keep the first one.
    maxValue = max(cost_list.values())
    winners = []
    for key, score in cost_list.items():
        if score == maxValue:
            parts = key.split('_')
            winners.append([float(parts[0]), float(parts[1])])

    return winners[0], cost_list

# Cost-based calibration analysis
def cost_based_analysis(y_hat_proba_val, y_val, y_hat_proba_test, y_test, res_path, logfile_name, Vr, Vc, Vw_list_fp, Vw_list_fn, confT_list):
    """Sweep error values and log theoretical vs. empirically tuned thresholds.

    For every pair ``(Vw_fp, Vw_fn)`` this:
      * derives ``k = -Vw / Vc`` and the threshold ``cost_based_threshold(k)``
        for each error type, and scores the test set with those thresholds;
      * searches ``confT_list`` on the validation set for the best threshold
        pair and scores the test set with that pair;
      * appends one row with both results to
        ``res_path + logfile_name + "_costBased_test.csv"``.

    The CSV is (re)created with a header row at the start of the call.
    Returns ``None``.
    """
    rc_path = res_path + logfile_name + "_costBased_test.csv"
    header = 'Vr, Vc, Vw_fp, Vw_fn, k_fp, k_fn, t_fp, t_fn, value, t_optimal_fp, t_optimal_fn, value_optimal'
    row_template = '{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n'

    # Start a fresh log containing only the header.
    with open(rc_path, 'w') as log:
        log.write(header + '\n')

    for Vw_fp in Vw_list_fp:
        for Vw_fn in Vw_list_fn:
            # Thresholds derived from the cost ratios.
            k_fp = -(Vw_fp / Vc)
            k_fn = -(Vw_fn / Vc)
            t_fp = cost_based_threshold(k_fp)
            t_fn = cost_based_threshold(k_fn)
            value = calculate_value(y_hat_proba_test, y_test, t_fp, Vw_fp, t_fn, Vw_fn, Vc, Vr)

            # Thresholds tuned on validation data, then evaluated on test data.
            best_pair, _ = find_optimum_confidence_threshold(y_hat_proba_val, y_val, confT_list, Vw_fp, Vw_fn, Vc, Vr)
            best_fp, best_fn = best_pair[0], best_pair[1]
            value_optimal = calculate_value(y_hat_proba_test, y_test, best_fp, Vw_fp, best_fn, Vw_fn, Vc, Vr)

            # Append immediately so partial results survive an interrupted run.
            with open(rc_path, 'a') as log:
                log.write(row_template.format(Vr, Vc, Vw_fp, Vw_fn, k_fp, k_fn, t_fp, t_fn, value, best_fp, best_fn, value_optimal))

def plot_confidence_hist(dt, name, res_path, bin, figName):
    """Draw a histogram of column ``name`` of ``dt`` and save it to ``figName``.

    ``bin`` is forwarded as the ``bins`` argument of ``DataFrame.hist``.
    ``res_path`` is accepted but not used by this function. The figure is
    written with ``plt.savefig``, i.e. the current pyplot figure is saved.
    """
    label_style = dict(labelpad=20, weight='bold', size=12)

    axes_grid = dt.hist(column=name, color='#86bf91', bins=bin)

    # Label every axes object in the first row of the returned grid.
    for axis in axes_grid[0]:
        axis.set_xlabel("Confidence", **label_style)
        axis.set_ylabel("Number of items", **label_style)

    plt.savefig(figName)


In [None]:
# Model 1: pass the three grids defined in the parameter cell
# (the function takes Vw_list_fp, Vw_list_fn and confT_list).
cost_based_analysis(proba_val_1, y_val, proba_test_1, y_test, res_path, resFileName1, Vr, Vc, Vw_list_fp, Vw_list_fn, confT_list)

In [None]:
# Model 2: use model 2's own test scores and result file, plus the three grids
# defined in the parameter cell.
cost_based_analysis(proba_val_2, y_val, proba_test_2, y_test, res_path, resFileName2, Vr, Vc, Vw_list_fp, Vw_list_fn, confT_list)

In [None]:
# Model 3: pass the three grids defined in the parameter cell
# (the function takes Vw_list_fp, Vw_list_fn and confT_list).
cost_based_analysis(proba_val_3, y_val, proba_test_3, y_test, res_path, resFileName3, Vr, Vc, Vw_list_fp, Vw_list_fn, confT_list)

In [None]:
# Model 4: use model 4's own test scores and result file, plus the three grids
# defined in the parameter cell.
cost_based_analysis(proba_val_4, y_val, proba_test_4, y_test, res_path, resFileName4, Vr, Vc, Vw_list_fp, Vw_list_fn, confT_list)

In [None]:
# Confidence histograms: per item, the largest score across classes, for each
# model's validation file.
# NOTE(review): the proba_test*_pd / dt_test* variables hold NumPy arrays /
# frames built from the *validation* CSVs -- names kept as-is; confirm intent.
proba_test1_pd = pd.read_csv(path + 'logits/logits_validation_1.csv', header=None).to_numpy()
proba_test2_pd = pd.read_csv(path + 'logits/logits_validation_2.csv', header=None).to_numpy()
proba_test3_pd = pd.read_csv(path + 'logits/logits_validation_3.csv', header=None).to_numpy()
proba_test4_pd = pd.read_csv(path + 'logits/logits_validation_4.csv', header=None).to_numpy()

# One single-column frame per model (column labels keep their trailing space).
dt_test1 = pd.DataFrame({'val1 ': np.max(proba_test1_pd, axis=1)})
dt_test2 = pd.DataFrame({'val2 ': np.max(proba_test2_pd, axis=1)})
dt_test3 = pd.DataFrame({'val3 ': np.max(proba_test3_pd, axis=1)})
dt_test4 = pd.DataFrame({'val4 ': np.max(proba_test4_pd, axis=1)})

# 25-bin histogram per model, saved under res_path.
plot_confidence_hist(dt_test1, name='val1 ', res_path=res_path, bin=25, figName=res_path + "clickbait_1_val_conf.png")
plot_confidence_hist(dt_test2, name='val2 ', res_path=res_path, bin=25, figName=res_path + "clickbait_2_val_conf.png")
plot_confidence_hist(dt_test3, name='val3 ', res_path=res_path, bin=25, figName=res_path + "clickbait_3_val_conf.png")
plot_confidence_hist(dt_test4, name='val4 ', res_path=res_path, bin=25, figName=res_path + "clickbait_4_val_conf.png")