In [None]:
!pip install sklearn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Value parameters used by the cost-based analysis:
#   Vr   - value given to an item whose prediction is rejected (confidence below threshold)
#   Vc   - value given to an accepted, correct prediction
#   Vw_g - candidate values for an accepted, wrong prediction: 0, -0.1, ... (stop -10.1 is exclusive)
#   t_g  - candidate confidence thresholds: 0, 0.01, ... (stop 1.02 is exclusive)
Vr, Vc = 0.0, 1.0
Vw_g = list(np.arange(0, -10.1, -0.1))
t_g = list(np.arange(0, 1.02, 0.01))

# Names of the models and of the dataset; used to build the result file names.
modelName_MLP4, modelName_MLP1, modelName_log = 'MLP4', 'MLP1', 'LogReg'
datasetName = 'yelp'

In [None]:
# Directory that holds the input CSV files (edit to point at your dataset).
path = '.../yelp/data/'
# Directory where result files and figures are written.
res_path = '.../yelp/res/'
resFileNameMLP1 = "yelp-MLP1_vectorized"
resFileNameMLP4 = "yelp-MLP4_vectorized"
resFileNameLog = "yelp-log_vectorized"


def _read_headerless_csv(file_name):
    """Read ``path + file_name`` as a CSV file that has no header row."""
    return pd.read_csv(path + file_name, header=None)


# Per-model outputs for the validation and test splits. Each model keeps both
# the DataFrame (``*_pd_*``) and its NumPy view.
# NOTE(review): the files are named "logits_*" but later cells threshold the
# row maxima as confidences - confirm they actually contain probabilities.
proba_val_pd_MLP1 = _read_headerless_csv('logits_val_mlp1.csv')
proba_val_MLP1 = proba_val_pd_MLP1.to_numpy()
proba_test_pd_MLP1 = _read_headerless_csv('logits_test_mlp1.csv')
proba_test_MLP1 = proba_test_pd_MLP1.to_numpy()

proba_val_pd_MLP4 = _read_headerless_csv('logits_val.csv')
proba_val_MLP4 = proba_val_pd_MLP4.to_numpy()
proba_test_pd_MLP4 = _read_headerless_csv('logits_test.csv')
proba_test_MLP4 = proba_test_pd_MLP4.to_numpy()

proba_val_pd_log = _read_headerless_csv('logits_val_log.csv')
proba_val_log = proba_val_pd_log.to_numpy()
proba_test_pd_log = _read_headerless_csv('logits_test_log.csv')
proba_test_log = proba_test_pd_log.to_numpy()

# Ground-truth labels: first column of each label file, cast to int.
y_val_pd = _read_headerless_csv('y_val.csv')
y_val = y_val_pd[0].to_numpy().astype(int)
y_test_pd = _read_headerless_csv('y_test.csv')
y_test = y_test_pd[0].to_numpy().astype(int)

In [None]:
def cost_based_threshold(k):
    """Return the confidence threshold ``k / (k + 1)`` for the cost ratio ``k``.

    Parameters
    ----------
    k : number
        Cost ratio. ``k == -1`` is not handled and divides by zero.

    Returns
    -------
    number
        ``k / (k + 1)``.
    """
    return k / (k + 1)

def calculate_value(y_hat_proba, y, t, Vr, Vc, Vw):
    """Average per-item value of classifying with a reject option at threshold ``t``.

    For each row of ``y_hat_proba`` the predicted class is the index of the
    largest score (the first such index on ties). A row whose largest score is
    not ``>= t`` is rejected. Each item is then worth ``Vr`` if rejected,
    ``Vc`` if accepted and equal to its label, and ``Vw`` if accepted and
    different from its label.

    Parameters
    ----------
    y_hat_proba : array-like, shape (n_items, n_classes)
        Per-class scores for every item.
    y : array-like, shape (n_items,)
        True class indices.
    t : float
        Confidence threshold applied to the row maximum.
    Vr, Vc, Vw : number
        Value of a rejected, a correct and a wrong prediction respectively.

    Returns
    -------
    numpy.float64
        Sum of the per-item values divided by the number of items; NaN when
        both inputs are empty.

    Raises
    ------
    ValueError
        If ``y_hat_proba`` is not 2-D, ``y`` is not 1-D, or their lengths differ.
    """
    scores = np.asarray(y_hat_proba)
    labels = np.asarray(y)

    # Nothing to average over: the mean is undefined.
    if scores.size == 0 and labels.size == 0:
        return np.float64(np.nan)

    if scores.ndim != 2 or labels.ndim != 1 or scores.shape[0] != labels.shape[0]:
        raise ValueError(
            "y_hat_proba must have shape (n_items, n_classes) and y shape (n_items,); "
            "got {} and {}".format(scores.shape, labels.shape)
        )

    n_items = labels.shape[0]
    confidence = scores.max(axis=1)
    predicted = scores.argmax(axis=1)  # first index of the row maximum
    # Negating ">=" (instead of using "<") keeps NaN confidences rejected.
    rejected = ~(confidence >= t)

    # Force a float array: letting NumPy infer the dtype from an integer Vc
    # would truncate fractional Vw / Vr values on assignment.
    value_vector = np.full(n_items, Vc, dtype=float)
    value_vector[predicted != labels] = Vw
    # Rejection overrides correctness, so it is applied last.
    value_vector[rejected] = Vr

    return np.sum(value_vector) / n_items

def find_optimum_confidence_threshold(y_hat_proba, y, t_list, Vr, Vc, Vw):
    """Pick the threshold in ``t_list`` that maximises ``calculate_value``.

    Parameters
    ----------
    y_hat_proba, y :
        Scores and labels forwarded unchanged to ``calculate_value``.
    t_list : iterable of float
        Candidate confidence thresholds.
    Vr, Vc, Vw : number
        Values forwarded unchanged to ``calculate_value``.

    Returns
    -------
    (float, dict)
        The smallest threshold that reaches the maximum value, and a dict
        mapping each threshold (formatted as a string) to its value.
    """
    # One entry per candidate threshold, keyed by the threshold's string form.
    cost_list = {
        "{}".format(threshold): calculate_value(y_hat_proba, y, threshold, Vr, Vc, Vw)
        for threshold in t_list
    }

    best_value = max(cost_list.values())

    # Among the thresholds tied for the best value, keep the lowest one.
    optimumT = min(
        float(key) for key, achieved in cost_list.items() if achieved == best_value
    )

    return optimumT, cost_list

# Cost-based calibration analysis
def cost_based_analysis(y_hat_proba_val, y_val, y_hat_proba_test, y_test, res_path, logfile_name, Vr, Vc, Vw_list, confT_list):
    """Write a CSV comparing the analytic and the empirically tuned threshold.

    The file ``res_path + logfile_name + "_costBased_test.csv"`` is created
    (overwriting any existing file) with one row per entry of ``Vw_list``:

    * ``k`` = ``-Vw / Vc`` and ``t`` = ``cost_based_threshold(k)``;
    * ``value``: ``calculate_value`` on the test data at threshold ``t``;
    * ``t_optimal``: threshold chosen from ``confT_list`` on the validation
      data by ``find_optimum_confidence_threshold``;
    * ``value_optimal``: ``calculate_value`` on the test data at ``t_optimal``.

    Returns ``None``.
    """
    rc_path = res_path + logfile_name + "_costBased_test.csv"
    row_template = '{}, {}, {}, {}, {}, {}, {}, {}\n'

    def value_on_test(threshold, wrong_value):
        # Value obtained on the test split for a given threshold.
        return calculate_value(y_hat_proba_test, y_test, threshold, Vr, Vc, wrong_value)

    # Start a fresh log that holds only the header row.
    with open(rc_path, 'w') as log:
        log.write('Vr, Vc, Vw, k, t, value, t_optimal, value_optimal' + '\n')

    for wrong_value in Vw_list:
        # Threshold derived analytically from the cost ratio.
        k = (-1) * (wrong_value / Vc)
        t = cost_based_threshold(k)
        value = value_on_test(t, wrong_value)

        # Threshold tuned on the validation split, then scored on the test split.
        t_optimal, _ = find_optimum_confidence_threshold(
            y_hat_proba_val, y_val, confT_list, Vr, Vc, wrong_value
        )
        value_optimal = value_on_test(t_optimal, wrong_value)

        # Append the row right away so finished rows are on disk after each iteration.
        with open(rc_path, 'a') as log:
            log.write(row_template.format(Vr, Vc, wrong_value, k, t, value, t_optimal, value_optimal))
  

def plot_confidence_hist(dt, name, res_path, bin, figName):
    """Plot a histogram of column ``name`` of ``dt`` and save it to ``figName``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the values to plot.
    name : str
        Column of ``dt`` to plot.
    res_path : str
        Not used by this function.
    bin : int or sequence
        Forwarded as ``bins`` to ``DataFrame.hist``.
    figName : str
        Destination passed to ``plt.savefig``.
    """
    label_style = dict(labelpad=20, weight='bold', size=12)

    axes_grid = dt.hist(column=name, color='#86bf91', bins=bin)

    # Label every axes in the first row of the returned grid; no title is set.
    for axis in axes_grid[0]:
        axis.set_xlabel("Confidence", **label_style)
        axis.set_ylabel("Number of items", **label_style)

    # Saves the current pyplot figure.
    plt.savefig(figName)

In [None]:
# Cost-based analysis for the MLP1 model; the CSV log is written under res_path.
logfile_name = "{}_{}_vectorized".format(datasetName, modelName_MLP1)
cost_based_analysis(
    y_hat_proba_val=proba_val_MLP1,
    y_val=y_val,
    y_hat_proba_test=proba_test_MLP1,
    y_test=y_test,
    res_path=res_path,
    logfile_name=logfile_name,
    Vr=Vr,
    Vc=Vc,
    Vw_list=Vw_g,
    confT_list=t_g,
)

In [None]:
# Cost-based analysis for the MLP4 model; the CSV log is written under res_path.
logfile_name = "{}_{}_vectorized".format(datasetName, modelName_MLP4)
cost_based_analysis(
    y_hat_proba_val=proba_val_MLP4,
    y_val=y_val,
    y_hat_proba_test=proba_test_MLP4,
    y_test=y_test,
    res_path=res_path,
    logfile_name=logfile_name,
    Vr=Vr,
    Vc=Vc,
    Vw_list=Vw_g,
    confT_list=t_g,
)

In [None]:
# Cost-based analysis for the logistic-regression model; the CSV log is written under res_path.
logfile_name = "{}_{}_vectorized".format(datasetName, modelName_log)
cost_based_analysis(
    y_hat_proba_val=proba_val_log,
    y_val=y_val,
    y_hat_proba_test=proba_test_log,
    y_test=y_test,
    res_path=res_path,
    logfile_name=logfile_name,
    Vr=Vr,
    Vc=Vc,
    Vw_list=Vw_g,
    confT_list=t_g,
)

In [None]:
# Confidence of an item = largest score in its row of the MLP4 outputs.
dt_val = pd.DataFrame({'val ': np.amax(proba_val_MLP4, axis=1)})
dt_test = pd.DataFrame({'test ': np.amax(proba_test_MLP4, axis=1)})

# One 20-bin histogram per split, saved as PNG under res_path.
plot_confidence_hist(
    dt=dt_val,
    name='val ',
    res_path=res_path,
    bin=20,
    figName=res_path + "yelp_MLP4_val_conf.png",
)
plot_confidence_hist(
    dt=dt_test,
    name='test ',
    res_path=res_path,
    bin=20,
    figName=res_path + "yelp_MLP4_test_conf.png",
)