In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

from scipy.integrate import simpson
from numpy import trapz
from sklearn import metrics

import math
from tqdm.notebook import tqdm

In [2]:
df1_full = pd.read_csv("./data/roc_curve_method1/base_model/data/full_data.csv", index_col=0)
df2_full = pd.read_csv("./data/roc_curve_method2/base_model/data/full_data.csv", index_col=0)

In [3]:
df1_job = pd.read_csv("./data/roc_curve_method1/job_model/data/full_data.csv", index_col=0)
df2_job = pd.read_csv("./data/roc_curve_method2/job_model/data/full_data.csv", index_col=0)

In [4]:
df1_concept = pd.read_csv("./data/roc_curve_method1/concept_model/data/full_data.csv", index_col=0)
df2_concept = pd.read_csv("./data/roc_curve_method2/concept_model/data/full_data.csv", index_col=0)

In [86]:
# generating auc score
# area_trapz = trapz(df1['tpr'], df1['threshold'])
# area_simpson = simpson(df1['tpr'], df1['threshold'])
# print(f"trapz: {area_trapz:.4f}, simpson: {area_simpson:.4f}, diff: {area_trapz - area_simpson}")

trapz: 0.6022, simpson: 0.6022, diff: 1.1102230246251565e-16


In [88]:
print("AUC Score Evaluation:")
print(f"Full Model: method1: {trapz(df1_full['tpr'], df1_full['threshold']):.4f}, method2: {trapz(df2_full['tpr'], df2_full['threshold']):.4f}")
print(f"Job Model: method1: {trapz(df1_job['tpr'], df1_job['threshold']):.4f}, method2: {trapz(df2_job['tpr'], df2_job['threshold']):.4f}")
print(f"Concept Model: method1: {trapz(df1_concept['tpr'], df1_concept['threshold']):.4f}, method2: {trapz(df2_concept['tpr'], df2_concept['threshold']):.4f}")

AUC Score Evaluation:
Full Model: method1: 0.6022, method2: 0.7028
Job Model: method1: 0.6044, method2: 0.7046
Concept Model: method1: 0.6044, method2: 0.7047


In [8]:
# plot ROC curve
df_tmp = df2_job
fig, ax = plt.subplots()
scatter = ax.scatter(df_tmp['fpr'], df_tmp['tpr'], 
                        c=df_tmp['threshold'],   # Use 'threshold' for color
                        cmap='hsv',        # Choose a colormap
                        label='Data Points')

# add labels
cbar = plt.colorbar(scatter, label='Threshold')

# ax.plot(df2['fpr'], df2['tpr'], color="red", linewidth=3, label="ROC Curve")

# generate and display auc score
auc = metrics.auc(df_tmp['fpr'], df_tmp['tpr'])

textstr = f'AUC={auc:.2f}'
# props = dict(boxstyle='round', facecolor='lightgrey', alpha=0.5)
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(0.73, 0.1, textstr, transform=ax.transAxes, fontsize=12,
        verticalalignment='top', bbox=props)
# plt.text(0.72, 0, f'AUC: {auc:.2f}', fontsize = 14)

plt.xlabel("FP Rate")
plt.ylabel("TP Rate")

plt.savefig(
     './data/roc_curve_method2/job_model/full_data.png',
     dpi=300,
)
plt.close()

# plt.show()

In [13]:
# get all files in folder
files = [os.path.normpath(f) for f in glob.glob(os.path.join("./data/roc_curve_method1/job_model/data", "*.csv"))]

for idx, file in enumerate(files):
    if os.path.basename(file) == 'full_data.csv':
        files.pop(idx)

# generate plot figure
number_of_subplots = math.ceil(len(files) / 5)
fig, axes = plt.subplots(number_of_subplots, 5, figsize=(24, 8), gridspec_kw={'hspace': 0.4, 'wspace': 0.3})
# Flatten the 2D array of axes for easier indexing
axes = axes.flatten()

with tqdm(total=len(files)) as pbar:
    for i in range(len(files)):
        df = pd.read_csv(files[i], index_col=0)
        category = os.path.basename(files[i]).split('.')[0]
        fpr_arr = df['fpr'].to_list()
        tpr_arr = df['tpr'].to_list()
        threshold_settings = df['threshold']

        index = i*len(threshold_settings)
            
        # plot subplot            
        scatter = axes[i].scatter(fpr_arr, tpr_arr, 
                                    c=threshold_settings,
                                    cmap='hsv',
                                    label='Data Points')

        # add AUC score
        auc = metrics.auc(fpr_arr, tpr_arr)
        
        textstr = f'AUC={auc:.2f}'
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        axes[i].text(0.67, 0.12, textstr, transform=axes[i].transAxes, fontsize=12,
                verticalalignment='top', bbox=props)

        # set labels
        axes[i].text(0.5, -0.25, f'{chr(97 + i)}) {category}', fontsize=12, horizontalalignment='center', verticalalignment='center', transform=axes[i].transAxes)
        axes[i].set_xlabel('FP Rate')
        axes[i].set_ylabel('TP Rate')
         
        pbar.update(1)

fig.subplots_adjust(right=0.87)
cbar_ax = fig.add_axes([0.88, 0.12, 0.01, 0.76])
fig.colorbar(scatter, cax=cbar_ax)

# plt.show()
plt.savefig(
    './data/roc_curve_method1/job_model/all_categories.png',
    dpi=400,
)
plt.close()

  0%|          | 0/10 [00:00<?, ?it/s]