# AC - CV vs Overfit

In [1]:
import numpy as np
random_state = 42
np.random.seed(random_state)
import os, glob
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from quantification.metrics import binary_kl_divergence, absolute_error
from quantification.utils.validation import create_bags_with_multiple_prevalence

import seaborn as sns
import matplotlib.pyplot as plt

## Dataset

In [2]:
datasets_dir = "datasets"
dataset_files = [file for file in glob.glob(os.path.join(datasets_dir, "*.csv")) if "k9" not in file]
dataset_names = [os.path.split(name)[-1][:-4] for name in dataset_files]
print("There are a total of {} datasets.".format(len(dataset_names)))

There are a total of 43 datasets.


In [3]:
n_datasets = len(dataset_names)

columns=['dataset', 'method', 'truth', 'predictions', 'kld', 'mae', 'tpr', 'fpr']
errors_df = pd.DataFrame(columns=columns)

## Utils

#### Standard scale data

In [4]:
from sklearn.preprocessing import StandardScaler

def normalize(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test


#### Load data

In [5]:
def load_data(dfile):
    df = pd.read_csv(dfile, header=None)
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values.astype(np.int)
    if -1 in np.unique(y):
        y[y == -1] = 0
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=random_state)
    
    X_train, X_test = normalize(X_train, X_test)
    
    return X_train, X_test, y_train, y_test 

## AC - 50 folds

In [6]:
from quantification.cc import AC

ac = AC(estimator_class=RandomForestClassifier(n_estimators=50, max_depth=5))
for dname, dfile in tqdm(zip(dataset_names, dataset_files), total=n_datasets):
    
    X_train, X_test, y_train, y_test = load_data(dfile)
    
    ac.fit(X_train, y_train, cv=50)
    for X_test_, y_test_, prev_true, in create_bags_with_multiple_prevalence(X_test, y_test, 100):
        prev_true = prev_true[1]
        prev_pred = ac.predict(X_test_)[1]

        kld = binary_kl_divergence(prev_true, prev_pred)
        mae = absolute_error(prev_true, prev_pred)
 
        errors_df = errors_df.append(pd.DataFrame([[dname, 'AC50folds', prev_true, prev_pred, kld, mae, ac.tpr_[1], ac.fpr_[1]]], columns=columns))

  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
100%|██████████| 43/43 [05:22<00:00,  7.51s/it]


## AC - overfit

In [7]:
from quantification.cc import AC

ac = AC(estimator_class=RandomForestClassifier(n_estimators=50, max_depth=5))
for dname, dfile in tqdm(zip(dataset_names, dataset_files), total=n_datasets):
    
    X_train, X_test, y_train, y_test = load_data(dfile)
    
    ac.fit(X_train, y_train, cv=1)
    for X_test_, y_test_, prev_true, in create_bags_with_multiple_prevalence(X_test, y_test, 100):
        prev_true = prev_true[1]
        prev_pred = ac.predict(X_test_)[1]

        kld = binary_kl_divergence(prev_true, prev_pred)
        mae = absolute_error(prev_true, prev_pred)
 
        errors_df = errors_df.append(pd.DataFrame([[dname, 'ACof', prev_true, prev_pred, kld, mae, ac.tpr_[1], ac.fpr_[1]]], columns=columns))

  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
100%|██████████| 43/43 [01:44<00:00,  2.43s/it]


In [8]:
errors_df.reset_index(inplace=True, drop=True)

In [9]:
errors_df.groupby(['method'])[['kld', 'mae']].agg(['mean', 'median'])

Unnamed: 0_level_0,kld,kld,mae,mae
Unnamed: 0_level_1,mean,median,mean,median
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AC50folds,1.67501,0.00624,0.10507,0.04202
ACof,1.20477,0.0104,0.12417,0.05469


In [10]:
from bokeh.plotting import figure, output_notebook, show
import seaborn as sns
import itertools
output_notebook()

palette = sns.palettes.color_palette('colorblind', n_datasets)
colors = itertools.cycle(palette.as_hex())
p = figure(plot_width=800, plot_height=1400)

for dname, color in zip(dataset_names, colors):
    df = errors_df[np.logical_and(errors_df.dataset==dname, errors_df.method=="AC50folds")]
    p.scatter(df['truth'], df['mae'], color=color, legend=dname)

p.legend.location = "top_left"
p.legend.click_policy="hide"


In [11]:
show(p)

In [12]:
from bokeh.plotting import figure, output_notebook, show
import seaborn as sns
import itertools
output_notebook()

palette = sns.palettes.color_palette('colorblind', n_datasets)
colors = itertools.cycle(palette.as_hex())
p = figure(plot_width=800, plot_height=1400)

for dname, color in zip(dataset_names, colors):
    df = errors_df[np.logical_and(errors_df.dataset==dname, errors_df.method=="ACof")]
    p.scatter(df['truth'], df['mae'], color=color, legend=dname)

p.legend.location = "top_left"
p.legend.click_policy="hide"



In [13]:
show(p)

In [14]:
errors_df[np.logical_and(errors_df.method=="ACof", errors_df.dataset=="semeion.8")]

Unnamed: 0,dataset,method,truth,predictions,kld,mae,tpr,fpr
4900,semeion.8,ACof,0.15193,0.04759,0.11247,0.10434,0.82927,0.00000
4901,semeion.8,ACof,0.77590,0.21089,1.05121,0.56501,0.82927,0.00000
4902,semeion.8,ACof,0.60356,0.17845,0.64429,0.42511,0.82927,0.00000
4903,semeion.8,ACof,0.39198,0.10274,0.41585,0.28923,0.82927,0.00000
4904,semeion.8,ACof,0.34327,0.10058,0.30998,0.24269,0.82927,0.00000
4905,semeion.8,ACof,0.48888,0.14925,0.46113,0.33963,0.82927,0.00000
4906,semeion.8,ACof,0.48418,0.16980,0.37779,0.31438,0.82927,0.00000
4907,semeion.8,ACof,0.16354,0.05083,0.12316,0.11271,0.82927,0.00000
4908,semeion.8,ACof,0.24627,0.05948,0.26404,0.18679,0.82927,0.00000
4909,semeion.8,ACof,0.35989,0.11140,0.30597,0.24849,0.82927,0.00000


In [16]:
errors_df.groupby(["method", "dataset"])[["tpr", "fpr", "mae"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,tpr,fpr,mae
method,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AC50folds,acute.a,1.00000,0.00000,0.00365
AC50folds,acute.b,0.94118,0.00000,0.03056
AC50folds,balance.1,0.87000,0.09000,0.01396
AC50folds,balance.2,0.00000,0.00000,0.57980
AC50folds,balance.3,0.83000,0.10000,0.02071
AC50folds,breast-cancer,0.97000,0.05000,0.00739
AC50folds,cmc.1,0.48700,0.19300,0.08218
AC50folds,cmc.2,0.14000,0.03571,0.25045
AC50folds,cmc.3,0.13500,0.08200,0.18855
AC50folds,coil,0.00000,0.00000,0.50567
