# Is AC just a 2-bin HDy?

This notebook tries to show that AC (Adjusted Count) is just a special case of HDy: the one obtained when HDy uses only 2 bins.

## Preparation

In [1]:
import glob
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

# Single seed reused by the stochastic steps of this notebook: it is passed to
# train_test_split and to LogisticRegression further down.
random_state=42
# Also seed NumPy's global random generator with the same value.
np.random.seed(random_state)

In [2]:
datasets_dir = "datasets"
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# keeps the dataset order (and therefore the row order of the results table)
# identical across machines and runs. Files whose path contains "k9" are skipped.
dataset_files = sorted(
    file for file in glob.glob(os.path.join(datasets_dir, "*.csv")) if "k9" not in file
)
# Dataset name = file name without its directory and without the ".csv" suffix
# (dots inside the name, e.g. "cmc.1", are preserved).
dataset_names = [os.path.splitext(os.path.basename(name))[0] for name in dataset_files]
n_datasets = len(dataset_names)
print("There are a total of {} datasets.".format(n_datasets))

There are a total of 43 datasets.


## Utils

#### Standard scale data

In [3]:
from sklearn.preprocessing import StandardScaler

def normalize(X_train, X_test):
    """Standardize both splits using statistics learned from the training split.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``, so the test split never influences the scaling.

    Returns:
        The transformed ``(X_train, X_test)`` pair.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

#### Load data

In [4]:
from sklearn.model_selection import train_test_split

def load_data(dfile):
    """Read one dataset CSV and return a standardized train/test split.

    The file is read without a header row; every column except the last is
    used as a feature and the last column is the integer class label. Labels
    equal to -1 are recoded to 0.

    Args:
        dfile: Path of the CSV file to load.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, where the feature matrices have
        been standardized by ``normalize`` (scaler fitted on the training part).
    """
    df = pd.read_csv(dfile, header=None)
    X = df.iloc[:, :-1].values
    # Use the builtin int: the np.int alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    y = df.iloc[:, -1].values.astype(int)
    # Recode -1 labels to 0; this is a no-op when no -1 is present.
    y[y == -1] = 0

    # test_size=0.7: only 30% of the rows are used for training.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.7, random_state=random_state
    )

    X_train, X_test = normalize(X_train, X_test)

    return X_train, X_test, y_train, y_test

In [5]:
# Results table: one row per dataset (indexed by dataset name), with the true
# prevalence and the HDy / AC estimates, all initialised to zero and filled in later.
zero_columns = {column: np.zeros(n_datasets) for column in ("truth", "hdy", "ac")}
preds_df = pd.DataFrame(dict(dataset=dataset_names, **zero_columns)).set_index('dataset')

## Algorithms

In [6]:
from quantification.cc import BaseCC
from sklearn.linear_model import LogisticRegression
# For every dataset: fit one quantifier, estimate the prevalence with both the
# "hdy" and the "ac" methods, and store the estimates next to the true value.
for dname, dfile in tqdm(zip(dataset_names, dataset_files), total=n_datasets):

    # b=2 is presumably the number of bins used by HDy -- confirm against BaseCC.
    cc = BaseCC(estimator_class=LogisticRegression(random_state=random_state), b=2)
    X_train, X_test, y_train, y_test = load_data(dfile)

    cc.fit(X_train, y_train, cv=1)
    # Element [1] is presumably the estimated prevalence of the second class --
    # verify against BaseCC.predict.
    pred_hdy = cc.predict(X_test, method="hdy")[1]
    pred_ac = cc.predict(X_test, method="ac")[1]

    # True prevalence of the second (largest) label present in the test split.
    prev_true = np.unique(y_test, return_counts=True)[1][1] / len(X_test)

    # Write with a single .loc[row, column] call. The chained form
    # `preds_df.loc[dname].hdy = ...` may assign to a temporary copy and is a
    # silent no-op under pandas Copy-on-Write.
    preds_df.loc[dname, "hdy"] = pred_hdy
    preds_df.loc[dname, "ac"] = pred_ac
    preds_df.loc[dname, "truth"] = prev_true

  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
  adjusted = (relative_freq - self.fpr_[cls]) / float(self.tpr_[cls] - self.fpr_[cls])
100%|██████████| 43/43 [00:10<00:00,  4.09it/s]


In [7]:
# Display the per-dataset AC and HDy estimates alongside the true prevalence.
preds_df

Unnamed: 0_level_0,ac,hdy,truth
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mammographic,0.542819,0.542772,0.480207
ionosphere,0.298433,0.29844,0.337398
german,0.736662,0.736685,0.717143
cmc.1,0.34017,0.340187,0.414729
normwine.1,0.376,0.37602,0.328
balance.3,0.490582,0.490528,0.452055
semeion.8,0.080645,0.080637,0.102151
spambase,0.403776,0.403788,0.397082
wine-quality-red,0.447185,0.447211,0.542857
acute.a,0.511905,0.511862,0.511905
