In [36]:
import json
import openml
from scipy.stats import ttest_rel
import lightgbm as lgb
from src import DataBinner
import numpy as np
# Experiment dimensions. None of these four values is referenced in the
# visible part of this file; presumably they describe the benchmark grid
# (datasets x models x binning methods x seeds) -- TODO confirm.
num_datasets = 18
num_models = 3
num_binning_methods = 3
num_seeds = 20

# OpenML benchmark suite whose task list is indexed by the reporting loop below.
benchmark_suite = openml.study.get_suite(336) # suite 336 is used here; switch to 337 for classification

def sig_func(p_val):
    """Translate a p-value into the star marker printed in the results table.

    Returns '***' for p < 0.01, '**' for p < 0.05, '*' for p < 0.1 and an
    empty string otherwise. All comparisons are strict, so a value that
    compares False against every threshold (e.g. NaN) yields ''.
    """
    # Thresholds are ordered from strictest to loosest; the first match wins.
    star_levels = ((0.01, '***'), (0.05, '**'), (0.1, '*'))
    for cutoff, stars in star_levels:
        if p_val < cutoff:
            return stars
    return ''
    
# For each selected task of the benchmark suite, print one LaTeX table row with
# the significance markers of a paired t-test comparing minibatch-k-means and
# linspace binning against quantile binning, for every model.
for idx in range(11, 15):
    if idx == 3:
        # Index 3 is excluded whenever the range above is widened to cover it.
        continue

    task_id = benchmark_suite.tasks[idx]
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    # Escape underscores so the dataset name is valid LaTeX text. The backslash
    # is written as "\\" because "\_" is not a valid Python escape sequence.
    name = dataset.name.replace("_", "\\_")

    with open(f"../regression_results/regression_results_{idx}.json", "r", encoding="utf-8") as f:
        results = json.load(f)

    # Per-binning-method results, each keyed by model name.
    minibatch_dict = results['minibatch_kmeans']
    linspace_dict = results['linspace']
    quantile_dict = results['quantile']

    # Relative change (in percent) of the mean score versus quantile binning.
    improvement_kmeans = {}
    improvement_linspace = {}

    # Star markers derived from the paired t-test p-values.
    significance_kmeans = {}
    significance_linspace = {}

    for model in ['LGBM', 'CAT', 'XGB']:
        kmeans_results = minibatch_dict[model]
        linspace_results = linspace_dict[model]
        quantile_results = quantile_dict[model]

        kmeans_mean = np.mean(kmeans_results)
        linspace_mean = np.mean(linspace_results)
        quantile_mean = np.mean(quantile_results)

        improvement_kmeans[model] = 100 * (quantile_mean - kmeans_mean) / quantile_mean
        improvement_linspace[model] = 100 * (quantile_mean - linspace_mean) / quantile_mean

        # Paired test: the result lists are compared element by element, so
        # they are assumed to be aligned (same runs in the same order) --
        # TODO confirm against the code that writes the JSON files.
        p_val = ttest_rel(kmeans_results, quantile_results).pvalue
        significance_kmeans[model] = sig_func(p_val)

        p_val = ttest_rel(linspace_results, quantile_results).pvalue
        significance_linspace[model] = sig_func(p_val)

    # Alternative row with the relative improvements instead of the markers:
    # print(f"{name} & {improvement_kmeans['LGBM']:.2f} & {improvement_kmeans['XGB']:.2f} & {improvement_kmeans['CAT']:.2f} & {improvement_linspace['LGBM']:.2f} & {improvement_linspace['XGB']:.2f} & {improvement_linspace['CAT']:.2f} \\\\")

    # Column order: k-means (LGBM, XGB, CAT), then linspace (LGBM, XGB, CAT).
    print(f"{name} & {significance_kmeans['LGBM']} & {significance_kmeans['XGB']} & {significance_kmeans['CAT']} & {significance_linspace['LGBM']} & {significance_linspace['XGB']} & {significance_linspace['CAT']} \\\\")
    
    

house\_sales & *** & *** & *** & *** &  &  \\
sulfur &  &  &  &  &  &  \\
medical\_charges & *** & *** & *** & *** & *** & *** \\
MiamiHousing2016 &  &  & * &  &  &  \\
