# Dataset characteristics

In this notebook we present the characteristics of the benchmark datasets in terms of type of target (binary or multiclass), proportion of the positive class, number of features and number of instances.

In [5]:
import pandas as pd
import os

In [4]:
os.chdir("..")
root_path = os.getcwd()

In [8]:
path_csv = os.path.join(root_path, 'datasets')
os.chdir(path_csv)

total_name_list = []
target_type_list = []
class1_proportion_list = []
num_rows_list = []
num_columns_list = []

multiclass_name_list = []
class_list = []
class_proportion_list = []

for filename in os.listdir(path_csv):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        target = df['y']
        
        unique_classes = target.nunique()
        if unique_classes == 2:
            target_type = 'binary'
            class1_proportion = target.value_counts(normalize=True).get(1, 0)
        else:
            target_type = 'multiclass'
            class1_proportion = None
            class_proportions = target.value_counts(normalize=True)
            for class_value, proportion in class_proportions.items():
                multiclass_name_list.append(filename)
                class_list.append(class_value)
                class_proportion_list.append(proportion)

        total_name_list.append(filename)
        target_type_list.append(target_type)
        class1_proportion_list.append(class1_proportion)
        num_rows_list.append(df.shape[0])      
        num_columns_list.append(df.shape[1])   

general_df = pd.DataFrame({
    'dataset_name': total_name_list,
    'target_type': target_type_list,
    'class1_proportion': class1_proportion_list,
    'num_rows': num_rows_list,
    'num_columns': num_columns_list
})

multiclass_df = pd.DataFrame({
    'dataset_name': multiclass_name_list,
    'class': class_list,
    'class_proportion': class_proportion_list
})


In [9]:
general_df

Unnamed: 0,dataset_name,target_type,class1_proportion,num_rows,num_columns
0,teaching_assistant_MH.csv,binario,0.509804,102,4
1,cleveland.csv,multiclase,,303,14
2,contraceptive_NL.csv,binario,0.346154,962,10
3,hill_valley_without_noise_traintest.csv,binario,0.50495,1212,101
4,glass0.csv,binario,0.327103,214,10
5,saheart.csv,binario,0.34632,462,10
6,breast-w.csv,binario,0.344778,699,10
7,contraceptive_LS.csv,binario,0.60545,844,10
8,yeast1.csv,binario,0.289084,1484,9
9,ilpd.csv,binario,0.284974,579,11


We are finally not using multiclass dataset since the sampling needs some specific modifications to ensure that every class is correctly represented. Besides, we only have 3 multiclass datasets and we're developing the method for Boosting only for binary targets. Therefore, we delete cleveland, segment and analcatdata_authorship datasets.

In [10]:
multiclass_df

Unnamed: 0,dataset_name,class,class_proportion
0,cleveland.csv,0,0.541254
1,cleveland.csv,1,0.181518
2,cleveland.csv,2,0.118812
3,cleveland.csv,3,0.115512
4,cleveland.csv,4,0.042904
5,segment.csv,5,0.142857
6,segment.csv,2,0.142857
7,segment.csv,6,0.142857
8,segment.csv,0,0.142857
9,segment.csv,3,0.142857


We delete those datasets:

In [11]:
path_csv = os.path.join(root_path, 'datasets')
os.chdir(path_csv)

total_name_list = []
target_type_list = []
class1_proportion_list = []
num_rows_list = []
num_columns_list = []

multiclass_name_list = []
class_list = []
class_proportion_list = []

for filename in os.listdir(path_csv):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        target = df['y']
        
        unique_classes = target.nunique()
        if unique_classes == 2:
            target_type = 'binary'
            class1_proportion = target.value_counts(normalize=True).get(1, 0)
        else:
            target_type = 'multiclass'
            class1_proportion = None

        total_name_list.append(filename)
        target_type_list.append(target_type)
        class1_proportion_list.append(class1_proportion)
        num_rows_list.append(df.shape[0])      
        num_columns_list.append(df.shape[1])   

general_df_binary = pd.DataFrame({
    'dataset_name': total_name_list,
    'target_type': target_type_list,
    'class1_proportion': class1_proportion_list,
    'num_rows': num_rows_list,
    'num_columns': num_columns_list
})



In [12]:
general_df_binary

Unnamed: 0,dataset_name,target_type,class1_proportion,num_rows,num_columns
0,teaching_assistant_MH.csv,binary,0.509804,102,4
1,contraceptive_NL.csv,binary,0.346154,962,10
2,hill_valley_without_noise_traintest.csv,binary,0.50495,1212,101
3,glass0.csv,binary,0.327103,214,10
4,saheart.csv,binary,0.34632,462,10
5,breast-w.csv,binary,0.344778,699,10
6,contraceptive_LS.csv,binary,0.60545,844,10
7,yeast1.csv,binary,0.289084,1484,9
8,ilpd.csv,binary,0.284974,579,11
9,phoneme.csv,binary,0.293486,5404,6
