In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import category_encoders as ce
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score
import keel_ds
from collections import Counter

# Load Data

In [2]:
# Ask keel_ds for its catalogue of datasets and keep the last entry of the result,
# which this notebook uses as the collection of imbalanced dataset names.
# NOTE(review): the exact structure returned by list_data() is not visible here;
# the "last entry = imbalanced" layout is assumed from usage -- confirm against keel_ds.
k = keel_ds.list_data()
imbalanced_datasets_names = k[-1] # Imbalanced datasets

In [3]:
# Inspect every imbalanced dataset and collect summary statistics for each one.
#
# Notebook-level names produced by this cell:
#   imbalanced_datasets_proper -- names (trailing '.npz' removed) of the datasets that were loaded
#   dataset_details            -- one summary record (dict) per loaded dataset
#   details_df                 -- the same records as a DataFrame
imbalanced_datasets_proper = []
dataset_details = []

# Entries to skip. Original author's note: wrong dataset name in the imbalanced
# set (error in library). Defined once, outside the loop, since it never changes.
# NOTE(review): 'glass4' is the only entry without the '.npz' suffix and matching
# below is exact -- confirm that this is how the library lists it.
missing_datasets = [
    'vowel.npz',
    'breast.npz',
    'housevotes.npz',
    'bupa.npz',
    'zoo-3.npz',
    'abalone9-18.npz',
    'ecoli-0-3-4_vs_5.npz',
    'glass4',
    'lymphography-normal-fibrosis.npz',
    'ecoli-0-1-3-7_vs_2-6.npz',
    'shuttle-c2-vs-c4.npz',
    'winequality-white-9_vs_4.npz',
]

for dataset in imbalanced_datasets_names:
    print(dataset)

    if dataset in missing_datasets:
        continue

    # Remove only a trailing '.npz' extension (not an occurrence elsewhere in the name).
    if dataset.endswith('.npz'):
        dataset = dataset[:-len('.npz')]
    imbalanced_datasets_proper.append(dataset)
    df = keel_ds.load_data(dataset, imbalanced=True, raw=True)

    # Number of samples and features. The last column is used as the class label
    # below, so it is not counted as a feature.
    n_samples = df.shape[0]
    n_features = df.shape[1] - 1

    # Imbalance ratio, computed here as minority count / majority count,
    # so it lies in (0, 1] and smaller values mean stronger imbalance.
    class_counts = Counter(df.iloc[:, -1])
    majority_count = max(class_counts.values())
    minority_count = min(class_counts.values())
    imbalance_ratio = minority_count / majority_count

    print(f"Class counts: {class_counts}")
    print(f"Imbalance ratio: {imbalance_ratio}")

    dataset_details.append({
        'Dataset': dataset,
        'Samples': n_samples,
        'Features': n_features,
        'Imbalance Ratio': imbalance_ratio,
    })

details_df = pd.DataFrame(dataset_details)

yeast3.npz
Counter({' negative': 1321, ' positive': 163})
Class counts: Counter({' negative': 1321, ' positive': 163})
Imbalance ratio: 0.12339137017411052
yeast-0-2-5-6_vs_3-7-8-9.npz
Counter({'negative': 905, 'positive': 99})
Class counts: Counter({'negative': 905, 'positive': 99})
Imbalance ratio: 0.10939226519337017
kr-vs-k-one_vs_fifteen.npz
Counter({'negative': 2166, 'positive': 78})
Class counts: Counter({'negative': 2166, 'positive': 78})
Imbalance ratio: 0.036011080332409975
page-blocks0.npz
Counter({' negative': 4913, ' positive': 559})
Class counts: Counter({' negative': 4913, ' positive': 559})
Imbalance ratio: 0.11377976796254834
glass-0-1-5_vs_2.npz
Counter({'negative': 155, 'positive': 17})
Class counts: Counter({'negative': 155, 'positive': 17})
Imbalance ratio: 0.10967741935483871
kr-vs-k-zero-one_vs_draw.npz
Counter({'negative': 2796, 'positive': 105})
Class counts: Counter({'negative': 2796, 'positive': 105})
Imbalance ratio: 0.03755364806866953
led7digit-0-2-4-5-6-7

In [4]:
# Display the per-dataset summary table (dataset name, samples, features, imbalance ratio).
details_df

Unnamed: 0,Dataset,Samples,Features,Imbalance Ratio
0,yeast3,1484,8,0.123391
1,yeast-0-2-5-6_vs_3-7-8-9,1004,8,0.109392
2,kr-vs-k-one_vs_fifteen,2244,6,0.036011
3,page-blocks0,5472,10,0.113780
4,glass-0-1-5_vs_2,172,9,0.109677
...,...,...,...,...
88,yeast-1-4-5-8_vs_7,693,8,0.045249
89,ecoli-0-1_vs_2-3-5,244,7,0.109091
90,abalone19,4174,8,0.007726
91,ecoli-0-1-4-7_vs_5-6,332,6,0.081433


In [None]:
# Persist the summary table as a CSV file in the current working directory.
# The DataFrame's row index is left out of the file.
file_name = "datasets_details.csv"
details_df.to_csv(path_or_buf=file_name, index=False)