In [2]:
import os
import pandas as pd
from glob import glob
from utils import string_supporting_mean

In [3]:
def get_datasets_data(problem_type: str) -> pd.DataFrame:
    """Collect per-dataset summary statistics for one problem type.

    Every sub-directory ``<name>`` found under ``../../datasets/{problem_type}/``
    is read from ``<name>/<name>.parquet``. Target columns are removed before
    counting, so only attributes are described: ``survival_time`` and
    ``survival_status`` when a ``survival_time`` column is present, otherwise
    ``class``.

    Args:
        problem_type: Name of the folder under ``../../datasets`` to scan.

    Returns:
        A DataFrame with one row per dataset, ordered by dataset path, holding
        ``problem_type``, ``dataset``, ``examples count``, ``attributes count``,
        ``numerical attributes count``, ``nominal attributes count`` and
        ``missing values count``. Despite its name, the last column stores the
        percentage (0-100) of missing cells among the attribute columns.
        The frame is empty when no dataset directory is found.

    Raises:
        KeyError: If a dataset lacks the target column(s) expected for it.
    """
    records = []
    # glob yields paths in an arbitrary, filesystem-dependent order; sorting
    # keeps the resulting table reproducible between runs and machines.
    for path in sorted(glob(f'../../datasets/{problem_type}/*')):
        # Only directories are datasets; skip stray files such as READMEs.
        if not os.path.isdir(path):
            continue
        dataset_name: str = os.path.basename(path)
        df = pd.read_parquet(f'{path}/{dataset_name}.parquet')

        if 'survival_time' in df.columns:
            target_columns = ['survival_time', 'survival_status']
        else:
            target_columns = ['class']
        attributes = df.drop(columns=target_columns)

        examples_count, attributes_count = attributes.shape
        # Everything that is not a numeric dtype is counted as nominal.
        numerical_count = attributes.select_dtypes('number').shape[1]
        cells_count = examples_count * attributes_count
        if cells_count:
            missing_percent = attributes.isnull().sum().sum() / cells_count * 100
        else:
            # No cells at all: the ratio is undefined rather than zero.
            missing_percent = float('nan')

        records.append({
            'problem_type': problem_type,
            'dataset': dataset_name,
            'examples count': examples_count,
            'attributes count': attributes_count,
            'numerical attributes count': numerical_count,
            'nominal attributes count': attributes_count - numerical_count,
            'missing values count': missing_percent,
        })
    return pd.DataFrame(records)

# Stack the statistics of all three problem types into a single table.
# ignore_index=True gives every dataset its own row label; without it each
# per-type frame keeps its 0..n-1 index, so the combined table would contain
# duplicate labels and label-based lookups would match several rows at once.
datasets_stats = pd.concat(
    [
        get_datasets_data(problem_type_name)
        for problem_type_name in ('classification', 'regression', 'survival')
    ],
    ignore_index=True,
)

In [29]:
# Shape of the subset of datasets made up exclusively of nominal attributes
# (no numerical attribute, at least one nominal one), across all problem types.
has_no_numerical = datasets_stats['numerical attributes count'].eq(0)
has_nominal = datasets_stats['nominal attributes count'].gt(0)
datasets_stats.loc[has_no_numerical & has_nominal].shape

(12, 7)

In [32]:
# Same nominal-only filter as before, restricted to the regression datasets.
has_no_numerical = datasets_stats['numerical attributes count'].eq(0)
has_nominal = datasets_stats['nominal attributes count'].gt(0)
is_regression = datasets_stats['problem_type'].eq('regression')
datasets_stats.loc[has_no_numerical & has_nominal & is_regression].shape

(1, 7)

In [19]:
# Aggregate every statistic per problem type with the project helper, then
# discard the aggregated dataset-name column and round to two decimals.
per_type_aggregates = datasets_stats.groupby('problem_type').agg(string_supporting_mean)
datasets_stats_avg = per_type_aggregates.drop(columns=['dataset']).round(2)

# NOTE(review): the CSV is written before 'datasets count' is added below, so
# the saved file does not contain that column - confirm this is intended.
datasets_stats_avg.to_csv('../datasets_stats_avg.csv', index=True)

# Append the number of datasets available for each problem type.
datasets_stats_avg['datasets count'] = None
for problem_type in datasets_stats['problem_type'].unique():
    belongs_to_type = datasets_stats['problem_type'] == problem_type
    datasets_stats_avg.loc[problem_type, 'datasets count'] = len(datasets_stats[belongs_to_type])

datasets_stats_avg

Unnamed: 0_level_0,examples count,attributes count,numerical attributes count,nominal attributes count,missing values count,datasets count
problem_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
classification,1131.27,16.33,6.47,9.87,1.2,30
regression,189.07,9.5,7.33,2.17,0.06,30
survival,723.25,16.38,13.0,3.38,5.0,16


In [20]:
def _summarise_problem(problem: str) -> dict:
    """Build the summary row (counts and min/mean/max sizes) for one problem type."""
    subset = datasets_stats.loc[datasets_stats['problem_type'] == problem]
    attribute_counts = subset['attributes count']
    row_counts = subset['examples count']
    with_missing = subset.loc[subset['missing values count'] > 0]
    return {
        'Problem type': problem,
        'Datasets count': len(subset['dataset'].unique()),
        'Min attributes count': attribute_counts.min(),
        'Mean attributes count': attribute_counts.mean(),
        'Max attributes count': attribute_counts.max(),
        'Min rows count': row_counts.min(),
        'Mean rows count': row_counts.mean(),
        'Max rows count': row_counts.max(),
        'Datasets with missing values': len(with_missing),
    }


# One summary row per problem type, in a fixed order.
data = [
    _summarise_problem(name)
    for name in ('classification', 'regression', 'survival')
]

df = pd.DataFrame(data)
# NOTE(review): the file name starts with 'adatasets' - possibly a typo of
# 'datasets'; left unchanged because other code may rely on this path.
df.to_csv('../adatasets_stats_avg.csv', index=False)

In [22]:
# Display the per-problem-type summary table held in `df`.
df

Unnamed: 0,Problem type,Datasets count,Min attributes count,Mean attributes count,Max attributes count,Min rows count,Mean rows count,Max rows count,Datasets with missing values
0,classification,30,4,16.333333,61,101,1131.266667,12960,10
1,regression,30,3,9.5,28,27,189.066667,625,3
2,survival,16,6,16.375,57,187,723.25,3154,7


In [23]:
# Recompute, for each problem type, how many datasets contain at least one
# missing value and store the result in the matching row of the summary table.
for problem in ['classification', 'regression', 'survival']:
    # Filter on the loop variable itself, not on a name left over from
    # another cell, so each iteration counts its own problem type.
    affected_datasets = datasets_stats[
        (datasets_stats['problem_type'] == problem) &
        (datasets_stats['missing values count'] > 0)
    ]
    # `df` carries a default integer index, so the target row has to be
    # selected through the 'Problem type' column; df.loc[problem, ...] would
    # append a brand-new row labelled with the problem name instead.
    df.loc[
        df['Problem type'] == problem,
        'Datasets with missing values'
    ] = affected_datasets.shape[0]

In [27]:
# Number of survival datasets that contain at least one missing value.
is_survival = datasets_stats['problem_type'].eq('survival')
has_missing_values = datasets_stats['missing values count'].gt(0)
len(datasets_stats[is_survival & has_missing_values])

7