In [8]:
import os
from scipy.io import arff
import pandas as pd
import numpy as np

def c_score(series):
    """Return an entropy-based imbalance score for the values of *series*.

    The score is ``1 - H / H_max``, where ``H`` is the Shannon entropy
    (natural log) of the relative value frequencies and ``H_max = log(k)``
    for ``k`` distinct values.  A uniform distribution scores 0.0; the score
    grows towards 1.0 as a single value dominates.

    Parameters
    ----------
    series : pandas.Series
        Values to score.  Missing values are ignored, because
        ``value_counts`` drops them by default.

    Returns
    -------
    float
        The score rounded to 3 decimals, or 0.0 when the series has fewer
        than two distinct values (including an empty or all-missing series).
    """
    frequencies = series.value_counts(normalize=True)
    n_distinct = len(frequencies)
    # With zero or one distinct value the normaliser log(k) is -inf or 0, so
    # the ratio is undefined; report "no imbalance" instead of letting
    # log(0) leak through as a spurious 1.0.
    if n_distinct < 2:
        return 0.0
    entropy = -np.sum(frequencies * np.log(frequencies))
    return round(1 - entropy / np.log(n_distinct), 3)

# Directory scanned for ARFF datasets (relative to the working directory).
folder_path = '../data/raw'
rows = []

for filename in os.listdir(folder_path):
    if not filename.endswith('.arff'):
        continue
    file_path = os.path.join(folder_path, filename)
    try:
        data, _ = arff.loadarff(file_path)
        df = pd.DataFrame(data)

        # Decode any bytes cells to str so they behave as ordinary
        # Python strings downstream.
        df = df.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

        name = os.path.splitext(filename)[0]
        n_samples = df.shape[0]
        # The last column is treated as the target; all others are features.
        target_col = df.columns[-1]
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Infer data types
        numerical = sum(pd.api.types.is_numeric_dtype(X[col]) for col in X.columns)
        categorical = X.shape[1] - numerical
        n_features = X.shape[1]
        n_classes = y.nunique()
        # F-S ratio: features per 100 samples
        fs_ratio = round(100 * n_features / n_samples, 3)

        # C-score: mean of absolute Pearson correlations of all numerical
        # feature pairs.  Computed on X (not df) so a numeric target column
        # never contributes, and stored under its own name so the
        # module-level c_score() function is not overwritten.
        corr_score = 0.0
        if numerical >= 2:
            corr_matrix = X.select_dtypes(include=[np.number]).corr().abs()
            # Strict upper triangle = each unordered feature pair exactly once.
            pair_idx = np.triu_indices(corr_matrix.shape[0], k=1)
            pair_values = corr_matrix.to_numpy()[pair_idx]
            # Constant columns give undefined (NaN) correlations; skip them
            # and keep the 0.0 fallback when no pair is defined.
            pair_values = pair_values[~np.isnan(pair_values)]
            if pair_values.size:
                corr_score = round(pair_values.mean(), 3)

        rows.append([name, n_samples, numerical, categorical, n_features, n_classes, fs_ratio, corr_score])

    except Exception as e:
        # Best-effort scan: report the failing file and continue with the rest.
        print(f"Error loading {filename}: {e}")

# Create and print the table
columns = ['Name', 'Samples', 'Numerical', 'Categorical', 'Features', 'Classes', 'F-S ratio', 'C-score']
table_df = pd.DataFrame(rows, columns=columns)
table_df = table_df.sort_values(by='Samples')
print(table_df.to_string(index=False))


                    Name  Samples  Numerical  Categorical  Features  Classes  F-S ratio  C-score
                     kc2      522         21            0        21        2      4.023    0.775
               phpJem85A      540         20            0        20        2      3.704    0.012
               phpAmSP4g      569         30            0        30        2      5.272    0.395
               phpOJxGL9      583          9            1        10        2      1.715    0.208
dataset_11_balance-scale      625          4            0         4        3      0.640    0.000
               php0iVrYT      748          4            0         4        2      0.535    0.466
     dataset_37_diabetes      768          8            0         8        2      1.042    0.172
        analcatdata_dmft      797          0            4         4        6      0.502    0.000
  analcatdata_authorship      841         70            0        70        4      8.323    0.148
                     pc1     1