In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import glob
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
logger = logging.getLogger()

In [2]:
def load_data(
    train_file_pattern: str, test_file_pattern: str
) -> tuple[list[pd.DataFrame], list[pd.DataFrame]]:
    """
    Load train and test data from ARFF files matching the given patterns.

    This function reads ARFF files for both training and test datasets,
    converts them to pandas DataFrames, and returns them as lists.

    Parameters:
    -----------
    train_file_pattern : str
        Glob pattern to match training data files.
    test_file_pattern : str
        Glob pattern to match test data files.

    Returns:
    --------
    tuple[list[pd.DataFrame], list[pd.DataFrame]]
        A tuple containing two lists:
        1. List of DataFrames for training data.
        2. List of DataFrames for test data.

    Note:
    -----
    The function uses scipy.io.arff.loadarff to read ARFF files and
    converts the data to pandas DataFrames. Files are sorted to ensure
    consistent ordering across runs.
    """
    train_dfs = []
    test_dfs = []
    for is_test, file_pattern in [
        (False, train_file_pattern),
        (True, test_file_pattern),
    ]:
        for file in sorted(glob.glob(file_pattern)):
            raw_data, meta = loadarff(file)
            logger.info(f"Loading {file}")
            df = pd.DataFrame(raw_data, columns=meta.names())
            if is_test:
                test_dfs.append(df)
            else:
                train_dfs.append(df)
    return train_dfs, test_dfs

In [3]:
train_dfs, test_dfs = load_data(
    "datasetsCBR/mushroom/mushroom.fold.*.train.arff",
    "datasetsCBR/mushroom/mushroom.fold.*.test.arff",
)

# Display the number of train and test dataframes
print(f"\nNumber of training dataframes: {len(train_dfs)}")
print(f"Number of test dataframes: {len(test_dfs)}")
print()
train_dfs[0].head()

INFO - Loading datasetsCBR/mushroom/mushroom.fold.000000.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000001.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000002.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000003.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000004.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000005.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000006.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000007.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000008.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000009.train.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000000.test.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000001.test.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000002.test.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.000003.test.arff
INFO - Loading datasetsCBR/mushroom/mushroom.fold.00


Number of training dataframes: 10
Number of test dataframes: 10



Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,b'x',b's',b'w',b't',b'l',b'f',b'c',b'b',b'k',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'k',b'n',b'm',b'e'
1,b'k',b'y',b'e',b'f',b's',b'f',b'c',b'n',b'b',b't',...,b'p',b'w',b'p',b'w',b'o',b'e',b'w',b'v',b'd',b'p'
2,b'x',b'y',b'y',b't',b'a',b'f',b'c',b'b',b'n',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'k',b's',b'm',b'e'
3,b'x',b'y',b'w',b't',b'p',b'f',b'c',b'n',b'k',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'k',b'v',b'u',b'p'
4,b'x',b'y',b'n',b'f',b'f',b'f',b'c',b'n',b'b',b't',...,b'w',b'p',b'p',b'w',b'o',b'e',b'w',b'v',b'l',b'p'


In [4]:
for df in train_dfs + test_dfs:
    for col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])

train_dfs[0].head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,5,2,8,1,3,1,0,0,4,0,...,7,7,0,2,1,4,2,2,3,0
1,3,3,2,0,7,1,0,1,0,1,...,6,7,0,2,1,0,7,4,0,1
2,5,3,9,1,0,1,0,0,5,0,...,7,7,0,2,1,4,2,3,3,0
3,5,3,8,1,6,1,0,1,4,0,...,7,7,0,2,1,4,2,4,5,1
4,5,3,4,0,2,1,0,1,0,1,...,7,6,0,2,1,0,7,4,2,1


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

results = []

for train_df, test_df in zip(train_dfs, test_dfs):
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(train_df.drop("class", axis=1), train_df["class"])
    preds = decision_tree.predict(test_df.drop("class", axis=1))
    results.append(accuracy_score(test_df["class"], preds))

print(f"Mean accuracy: {np.mean(results)}")

Mean accuracy: 1.0


In [6]:
train_dfs, test_dfs = load_data(
    "datasetsCBR/hepatitis/hepatitis.fold.*.train.arff",
    "datasetsCBR/hepatitis/hepatitis.fold.*.test.arff",
)

print(f"\nNumber of training dataframes: {len(train_dfs)}")
print(f"Number of test dataframes: {len(test_dfs)}")
print()
train_dfs[0].head()

INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000000.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000001.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000002.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000003.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000004.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000005.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000006.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000007.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000008.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000009.train.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000000.test.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000001.test.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000002.test.arff
INFO - Loading datasetsCBR/hepatitis/hepatitis.fold.000003.test.arff
INFO - Loading datasetsC


Number of training dataframes: 10
Number of test dataframes: 10



Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class
0,38.0,b'female',b'no',b'no',b'yes',b'yes',b'yes',b'yes',b'yes',b'no',b'yes',b'yes',b'yes',1.2,118.0,16.0,2.8,,b'yes',b'DIE'
1,54.0,b'female',b'no',b'no',b'yes',b'yes',b'no',b'yes',b'no',b'no',b'no',b'yes',b'no',1.2,85.0,92.0,3.1,66.0,b'yes',b'LIVE'
2,58.0,b'male',b'yes',b'no',b'yes',b'no',b'no',b'yes',b'yes',b'no',b'yes',b'no',b'no',1.4,175.0,55.0,2.7,36.0,b'no',b'LIVE'
3,23.0,b'female',b'yes',b'no',b'no',b'no',b'no',b'?',b'?',b'?',b'?',b'?',b'?',4.6,56.0,16.0,4.6,,b'no',b'LIVE'
4,31.0,b'female',b'yes',b'no',b'no',b'no',b'no',b'yes',b'no',b'no',b'no',b'no',b'no',1.0,85.0,20.0,4.0,100.0,b'no',b'LIVE'


In [7]:
pd.concat(train_dfs).isnull().sum()

AGE                  0
SEX                  0
STEROID              0
ANTIVIRALS           0
FATIGUE              0
MALAISE              0
ANOREXIA             0
LIVER_BIG            0
LIVER_FIRM           0
SPLEEN_PALPABLE      0
SPIDERS              0
ASCITES              0
VARICES              0
BILIRUBIN           54
ALK_PHOSPHATE      261
SGOT                36
ALBUMIN            144
PROTIME            603
HISTOLOGY            0
Class                0
dtype: int64

In [8]:
numerical_columns = ["BILIRUBIN", "ALK_PHOSPHATE", "SGOT", "ALBUMIN", "PROTIME"]
for df in train_dfs + test_dfs:
    for col in numerical_columns:
        median_imputer = SimpleImputer(strategy="median")
        standard_scaler = StandardScaler()
        df[col] = standard_scaler.fit_transform(median_imputer.fit_transform(df[[col]]))

    for col in list(set(df.columns) - set(numerical_columns)):
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])

train_dfs[0].head()

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class
0,16,0,1,0,2,2,2,2,2,1,2,2,2,-0.180812,0.396382,-0.773678,-1.659874,-0.02558,1,0
1,32,0,1,0,2,2,1,2,1,1,1,2,1,-0.180812,-0.328604,0.072143,-1.177644,0.323461,1,1
2,35,1,2,0,2,1,1,2,2,1,2,1,1,-0.011974,1.64863,-0.339639,-1.820617,-1.421744,0,1
3,3,0,2,0,1,1,1,0,0,0,0,0,0,2.689431,-0.965713,-0.773678,1.233505,-0.02558,0,1
4,10,0,2,0,1,1,1,2,1,1,1,1,1,-0.34965,-0.328604,-0.729162,0.269046,2.30136,0,1


In [9]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_dfs[0].drop("Class", axis=1), train_dfs[0]["Class"])
preds = decision_tree.predict(test_dfs[0].drop("Class", axis=1))
print(classification_report(test_dfs[0]["Class"], preds))


              precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       0.82      0.82      0.82        11

    accuracy                           0.71        14
   macro avg       0.58      0.58      0.58        14
weighted avg       0.71      0.71      0.71        14

