## Imports

In [27]:
from scipy.io import arff
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#import numpy as np
#import scipy
#import sklearn
#import matplotlib.pyplot as plt
#import seaborn as sns

## Data

In [28]:
# Function to load and combine ARFF files from a given dataset directory
def load_and_combine_arff_files(dataset_directory):
    # Get the current working directory
    current_directory = os.getcwd()
    
    # Join the current directory with the dataset directory
    full_directory = os.path.join(current_directory, dataset_directory.lstrip('/'))

    # Initialize list to hold all the combined data
    all_data = []

    # Loop through the files in the directory
    for filename in os.listdir(full_directory):
        # Check if the file is an ARFF file and contains 'train' or 'test' in the name
        if filename.endswith(".arff"):
            filepath = os.path.join(full_directory, filename)
            
            # Load the ARFF file
            data, meta = arff.loadarff(filepath)
            
            # Convert to DataFrame and decode byte strings
            df = pd.DataFrame(data)
            df = df.map(lambda x: x.decode() if isinstance(x, bytes) else x)
            
            # Extract fold number and file type (train/test) from the filename
            fold_number = int(filename.split('.')[2])  # '000001' becomes 1
            file_type = filename.split('.')[3]  # 'test' or 'train'
            
            # Add columns to indicate fold number and whether it's train or test data
            df['fold'] = fold_number
            df['set'] = 'train' if file_type == 'train' else 'test'
            
            # Append to the list
            all_data.append(df)

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(all_data, ignore_index=True)
    
    return combined_df

# Combine mushroom dataset
mushroom_dataset_directory = '/datasetsCBR/mushroom'
df_combined_mushroom = load_and_combine_arff_files(mushroom_dataset_directory)

# Combine hepatitis dataset
hepatitis_dataset_directory = '/datasetsCBR/hepatitis'
df_combined_hepatitis = load_and_combine_arff_files(hepatitis_dataset_directory)


In [34]:
df_combined_hepatitis

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,...,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class,fold,set
0,50.0,female,no,no,yes,no,no,no,no,no,...,no,0.9,135.0,42.0,3.5,,no,LIVE,3,train
1,45.0,male,no,no,yes,yes,no,yes,no,yes,...,no,1.0,85.0,75.0,,,no,LIVE,3,train
2,54.0,female,no,no,yes,yes,no,?,?,yes,...,no,3.9,120.0,28.0,3.5,43.0,yes,DIE,3,train
3,35.0,female,no,no,yes,no,no,?,?,yes,...,no,1.5,138.0,58.0,2.6,,yes,DIE,3,train
4,24.0,female,no,no,yes,no,no,yes,no,no,...,no,1.0,,34.0,4.1,,yes,LIVE,3,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,33.0,female,no,no,yes,yes,no,yes,no,no,...,no,0.7,63.0,80.0,3.0,31.0,yes,DIE,0,train
1546,31.0,female,?,yes,no,no,no,yes,no,no,...,no,0.7,46.0,52.0,4.0,80.0,no,LIVE,0,train
1547,78.0,female,yes,no,yes,no,no,yes,no,no,...,no,0.7,96.0,32.0,4.0,,no,LIVE,0,train
1548,34.0,female,yes,no,no,no,no,yes,no,no,...,no,0.9,95.0,28.0,4.0,75.0,no,LIVE,0,train


## Data Pre-Processing 

### Hepatitis

In [29]:
def preprocess_hepatitis_data(df):
    # List of columns to ignore during preprocessing but keep in the output
    ignored_cols = ['fold', 'set']
    
    # Identify numerical and categorical columns (excluding 'fold' and 'set')
    numerical_cols = ['AGE', 'BILIRUBIN', 'ALK_PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME']
    categorical_cols = ['SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER_BIG', 
                        'LIVER_FIRM', 'SPLEEN_PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'HISTOLOGY']
    
    # Define transformers for MISSING VALUES
    numerical_imputer = SimpleImputer(strategy='median')  # Impute missing values with the median
    categorical_imputer = SimpleImputer(strategy='most_frequent')  # Impute missing categorical values with most frequent

    # Define transformers for DIFFERENT RANGES
    numerical_transformer = Pipeline(steps=[
        ('imputer', numerical_imputer),  # Fill missing values with the median
        ('scaler', MinMaxScaler())       # Scale numerical features between 0 and 1
    ])
    # Define transformers for DIFFERENT TYPES
    categorical_transformer = Pipeline(steps=[
        ('imputer', categorical_imputer),  # Fill missing values with the most frequent category
        ('encoder', OneHotEncoder(drop='first'))  # One-hot encode categorical features
    ])
    
    # Create a preprocessor to apply transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols),
            # Pass-through for ignored columns
            ('ignore', 'passthrough', ignored_cols)
        ])
    
    # Fit and transform the features
    processed = preprocessor.fit_transform(df)
    
    # Get the transformed feature names for the categorical columns
    cat_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
    
    # Combine the names of all columns: numerical, one-hot encoded, and ignored
    all_feature_names = numerical_cols + list(cat_feature_names) + ignored_cols
    
    # Convert the processed array back to a DataFrame with appropriate column names
    processed_df = pd.DataFrame(processed, columns=all_feature_names, index=df.index)
        
    # Return the final DataFrame with features and target
    return processed_df

hepatitis_processed_df = preprocess_hepatitis_data(df_combined_hepatitis)

In [30]:
hepatitis_processed_df

Unnamed: 0,AGE,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,SEX_male,STEROID_no,STEROID_yes,ANTIVIRALS_yes,...,SPLEEN_PALPABLE_yes,SPIDERS_no,SPIDERS_yes,ASCITES_no,ASCITES_yes,VARICES_no,VARICES_yes,HISTOLOGY_yes,fold,set
0,0.605634,0.077922,0.405204,0.044164,0.325581,0.61,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3,train
1,0.535211,0.090909,0.219331,0.096215,0.44186,0.61,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3,train
2,0.661972,0.467532,0.349442,0.022082,0.325581,0.43,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,3,train
3,0.394366,0.155844,0.416357,0.069401,0.116279,0.61,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,3,train
4,0.239437,0.090909,0.219331,0.031546,0.465116,0.61,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,3,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,0.366197,0.051948,0.137546,0.104101,0.209302,0.31,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0,train
1546,0.338028,0.051948,0.074349,0.059937,0.44186,0.8,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,train
1547,1.0,0.051948,0.260223,0.028391,0.44186,0.61,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,train
1548,0.380282,0.077922,0.256506,0.022082,0.44186,0.75,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,train


### Mushroom

In [31]:
def preprocess_mushroom_data(df):
    # List of columns to ignore during preprocessing but keep in the output
    ignored_cols = ['fold', 'set']
    
    # Identify numerical and categorical columns (excluding 'fold' and 'set')
    categorical_cols =[
    'cap-shape', 
    'cap-surface', 
    'cap-color', 
    'bruises?', 
    'odor', 
    'gill-attachment', 
    'gill-spacing', 
    'gill-size', 
    'gill-color', 
    'stalk-shape', 
    'stalk-root', 
    'stalk-surface-above-ring', 
    'stalk-surface-below-ring', 
    'stalk-color-above-ring', 
    'stalk-color-below-ring', 
    'veil-type', 
    'veil-color', 
    'ring-number', 
    'ring-type', 
    'spore-print-color', 
    'population', 
    'habitat', 
    'class'
]

    # Define transformers for MISSING VALUES
    categorical_imputer = SimpleImputer(strategy='most_frequent')  # Impute missing categorical values with most frequent

    # Define transformers for DIFFERENT TYPES
    categorical_transformer = Pipeline(steps=[
        ('imputer', categorical_imputer),  # Fill missing values with the most frequent category
        ('encoder', OneHotEncoder(drop='first', sparse_output=False))  # One-hot encode categorical features with dense output
    ])
    
    # Create a preprocessor to apply transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols),
            # Pass-through for ignored columns
            ('ignore', 'passthrough', ignored_cols)
        ])
    
    # Fit and transform the features
    processed = preprocessor.fit_transform(df)
    
    # Get the transformed feature names for the categorical columns
    cat_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
    
    # Combine the names of all columns: numerical, one-hot encoded, and ignored
    all_feature_names = list(cat_feature_names) + ignored_cols
    
    # Convert the processed array back to a DataFrame with appropriate column names
    processed_df = pd.DataFrame(processed, columns=all_feature_names, index=df.index)
        
    # Return the final DataFrame with features and target
    return processed_df

mushroom_processed_df = preprocess_mushroom_data(df_combined_mushroom)

In [32]:
mushroom_processed_df

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,...,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,class_p,fold,set
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2,test
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2,test
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,test
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2,test
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81235,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2,train
81236,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2,train
81237,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,train
81238,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2,train
