In [136]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import glob
import pandas as pd
from scipy.io import arff
import logging
import os
import pathlib

# Set up logger
logger = logging.getLogger(__name__)

In [187]:
SCRIPT_DIR = pathlib.Path(os.getcwd()).absolute()
DATA_DIR = os.path.join(SCRIPT_DIR.parent, "data")
PREPROCESSED_DATA_DIR = f'{DATA_DIR}/1_preprocessed/'

In [137]:
def identify_categorical_and_numerical(df, unique_threshold=0.05):
    """
    Identifies categorical and numerical columns in a DataFrame.

    Args:
        df: The pandas DataFrame.
        unique_threshold: The threshold for the proportion of unique values 
                           to consider a column categorical.

    Returns:
        A tuple containing two lists: categorical columns and numerical columns.
    """
    categorical_cols = []
    numerical_cols = []

    for col in df.columns:
        if df[col].dtype == 'object' or df[col].nunique() / len(df) <= unique_threshold:
            categorical_cols.append(col)
        else:
            numerical_cols.append(col)

    return categorical_cols, numerical_cols

In [138]:
def load_datasets(file_pattern: str) -> list[pd.DataFrame]:
    files = glob.glob(file_pattern)

    # List to store the dataframes
    dfs = []

    # Loop through each matching file
    for file in files:
        # Load the ARFF file
        raw_data, meta = arff.loadarff(file)

        # Log the file being loaded
        logger.debug(f"Loading {file}")

        # Convert the ARFF data to a pandas DataFrame
        df = pd.DataFrame(raw_data, columns=meta.names())

        # Decode byte-strings if necessary
        df = df.map(lambda x: x.decode() if isinstance(x, bytes) else x)

        # Append dataframe to the list
        dfs.append(df)

    return dfs

In [None]:
def save_to_csv(df, filename):
    """Saves the DataFrame to a CSV file.

    Args:
      df: The pandas DataFrame to save.
      filename: The name of the CSV file (e.g., 'preprocessed_data.csv').
    """
    try:
        df.to_csv(PREPROCESSED_DATA_DIR / '' / filename, index=False)  # Set index=False to avoid saving the index
        print(f"DataFrame saved to {filename}")
    except Exception as e:
        print(f"Error saving DataFrame: {e}")

In [177]:
def preprocess_dataset(df):
    # Replace ? with nan for correct imputation
    df.replace("?", np.nan, inplace=True)

    # Drop class column as it's unsupervised
    class_col_name = df.columns[-1]
    print('last column', class_col_name)
    
    df.drop(class_col_name, axis=1, inplace=True)

    # Get categorical and numerical columns based on dtype and heuristics
    categorical_cols, numeric_cols = identify_categorical_and_numerical(df)
    print('AFTER METHOD')
    print('categorical columns', categorical_cols)
    print('numeric columns', numeric_cols)

    # Create a column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            (
                "num",
                Pipeline(
                    steps=[
                        ("imputer", SimpleImputer(strategy="mean")),  # Fill missing with mean
                        ("scaler", MinMaxScaler()),  # Min-Max scaling
                    ]
                ),
                numeric_cols,
            ),
            (
                "cat",
                Pipeline(
                    steps=[
                        (
                            "imputer",
                            SimpleImputer(strategy="most_frequent"),
                        ),  # Fill missing with mode
                        # Use 'passthrough' to handle categorical data
                        ("passthrough", "passthrough"),
                    ]
                ),
                categorical_cols,
            ),
        ],
    )

    # Apply the preprocessor to the dataframe, excluding ignored columns
    processed_array = preprocessor.fit_transform(df)

    # Convert processed array back to a DataFrame
    processed_df = pd.DataFrame(processed_array, columns=numeric_cols + categorical_cols)
    
     # Extract processed numerical data
    processed_numeric_df = pd.DataFrame(processed_array[:, :len(numeric_cols)], columns=numeric_cols)

    # One-hot encode categorical features
    encoder = OneHotEncoder(sparse_output=False)  
    encoded_array = encoder.fit_transform(processed_array[:, len(numeric_cols):])  # Encode categorical part

    # Rename encoded columns
    encoded_columns = []
    for i, cat_col in enumerate(categorical_cols):
        for j in range(encoder.categories_[i].size):
            encoded_columns.append(f"{cat_col}_{encoder.categories_[i][j]}")
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns)

    # Combine DataFrames
    final_df = pd.concat([processed_numeric_df, encoded_df], axis=1)

    return final_df

In [179]:

DATA_DIR

'/home/cajifan/Documents/MAI/1S/IML/mai-intro-to-machine-learning/work3/data'

In [192]:
data_path = f"{DATA_DIR}/datasets/hepatitis.arff"
dfs = load_datasets(data_path)

df = dfs[0]

preprocessed_df = preprocess_dataset(df)
save_to_csv(preprocess_dataset, f'{PREPROCESSED_DATA_DIR}/hepatitis.csv')
preprocessed_df

last column Class
AFTER METHOD
categorical columns ['SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER_BIG', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'HISTOLOGY']
numeric columns ['AGE', 'BILIRUBIN', 'ALK_PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME']
Error saving DataFrame: 'function' object has no attribute 'to_csv'


Unnamed: 0,AGE,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,SEX_female,SEX_male,STEROID_no,STEROID_yes,...,SPLEEN_PALPABLE_no,SPLEEN_PALPABLE_yes,SPIDERS_no,SPIDERS_yes,ASCITES_no,ASCITES_yes,VARICES_no,VARICES_yes,HISTOLOGY_no,HISTOLOGY_yes
0,0.323944,0.090909,0.219331,0.006309,0.44186,0.618523,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.605634,0.077922,0.405204,0.044164,0.325581,0.618523,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.051948,0.260223,0.028391,0.44186,0.618523,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.338028,0.051948,0.074349,0.059937,0.44186,0.8,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.380282,0.090909,0.29489,0.293375,0.44186,0.618523,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,0.549296,0.948052,0.29489,0.359621,0.27907,0.5,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
151,0.521127,0.077922,0.371747,0.201893,0.511628,0.618523,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
152,0.760563,0.064935,0.182156,0.009464,0.465116,0.618523,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
153,0.647887,0.155844,0.204461,0.007886,0.465116,0.48,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


In [193]:
data_path = f"{DATA_DIR}/datasets/vowel.arff"
dfs = load_datasets(data_path)
df = dfs[0]

preprocessed_df = preprocess_dataset(df)
save_to_csv(preprocess_dataset, f'{PREPROCESSED_DATA_DIR}/vowel.csv')
preprocessed_df

last column Class
AFTER METHOD
categorical columns ['Train_or_Test', 'Speaker_Number', 'Sex']
numeric columns ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9']
Error saving DataFrame: 'function' object has no attribute 'to_csv'


Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Speaker_Number_Nick,Speaker_Number_Penny,Speaker_Number_Rich,Speaker_Number_Rose,Speaker_Number_Sarah,Speaker_Number_Sue,Speaker_Number_Tim,Speaker_Number_Wendy,Sex_Female,Sex_Male
0,0.36815,0.266541,0.463757,0.84205,0.494947,0.778691,0.390816,0.546819,0.252909,0.281534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.441218,0.278828,0.457631,0.732699,0.47044,0.875435,0.39932,0.541116,0.339493,0.387516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.723888,0.341525,0.232517,0.410988,0.358767,0.757193,0.32585,0.590936,0.275154,0.530234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.684778,0.485665,0.252425,0.639461,0.271349,0.599431,0.329932,0.458583,0.520876,0.287711,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.611944,0.505986,0.418836,0.652668,0.124811,0.505849,0.656803,0.343037,0.646817,0.417425,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,0.461827,0.686358,0.270546,0.318806,0.466145,0.713563,0.718707,0.408463,0.23922,0.593953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
986,0.341452,0.768589,0.405309,0.174855,0.436079,0.822321,0.733673,0.614346,0.266256,0.448309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
987,0.28829,0.588059,0.652118,0.378236,0.597271,0.58963,0.458503,0.544418,0.287474,0.383615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
988,0.22178,0.661468,0.651353,0.581881,0.618999,0.427126,0.086735,0.346038,0.256674,0.419376,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
