# Pre-processing file

In [4]:
# Imports
import numpy as np
import pandas as pd
import scipy
import matplotlib
import sklearn
import seaborn
from scipy.io import arff
import os

In [10]:
# Functions relating to loading data

# Function to return full project path
def get_full_path():
    """Return the current working directory as a string.

    The rest of this file treats this directory as the project root when
    building dataset paths.
    """
    project_root = os.getcwd()
    return project_root

# Function to load ARFF files and return a pandas DataFrame
def load_arff_file(file_path):
    """Read one ARFF file and return its records as a pandas DataFrame.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A DataFrame built from the record array returned by
        ``scipy.io.arff.loadarff``; the ARFF metadata is discarded.

    NOTE(review): scipy appears to return nominal/string attributes as
    ``bytes`` values (e.g. ``b"?"`` for a missing nominal value), which
    pandas does not treat as null -- confirm before relying on ``isnull()``.
    """
    # loadarff returns a (data, meta) pair; only the data part is needed here
    records, _ = arff.loadarff(file_path)
    frame = pd.DataFrame(records)
    return frame

# Function to load all ARFF files in a directory
def load_all_arff_files(directory):
    """Load every ``.arff`` file located directly inside *directory*.

    Args:
        directory: Path of the directory to scan (not searched recursively).

    Returns:
        A list with one DataFrame per ``.arff`` file, ordered by filename.
        The list is empty when the directory contains no ``.arff`` files.

    Raises:
        OSError: Propagated from ``os.listdir`` when *directory* cannot be
            listed (e.g. it does not exist).
    """
    # os.listdir() yields entries in arbitrary order; sorting makes the order
    # of the returned frames (and of any dataset built from them) reproducible.
    arff_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".arff")
    )

    return [
        load_arff_file(os.path.join(directory, name)) for name in arff_names
    ]

def combine_arff_files(dataset):
    """Stack a collection of DataFrames into one DataFrame.

    Args:
        dataset: Iterable of DataFrames (e.g. one per ARFF file).

    Returns:
        A single DataFrame with the rows of every input frame, re-indexed
        from 0 so the original per-frame indices are not kept.
    """
    combined = pd.concat(objs=dataset, ignore_index=True)
    return combined

In [23]:
# Function to compute the percentage of rows with missing values
def percentage_missing_values(df, missing_markers=()):
    """Compute and print the percentage of rows containing a missing value.

    Args:
        df: DataFrame to inspect.
        missing_markers: Optional iterable of extra values that should also
            count as missing, in addition to what ``DataFrame.isnull``
            detects (NaN/None). Useful for sentinel markers such as ``"?"``.
            Defaults to no extra markers.

    Returns:
        The percentage (0-100) of rows with at least one missing value.
        An empty DataFrame yields ``0.0`` rather than NaN.
    """
    # Total rows
    total_rows = len(df)

    # Cell-level mask: True where a value is considered missing
    cell_is_missing = df.isnull()
    extra_markers = list(missing_markers)
    if extra_markers:
        cell_is_missing = cell_is_missing | df.isin(extra_markers)

    # Number of rows with at least one missing cell
    missing_rows_count = cell_is_missing.any(axis=1).sum()

    # Guard against 0 / 0, which would otherwise give NaN plus a warning
    if total_rows == 0:
        percentage = 0.0
    else:
        percentage = (missing_rows_count / total_rows) * 100

    print(f"Percentage of rows with missing values: {percentage:.2f}%")
    return percentage

proj_path = get_full_path()

# Build the dataset directories with os.path.join so the path separators are
# correct on every operating system (hard-coded "\\" only resolves on Windows).
dataset_one_dir = os.path.join(proj_path, "datasetsCBR", "hepatitis") # Change end of path to whichever 2 datasets we end up choosing
dataset_two_dir = os.path.join(proj_path, "datasetsCBR", "mushroom") # Change end of path to whichever 2 datasets we end up choosing

# Load all of the arff files associated with a dataset
arff_files_data_one = load_all_arff_files(dataset_one_dir)
arff_files_data_two = load_all_arff_files(dataset_two_dir)

# Combine all of the arff files into 1 dataset
combined_dataset_one = combine_arff_files(arff_files_data_one)
combined_dataset_two = combine_arff_files(arff_files_data_two)

# Calculate the percentage of rows with missing values
# (percentage_missing_values also prints the result for each dataset)
missing_values_one = percentage_missing_values(combined_dataset_one)
missing_values_two = percentage_missing_values(combined_dataset_two)

#print(f"Percentage of rows that have missing values in 'hepatitis' dataset: {missing_values_one}")
#print(f"Percentage of rows that have missing values in 'mushroom' dataset: {missing_values_two}")

# Things to consider:
# - Need a definitive set of datasets to know how to pre-process the data
# - Need to define what is considered null value (e.g.: NaN?)
# - What to do with missing values
# - Normalisation (need actual datasets)

Percentage of rows with missing values: 46.45%
Percentage of rows with missing values: 0.00%
