# Pre-Processing

Notebook file that loads the datasets and pre-processes the rows of each dataset accordingly.

- Author: Kacper Poniatowski
- Date: 07/10/2024

In [3]:
# Imports
import numpy as np
import pandas as pd
import scipy
import matplotlib
import sklearn
import seaborn
from scipy.io import arff
import os

In [1]:
# Functions relating to loading and pre-processing data

# Function to return full project path
def get_full_path():
    return os.getcwd()

# Function to load ARFF files and return a pandas DataFrame
def load_arff_file(file_path):
    data, meta = arff.loadarff(file_path)
    return pd.DataFrame(data)

# Function to load all ARFF files in a directory
def load_all_arff_files(directory):
    all_data = [] 
    
    # Loop through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".arff"): 
            file_path = os.path.join(directory, filename)

            df = load_arff_file(file_path)

            # Add the resulting df to the list
            all_data.append(df) 
    
    return all_data

# Function to combine all of the arff files that are part of a dataset
def combine_arff_files(dataset):
    return pd.concat(dataset, ignore_index=True)

# Function to compute the percentage of rows with missing values
def percentage_missing_values(df):
    total_rows = len(df)

    # Rows with any missing values (returns a boolean Series)
    missing_rows_mask = df.isnull().any(axis=1)

    print("Rows with missing / empty / values:")
    print(f"\n{total_rows}")
    
    # Number of rows with missing values
    missing_rows_count = missing_rows_mask.sum()

    # TODO: Return a txt file containing all of the rows which have missing rows.
    # Inspect manually what is missing, and decide upon whether to remove the rows or 
    # add data (e.g.: median for that column).

    # If adding values via median, need to calculate the median values for each column
    # Perhaps in a new function, called at the beginning of this function.

    # Also need to decide upon a normalisation strategy, and conversion of data to 1 type (continuous or categorical)
    # I need to see what sort of data it is but I'm assuming it'll be continuous.
    # If so, need to investigate what normalisation technique is ideal for continuous data.
    # Also need to keep note in this notebook what exactly is being performed on the data (conversions, normalisations, etc...)
    
    return (missing_rows_count / total_rows) * 100 # Convert to percentage

In [4]:
# Constants

PROJECT_PATH = get_full_path()
HEPATITIS_DATASET_DIR = f"{PROJECT_PATH}\\datasetsCBR\\hepatitis"
MUSHROOM_DATASET_DIR = f"{PROJECT_PATH}\\datasetsCBR\\mushroom"

In [5]:
# Loading the data

# Load all of the arff files associated with a dataset
arff_files_data_hepa = load_all_arff_files(HEPATITIS_DATASET_DIR)
arff_files_data_mush = load_all_arff_files(MUSHROOM_DATASET_DIR)

# Combine all of the arff files into 1 dataset
combined_dataset_hepa = combine_arff_files(arff_files_data_hepa)
combined_dataset_mush = combine_arff_files(arff_files_data_mush)

# Calculate the percentage of rows with missing values
missing_values_hepa = percentage_missing_values(combined_dataset_hepa)
missing_values_mush = percentage_missing_values(combined_dataset_mush)

#print(f"Percentage of rows that have missing values in 'hepatitis' dataset: {missing_values_hepa}")
#print(f"Percentage of rows that have missing values in 'mushroom' dataset: {missing_values_mush}")

1550
Rows with missing / empty / values:

1550
81240
Rows with missing / empty / values:

81240
