In [1]:
# Dependencies 
import pandas as pd
import os 
import numpy as np 

import sys 
sys.path.append('..')

import openpyxl

from typing import Tuple

In [2]:
def prepare_SACardio(dataset_path : str = "", filename : str = "") -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Index, str]:
    """Load the South Africa heart-disease dataset and split it into features and target.

    The file is read as semicolon-separated text with a header row. The string
    values "Present" / "Absent" are recoded as 1 / 0, the "ind" identifier column
    is dropped, and the last remaining column is taken as the target variable.

    Args:
    -----
            dataset_path: directory where the dataset is stored. With the default
                          empty string the file is resolved relative to the current
                          working directory.
            filename : name of the semicolon-separated text file containing the dataset.

    Returns:
    --------
            data: dataframe containing the whole dataset (without the "ind" column)
            X : dataframe containing only the feature columns
            Y : series containing only the target variable
            cols_names: pandas Index with the feature names
            y_tag: string containing the target variable name

    Raises:
    -------
            FileNotFoundError: if the file does not exist at the given location.
            KeyError: if the file has no "ind" column.
    """

    # Build the full path rather than calling os.chdir(): changing the working
    # directory is a process-wide side effect that leaks to every later cell.
    file_path = os.path.join(dataset_path, filename)

    # Read the semicolon-separated file into a DataFrame
    data = pd.read_csv(file_path, sep=";", header='infer')

    # Convert "Present" / "Absent" values into 1 / 0. infer_objects() makes the
    # resulting columns numeric explicitly instead of relying on the implicit
    # downcast performed by replace(), which pandas has deprecated.
    data = data.replace({'Present': 1, 'Absent': 0}).infer_objects()

    # Drop ID column
    data = data.drop(columns=['ind'])

    # The last column is the target; every column before it is a feature
    y_tag = data.columns[-1]
    cols_names = data.columns[:-1]

    X = data[cols_names]
    Y = data[y_tag]

    return data, X, Y, cols_names, y_tag

In [3]:
# Dataset path
# NOTE(review): absolute, machine-specific location; this cell only runs where that folder exists.
DATASET_PATH = r"C:\Users\Anto\OneDrive - Universidad de Las Palmas de Gran Canaria\Doctorado\Bases de datos\Cardiovascular\SouthAfrica"

# File name 
filename = "cardiovascular.txt"

# Save working directory to return to it 
wd = os.getcwd()

# Prepare the South Africa cardiovascular dataset to be handled
try:
    data, X, Y, feat_names, y_tag = prepare_SACardio(dataset_path = DATASET_PATH, filename = filename)
finally:
    # Actually return to the saved directory, even if loading fails, so later
    # cells do not run from wherever the loading step may have left the process.
    os.chdir(wd)

# Share of each target class (in %) followed by the sum of the target column
print(Y.value_counts()*100/len(Y))
print(Y.sum())

0    65.367965
1    34.632035
Name: chd, dtype: float64
160
