# In [3]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from PIL import Image
import os
import re

class Data():
    """Load a dataset, normalise it and split it into train/val/test parts.

    After calling one of the ``get_*_data`` methods the instance exposes
    ``X_train``, ``y_train``, ``X_val``, ``y_val``, ``X_test`` and ``y_test``
    as NumPy arrays. The split is sequential (no shuffling): the first
    ``train_ratio`` of the samples go to training, the next ``val_ratio`` to
    validation and the remainder to testing.
    """

    # Default location of the FG-NET images, used when no directory is passed
    # to get_age_prediction_data().
    DEFAULT_FGNET_DIR = "C:\\Users\\agata\\Documents\\pwr\\sem7\\neural networks\\lab1\\data\\FGNET\\FGNET\\images"

    # The age is the run of digits following "A" in the file name, optionally
    # followed by a single non-digit character before the ".JPG" extension.
    _AGE_PATTERN = re.compile(r"A(\d+)\D?\.JPG")

    # (width, height) every image is resized to before flattening.
    _IMAGE_SIZE = (64, 48)

    def __init__(self):
        # Define the split proportions
        self.train_ratio = 0.6  # 60% for training
        self.val_ratio = 0.2    # 20% for validation
        self.test_ratio = 0.2   # 20% for testing

    def min_max_normalisation(self, col):
        """Rescale *col* linearly so its minimum maps to 0 and maximum to 1.

        Args:
            col: An array-like supporting ``.min()``, ``.max()`` and
                element-wise arithmetic (e.g. a pandas Series or NumPy array).

        Returns:
            The rescaled values. A constant column (max == min) is mapped to
            all zeros instead of producing 0/0 = NaN.
        """
        minn = col.min()
        span = col.max() - minn
        if span == 0:
            # Avoid 0/0: a constant column carries no information, and NaNs
            # here would later cause every row to be dropped as "missing".
            span = 1
        return (col - minn) / span

    def _split_bounds(self, total_samples):
        """Return ``(train_end, val_end)`` slice bounds for *total_samples*."""
        train_size = int(self.train_ratio * total_samples)
        val_size = int(self.val_ratio * total_samples)
        return train_size, train_size + val_size

    @classmethod
    def _parse_age(cls, filename):
        """Return the age encoded in *filename* as an int, or None if absent."""
        match = cls._AGE_PATTERN.search(filename)
        return int(match.group(1)) if match else None

    def get_heart_disease_data(self):
        """Fetch the UCI dataset with id 45 and store its train/val/test split.

        Features are min-max normalised per column, rows containing any
        missing feature are removed, and the target is binarised (values
        greater than 0 become 1, everything else 0).
        """
        # fetch dataset
        heart_disease = fetch_ucirepo(id=45)

        # data (as pandas dataframes)
        X = heart_disease.data.features
        y = heart_disease.data.targets

        # normalization
        # NOTE(review): min/max are computed over the whole dataset, i.e.
        # before the split, so validation/test rows influence the scaling.
        X = X.apply(self.min_max_normalisation)

        # preprocess: drop every sample that has at least one missing feature
        null_indices = X.index[X.isna().any(axis=1)]
        X = X.drop(null_indices)
        y = y.drop(null_indices)
        y = (y > 0).astype(int)

        train_end, val_end = self._split_bounds(len(X))

        self.X_train = X.iloc[:train_end, :].to_numpy()         # Features for training
        self.y_train = y.iloc[:train_end, :].to_numpy()         # Target for training
        self.X_val = X.iloc[train_end:val_end, :].to_numpy()    # Features for validation
        self.y_val = y.iloc[train_end:val_end, :].to_numpy()    # Target for validation
        self.X_test = X.iloc[val_end:, :].to_numpy()            # Features for testing
        self.y_test = y.iloc[val_end:, :].to_numpy()            # Target for testing

        print("data loading done")

    def get_age_prediction_data(self, directory=None):
        """Load face images from *directory* and store their train/val/test split.

        Each regular file whose name matches the age pattern is resized to
        64x48, converted to greyscale if it is RGB, flattened and scaled to
        the range [0, 1]. Files without an age in their name, and images whose
        mode is neither "RGB" nor "L", are skipped.

        Args:
            directory: Folder containing the images. Defaults to
                ``DEFAULT_FGNET_DIR`` when omitted.
        """
        if directory is None:
            directory = self.DEFAULT_FGNET_DIR

        x = []
        y = []
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sequential train/val/test split reproducible across runs/platforms.
        for filename in sorted(os.listdir(directory)):
            f = os.path.join(directory, filename)

            # checking if it is a file
            if not os.path.isfile(f):
                continue

            age = self._parse_age(filename)
            if age is None:
                continue

            # The context manager closes the underlying file handle, which the
            # lazily-loading Image.open() would otherwise keep open.
            with Image.open(f) as image:
                if image.mode not in ("RGB", "L"):
                    # incorrect format, move on
                    continue

                image = image.resize(self._IMAGE_SIZE)
                if image.mode == "RGB":
                    image = image.convert('L')

                # -- Normalization: true division keeps the grey levels as
                # floats in [0, 1]; floor division would zero out every pixel
                # that is not exactly 255.
                pixel_values = np.asarray(image, dtype=np.float64).ravel() / 255.0

            y.append(age)
            x.append(pixel_values)

        train_end, val_end = self._split_bounds(len(x))

        self.X_train = np.array(x[:train_end])         # Features for training
        self.y_train = np.array(y[:train_end])         # Target for training
        self.X_val = np.array(x[train_end:val_end])    # Features for validation
        self.y_val = np.array(y[train_end:val_end])    # Target for validation
        self.X_test = np.array(x[val_end:])            # Features for testing
        self.y_test = np.array(y[val_end:])            # Target for testing

        print("data loading done")