O código implementado define a classe Dataset, que pode ser usada para manipular dados de conjuntos de dados.

A classe inclui métodos para ler dados de arquivos CSV ou TSV, escrever dados em arquivos CSV ou TSV, descrever as colunas do conjunto de dados (por exemplo, a média, mediana, desvio padrão, etc.), substituir valores em colunas, contar valores nulos em colunas, e substituir valores nulos com a média das colunas. Além disso, a classe inclui métodos para obter ou definir os valores de X, y, features, e label.

In [1]:
import numpy as np


class Dataset:

    def __init__(self):
        self.X = np.array([])
        self.y = np.array([])
        self.features = []
        self.label = ''

    def get_X(self):
        return self.X

    def set_X(self, new):
        self.X = new

    def get_y(self):
        return self.y

    def get_features(self):
        return self.features

    def get_label(self):
        return self.label

    def read_csv(self, file, label=None, delimiter=','):
        data = np.genfromtxt(file, delimiter=delimiter, names=True, dtype=None, encoding=None)
        self.features = list(data.dtype.names)
        self.label = self.features[len(self.features) - 1]
        self.features.remove(self.label)
        self.X = np.vstack([data[f] for f in self.features]).T
        self.y = data[self.label]

    def read_tsv(self, file, label=None):
        self.read_csv(file, label, '\t')

    def write_csv(self, file, delimiter=','):
        data = np.hstack((self.X, self.y.reshape(-1, 1)))
        header = self.features + [self.label]
        fmt = ["%.18e" if col.dtype.kind in {'f', 'c'} else "%s" for col in data.T]
        np.savetxt(file, data, delimiter=delimiter, header=delimiter.join(header), fmt=fmt, comments='')

    def write_tsv(self, file):
        self.write_csv(file, '\t')

    def describe(self):
        for i, feature in enumerate(self.features):
            print(feature)
            collumn = self.X[:, i] if i < self.X.shape[1] else self.y
            if np.issubdtype(collumn.dtype, np.number):
                print(" Mean: ", np.nanmean(collumn))
                print(" Median: ", np.nanmedian(collumn))
                print(" Standard Deviation: ", format(np.std(collumn), '.4f'))
                print(" Minimum: ", np.nanmin(collumn))
                print(" Maximum: ", np.nanmax(collumn))
            else:
                unique_vals, counts = np.unique(collumn[collumn == collumn], return_counts=True)
                print(" - Number of Unique Values: ", len(unique_vals))
                print(" - Most Frequent Value: ", unique_vals[np.argmax(counts)])
                

    def replace_to_null(self, value):
        # Replaces the chosen number with null
        self.X = np.where(self.X == value, np.nan, self.X)
        self.y = np.where(self.y == value, np.nan, self.y)

    def replace_nulls(self, value):
        # Replaces every null value with the one chosen
        self.X = np.where(self.X != self.X, value, self.X)
        self.y = np.where(self.y != self.y, value, self.y)

    def count_nulls(self):
        null_count = np.zeros(self.X.shape[1] + 1, dtype=int)
        for i in range(self.X.shape[0]):
            for j in range(self.X.shape[1]):
                val = self.X[i, j]
                if val == '' or val is None or (np.issubdtype(type(val), np.number) and np.isnan(val)):
                    null_count[j] += 1
        if self.label is not None:
            for i in range(len(self.y)):
                val = self.y[i]
                if val == '' or val is None or (np.issubdtype(type(val), np.number) and np.isnan(val)):
                    null_count[-1] += 1
        for i, feature in enumerate(self.features + [self.label] if self.label else []):
            print("{} Valores nulos: {}".format(feature, null_count[i]))
            if null_count[i] == len(self.X if i < len(self.features) else self.y):
                print(" Todos os valores são nulos.")


    def replace_nulls_with_mean(self):
        for i, feature in enumerate(self.features):
            var = self.X[:, i]
            if np.issubdtype(var.dtype, np.number):
                mean = np.nanmean(var)
                self.X[:, i] = np.where(np.isnan(var), mean, var)
            else:
                mode = self.most_frequent(var)
                self.X[:, i] = np.where(var != var, mode, var)

        if self.label is not None:
            var = self.y
            if np.issubdtype(var.dtype, np.number):
                mean = np.nanmean(var)
                self.y = np.where(np.isnan(var), mean, var)
            else:
                mode = self.most_frequent(var)
                self.y = np.where(var != var, mode, var)

    def most_frequent(self, arr):
          unique_vals, counts = np.unique(arr[arr == arr], return_counts=True)
          return unique_vals[np.argmax(counts)]

**Exemplo de uso**

In [2]:
# Criação de um objeto da classe Dataset
dataset = Dataset()

# Criação de um array com dados fictícios
data = np.array([(1, 2, 3, 4, 'A'),
                 (2, 3, 4, np.nan, 'B'),
                 (3, 4, np.nan, np.nan, 'A'),
                 (4, np.nan, np.nan, np.nan, 'B')],
                dtype=[('Feature 1', 'f8'),
                       ('Feature 2', 'f8'),
                       ('Feature 3', 'f8'),
                       ('Feature 4', 'f8'),
                       ('Target', 'U1')])

# Definição dos dados como atributos do objeto da classe
dataset.X = np.vstack([data[f] for f in data.dtype.names[:-1]]).T
dataset.y = data['Target']
dataset.features = list(data.dtype.names[:-1])
dataset.label = 'Target'

# Visualização das estatísticas descritivas dos dados
dataset.describe()

# Substituição de valores nulos pela sua média
dataset.replace_nulls_with_mean()

# Contagem de valores nulos após as substituições
dataset.count_nulls()

Feature 1
 Mean:  2.5
 Median:  2.5
 Standard Deviation:  1.1180
 Minimum:  1.0
 Maximum:  4.0
Feature 2
 Mean:  3.0
 Median:  3.0
 Standard Deviation:  nan
 Minimum:  2.0
 Maximum:  4.0
Feature 3
 Mean:  3.5
 Median:  3.5
 Standard Deviation:  nan
 Minimum:  3.0
 Maximum:  4.0
Feature 4
 Mean:  4.0
 Median:  4.0
 Standard Deviation:  nan
 Minimum:  4.0
 Maximum:  4.0
Feature 1 Valores nulos: 0
Feature 2 Valores nulos: 0
Feature 3 Valores nulos: 0
Feature 4 Valores nulos: 0
Target Valores nulos: 0
