In [1]:
import numpy as np

In [2]:
class Dataset:
    """In-memory tabular dataset: a feature matrix ``X`` plus an optional label vector ``y``.

    Attributes:
        X: 2-D numpy array of feature values (rows are samples), or None.
        y: 1-D numpy array of labels, or None.
        feature_names: list with one name per column of ``X``, or None.
        label_name: name of the label column, or None.
    """

    def __init__(self, X=None, y=None, feature_names=None, label_name=None):
        """Store the given arrays and names as-is (no copies, no validation)."""
        self.X = X
        self.y = y
        self.feature_names = feature_names
        self.label_name = label_name

    def get_X(self):
        """Return the feature matrix."""
        return self.X

    def set_X(self, X):
        """Replace the feature matrix."""
        self.X = X

    def get_y(self):
        """Return the label vector."""
        return self.y

    def set_y(self, y):
        """Replace the label vector."""
        self.y = y

    def get_feature_names(self):
        """Return the list of feature names."""
        return self.feature_names

    def set_feature_names(self, feature_names):
        """Replace the list of feature names."""
        self.feature_names = feature_names

    def get_label_name(self):
        """Return the name of the label column."""
        return self.label_name

    def set_label_name(self, label_name):
        """Replace the name of the label column."""
        self.label_name = label_name

    def _feature_label(self, index):
        """Return the display name of column ``index``.

        Falls back to a positional name when ``feature_names`` is None, which is
        the state left behind by ``read_csv(header=False)``.
        """
        if self.feature_names is None:
            return 'column_{}'.format(index)
        return self.feature_names[index]

    def read_csv(self, file_path, delimiter=',', header=True):
        """Load the dataset from a delimited text file.

        Args:
            file_path: anything ``np.genfromtxt`` accepts as its first argument.
            delimiter: column separator.
            header: when truthy, the first line holds the column names, the last
                column becomes ``y`` and the remaining columns become ``X``.
                Otherwise every column is stored in ``X`` and ``y``,
                ``feature_names`` and ``label_name`` are reset to None.
        """
        # genfromtxt documents `names` as None, True, a string or a sequence;
        # a literal False is not one of them, so "no header" is mapped to None.
        names = True if header else None
        data = np.genfromtxt(file_path, delimiter=delimiter, dtype=None, names=names)

        if header:
            # A file with a single data row comes back as a 0-d structured
            # array, which cannot be iterated; promote it to one row.
            rows = np.atleast_1d(data)
            self.X = np.array([tuple(row)[:-1] for row in rows])
            self.y = np.array([tuple(row)[-1] for row in rows])
            self.feature_names = list(data.dtype.names[:-1])
            self.label_name = data.dtype.names[-1]
        else:
            self.X = data
            self.y = None
            self.feature_names = None
            self.label_name = None

    def write_csv(self, file_path, delimiter=',', header=True):
        """Write the dataset to a delimited text file.

        Args:
            file_path: anything ``np.savetxt`` accepts as its first argument.
            delimiter: column separator, used for the data rows and the header.
            header: when truthy, write ``X`` and ``y`` side by side under a
                header line built from ``feature_names`` and ``label_name``
                (both must be set). Otherwise only ``X`` is written, with no
                header line.
        """
        if header:
            data = np.column_stack((self.X, self.y))
            # The header must use the same separator as the data rows,
            # otherwise the file cannot be parsed back consistently.
            column_names = list(self.feature_names) + [self.label_name]
            np.savetxt(file_path, data, delimiter=delimiter,
                       header=delimiter.join(column_names), comments='')
        else:
            np.savetxt(file_path, self.X, delimiter=delimiter)

    def describe(self):
        """Print summary statistics for every column of ``X``.

        The reductions used here propagate NaN, so a column that contains a
        missing value reports nan for minimum, maximum, mean and standard
        deviation.
        """
        if self.X is None:
            print('No data available.')
            return

        for i in range(self.X.shape[1]):
            feature_values = self.X[:, i]
            print('Feature name:', self._feature_label(i))
            print('Type:', feature_values.dtype)
            print('Minimum:', np.min(feature_values))
            print('Maximum:', np.max(feature_values))
            print('Mean:', np.mean(feature_values))
            print('Standard deviation:', np.std(feature_values))
            print('Missing values:', np.sum(np.isnan(feature_values)))
            print('Unique values:', np.unique(feature_values))

    def count_nulls(self):
        """Print the number of NaN entries in every column of ``X``."""
        if self.X is None:
            print('No data available.')
            return

        for i in range(self.X.shape[1]):
            feature_values = self.X[:, i]
            print('Feature name:', self._feature_label(i))
            print('Number of nulls:', np.sum(np.isnan(feature_values)))

    def replace_nulls(self, method='most_common'):
        """Fill NaN entries of ``X`` in place, column by column.

        Args:
            method: ``'most_common'`` fills with the most frequent non-missing
                value of the column (the smallest one on ties); ``'mean'`` fills
                with the mean of the non-missing values.

        Raises:
            ValueError: if ``method`` is not one of the supported strategies.

        Columns without missing values, and columns with no observed value to
        derive a replacement from, are left untouched.
        """
        if method not in ('most_common', 'mean'):
            raise ValueError(
                "Unknown method {!r}; expected 'most_common' or 'mean'.".format(method)
            )

        if self.X is None:
            print('No data available.')
            return

        for i in range(self.X.shape[1]):
            missing = np.isnan(self.X[:, i])
            if not missing.any():
                continue

            observed = self.X[~missing, i]
            if observed.size == 0:
                # Entire column is missing: there is nothing to impute from.
                continue

            if method == 'most_common':
                # Count only observed values. np.unique sorts them, so argmax
                # resolves ties in favour of the smallest value.
                values, counts = np.unique(observed, return_counts=True)
                fill_value = values[counts.argmax()]
            else:
                fill_value = observed.mean()

            self.X[missing, i] = fill_value


In [3]:
# Build a small example dataset; the third feature has one missing value (NaN)
label_name = 'label'
feature_names = ['feature_1', 'feature_2', 'feature_3']
y = np.array([0, 1, 0])
X = np.array([
    [1, 2, 3],
    [4, 5, np.nan],
    [7, 8, 9],
])
dataset = Dataset(feature_names=feature_names, label_name=label_name, X=X, y=y)

# Show each field of the dataset under its own heading
print('X:', dataset.X, sep='\n')
print('y:', dataset.y, sep='\n')
print('Feature names:', dataset.feature_names, sep='\n')
print('Label name:', dataset.label_name, sep='\n')

X:
[[ 1.  2.  3.]
 [ 4.  5. nan]
 [ 7.  8.  9.]]
y:
[0 1 0]
Feature names:
['feature_1', 'feature_2', 'feature_3']
Label name:
label


In [4]:
# Print per-feature summary statistics; a feature containing NaN reports nan
# for min/max/mean/std because describe() uses NaN-propagating reductions.
dataset.describe()

Feature name: feature_1
Type: float64
Minimum: 1.0
Maximum: 7.0
Mean: 4.0
Standard deviation: 2.449489742783178
Missing values: 0
Unique values: [1. 4. 7.]
Feature name: feature_2
Type: float64
Minimum: 2.0
Maximum: 8.0
Mean: 5.0
Standard deviation: 2.449489742783178
Missing values: 0
Unique values: [2. 5. 8.]
Feature name: feature_3
Type: float64
Minimum: nan
Maximum: nan
Mean: nan
Standard deviation: nan
Missing values: 1
Unique values: [ 3.  9. nan]


In [5]:
# Print how many NaN entries each feature column contains.
dataset.count_nulls()

Feature name: feature_1
Number of nulls: 0
Feature name: feature_2
Number of nulls: 0
Feature name: feature_3
Number of nulls: 1


In [6]:
# Fill missing entries with the mean of their column, then show the updated matrix
dataset.replace_nulls(method='mean')
print('X after replacing nulls with mean:', dataset.X, sep='\n')

X after replacing nulls with mean:
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
