In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

class Dataset:
    """In-memory tabular dataset: a feature matrix plus an optional label vector.

    Attributes:
        X: NumPy array of feature values (one row per sample).
        y: NumPy array of labels; empty when the dataset is unlabeled.
        feature_names: NumPy array with one name per column of ``X``.
        label_name: Name of the label column, or ``None`` when unlabeled.
    """

    def __init__(self, X=None, y=None, feature_names=None, label_name=None):
        """Store the given data, converting array-likes to NumPy arrays.

        Any argument left as ``None`` becomes an empty array (``label_name``
        simply stays ``None``).
        """
        self.X = np.array(X) if X is not None else np.array([])
        self.y = np.array(y) if y is not None else np.array([])
        self.feature_names = np.array(feature_names) if feature_names is not None else np.array([])
        self.label_name = label_name

    # Getters and setters
    def set_X(self, X):
        """Replace the feature matrix."""
        self.X = np.array(X)

    def set_y(self, y):
        """Replace the label vector."""
        self.y = np.array(y)

    def set_feature_names(self, feature_names):
        """Replace the feature (column) names."""
        self.feature_names = np.array(feature_names)

    def set_label_name(self, label_name):
        """Set the name of the label column."""
        self.label_name = label_name

    def get_X(self):
        """Return the feature matrix."""
        return self.X

    def get_y(self):
        """Return the label vector."""
        return self.y

    def get_feature_names(self):
        """Return the feature (column) names."""
        return self.feature_names

    def get_label_name(self):
        """Return the name of the label column."""
        return self.label_name

    def _features_frame(self):
        """Build a DataFrame view of ``X`` labelled with ``feature_names``.

        An empty, non-2D ``X`` (the constructor default) yields an empty frame
        instead of a shape-mismatch error, and missing feature names fall back
        to pandas' default integer column labels.
        """
        features = np.asarray(self.X)
        names = np.asarray(self.feature_names).tolist()
        if features.ndim != 2 and features.size == 0:
            return pd.DataFrame(columns=names)
        return pd.DataFrame(features, columns=names if names else None)

    # Reading a dataset from CSV/TSV
    def read_csv(self, filepath, delimiter=','):
        """Load ``X``, ``y`` and ``feature_names`` from a delimited text file.

        The column called ``label_name`` becomes ``y`` and every other column
        becomes a feature. When ``label_name`` is ``None`` the file is treated
        as unlabeled: all columns are features and ``y`` is left empty.

        Raises:
            KeyError: if ``label_name`` is set but is not a column of the file.
        """
        df = pd.read_csv(filepath, delimiter=delimiter)
        if self.label_name is None:
            features = df
            labels = np.array([])
        else:
            # Drop once and reuse the result for both the values and the names.
            features = df.drop(columns=[self.label_name])
            labels = df[self.label_name].values
        # Assign only after every lookup succeeded, so a failed read leaves
        # the dataset untouched.
        self.X = features.values
        self.y = labels
        self.feature_names = features.columns.values

    def read_tsv(self, filepath):
        """Load the dataset from a tab-separated file (see ``read_csv``)."""
        self.read_csv(filepath, delimiter='\t')

    # Writing a dataset to CSV/TSV
    def write_csv(self, filepath, delimiter=','):
        """Write the features, followed by the label column, to a delimited file.

        The label column is written last under ``label_name`` (or ``'label'``
        when no name was set) and is omitted entirely when ``y`` is empty.
        """
        # pd.concat only accepts Series/DataFrame objects, so the NumPy arrays
        # must be wrapped in a DataFrame before they can be written.
        frame = self._features_frame()
        labels = np.asarray(self.y)
        if labels.size:
            label_column = self.label_name if self.label_name is not None else 'label'
            frame[label_column] = labels
        frame.to_csv(filepath, index=False, sep=delimiter)

    def write_tsv(self, filepath):
        """Write the dataset to a tab-separated file (see ``write_csv``)."""
        self.write_csv(filepath, delimiter='\t')

    # Statistics about the variables
    def describe(self):
        """Return ``DataFrame.describe()`` for the feature matrix.

        Column dtypes are whatever ``X`` holds: an object-dtype ``X`` (e.g.
        mixed text/number data) is summarised with count/unique/top/freq.
        """
        return self._features_frame().describe()

    # Count of missing values
    def count_missing_values(self):
        """Return a Series with the number of missing values per feature."""
        return self._features_frame().isnull().sum()

    # Replacement of missing values (imputation)
    def replace_missing_values(self):
        """Fill missing feature values in place and store the result in ``X``.

        Numeric columns are filled with the column mean; every other column is
        filled with its most frequent value (the smallest one on ties). Columns
        that contain only missing values are left unchanged.
        """
        frame = self._features_frame()
        if frame.empty:
            return
        # X is usually an object array when the data mixes text and numbers;
        # recover per-column dtypes so numeric columns are recognised as such.
        frame = frame.infer_objects()
        for column in frame.columns:
            series = frame[column]
            if not series.isnull().any():
                continue
            if pd.api.types.is_numeric_dtype(series):
                fill_value = series.mean()
            else:
                modes = series.mode(dropna=True)
                if modes.empty:
                    # Nothing observed in this column, so nothing to impute with.
                    continue
                fill_value = modes.iloc[0]
            frame[column] = series.fillna(fill_value)
        # Persist the imputed values; without this the work is thrown away.
        self.X = frame.to_numpy()


Utilização do dataset titanic.csv para testar a classe Dataset

In [2]:
# Load the Titanic passengers, using 'Survived' as the label column.
titanic_data = Dataset(label_name='Survived')
titanic_data.read_csv('titanic.csv')

# Pull each piece out through the accessors so the cells below can inspect it.
X, y = titanic_data.get_X(), titanic_data.get_y()
feature_names, label_name = titanic_data.get_feature_names(), titanic_data.get_label_name()

In [3]:
# Preview the first five rows of the feature matrix.
print("Matriz X:", X[:5], sep="\n")

Matriz X:
[[1 3 'Braund, Mr. Owen Harris' 'male' 22.0 1 0 'A/5 21171' 7.25 nan 'S']
 [2 1 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)' 'female' 38.0
  1 0 'PC 17599' 71.2833 'C85' 'C']
 [3 3 'Heikkinen, Miss. Laina' 'female' 26.0 0 0 'STON/O2. 3101282' 7.925
  nan 'S']
 [4 1 'Futrelle, Mrs. Jacques Heath (Lily May Peel)' 'female' 35.0 1 0
  '113803' 53.1 'C123' 'S']
 [5 3 'Allen, Mr. William Henry' 'male' 35.0 0 0 '373450' 8.05 nan 'S']]


In [4]:
# Preview the first five labels.
print("Vetor y:", y[:5], sep="\n")

Vetor y:
[0 1 1 1 0]


In [5]:
# List the feature column names read from the file.
print("Nomes das features:", feature_names, sep="\n")

Nomes das features:
['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']


In [6]:
# Show which column is used as the label.
print("Nome da label:", label_name, sep="\n")

Nome da label:
Survived


In [7]:
# Print the per-feature summary statistics returned by Dataset.describe().
print(titanic_data.describe())

        PassengerId  Pclass                     Name   Sex    Age  SibSp  \
count           891     891                      891   891  714.0    891   
unique          891       3                      891     2   88.0      7   
top               1       3  Braund, Mr. Owen Harris  male   24.0      0   
freq              1     491                        1   577   30.0    608   

        Parch  Ticket    Fare    Cabin Embarked  
count     891     891  891.00      204      889  
unique      7     681  248.00      147        3  
top         0  347082    8.05  B96 B98        S  
freq      678       7   43.00        4      644  


In [8]:
# Print how many missing values each feature column contains.
print(titanic_data.count_missing_values())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [9]:
# Run the imputation step for the features flagged above as having missing
# values (Age, Cabin, Embarked).
titanic_data.replace_missing_values()