Implementação da classe Dataset

In [None]:
# Import pandas, NumPy and scikit-learn's SimpleImputer.
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [None]:
class Dataset:
    """Tabular dataset split into a feature table ``X`` and a label column ``y``.

    Attributes:
        X: ``pandas.DataFrame`` holding the feature columns.
        y: ``pandas.Series`` holding the label values.
        feature_names: list with the names of the columns of ``X``.
        label_name: name of the column that ``read_csv``/``read_tsv`` split
            off as the label.
    """

    def __init__(self):
        """Create an empty dataset; fill it through the setters or the readers."""
        self.X = pd.DataFrame()
        # An explicit dtype avoids the DeprecationWarning that pandas < 2.0
        # raises for a dtype-less empty Series.
        self.y = pd.Series(dtype=object)
        self.feature_names = []
        self.label_name = ''

    # Getters and setters
    def set_X(self, X):
        """Replace the feature table (``feature_names`` is not updated)."""
        self.X = X

    def set_y(self, y):
        """Replace the label column."""
        self.y = y

    def set_feature_names(self, feature_names):
        """Replace the stored list of feature names."""
        self.feature_names = feature_names

    def set_label_name(self, label_name):
        """Set the name of the label column used when reading a file."""
        self.label_name = label_name

    def get_X(self):
        """Return the feature table."""
        return self.X

    def get_y(self):
        """Return the label column."""
        return self.y

    def get_feature_names(self):
        """Return the stored list of feature names."""
        return self.feature_names

    def get_label_name(self):
        """Return the name of the label column."""
        return self.label_name

    # Reading a dataset from CSV/TSV
    def read_csv(self, filepath, delimiter=','):
        """Load a delimited file and split it into features and label.

        The column named ``self.label_name`` becomes ``y``; every other column
        becomes part of ``X`` and ``feature_names`` is refreshed accordingly.

        Args:
            filepath: path or file-like object accepted by ``pandas.read_csv``.
            delimiter: field separator (default ``','``).

        Raises:
            KeyError: if ``label_name`` is unset or is not a column of the
                file. The dataset is left unchanged in that case.
        """
        df = pd.read_csv(filepath, delimiter=delimiter)
        try:
            features = df.drop(self.label_name, axis=1)
            labels = df[self.label_name]
        except KeyError as exc:
            # Re-raise the same exception type with an actionable message.
            raise KeyError(
                f"label column {self.label_name!r} not found; call "
                f"set_label_name() with one of {list(df.columns)}"
            ) from exc
        self.X = features
        self.y = labels
        self.feature_names = list(features.columns)

    def read_tsv(self, filepath):
        """Load a tab-separated file; see ``read_csv``."""
        self.read_csv(filepath, delimiter='\t')

    # Writing a dataset to CSV/TSV
    def write_csv(self, filepath, delimiter=','):
        """Write ``X`` and ``y`` side by side to a delimited file.

        Args:
            filepath: path or file-like object accepted by
                ``DataFrame.to_csv``.
            delimiter: field separator (default ``','``).

        The row index is not written.
        """
        df = pd.concat([self.X, self.y], axis=1)
        df.to_csv(filepath, index=False, sep=delimiter)

    def write_tsv(self, filepath):
        """Write a tab-separated file; see ``write_csv``."""
        self.write_csv(filepath, delimiter='\t')

    # Statistics about the variables
    def describe(self):
        """Return ``DataFrame.describe()`` of the feature table."""
        return self.X.describe()

    # Missing-value counts
    def count_missing_values(self):
        """Return the number of missing values in each feature column."""
        return self.X.isnull().sum()

    # Missing-value imputation
    def replace_missing_values(self):
        """Fill missing feature values in place, one column at a time.

        Numeric columns are filled with the column mean; all other columns are
        filled with their most frequent value (the smallest one on ties).
        Columns without missing values are left untouched, so their dtype is
        preserved. Columns made up entirely of missing values are also left
        untouched because no fill value can be derived from them.
        """
        for column in self.X.columns:
            series = self.X[column]
            missing = series.isna()
            if not missing.any() or missing.all():
                continue
            is_numeric = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            if is_numeric:
                fill_value = series.mean()
            else:
                # mode() returns the modes sorted, so position 0 is the smallest.
                fill_value = series.mode(dropna=True).iloc[0]
            # Assign back into self.X so existing references see the change.
            self.X[column] = series.fillna(fill_value)


Exemplo de aplicação com um Dataset titanic fornecido em DAA

In [None]:
# Build a Dataset whose label column is 'Survived' and load it from the CSV file.
titanic_data = Dataset()
# The label name must be set first: read_csv splits the file on this column.
titanic_data.set_label_name('Survived')
titanic_data.read_csv('titanic.csv')

In [None]:
# Summary statistics of the feature columns. Leaving the DataFrame as the
# cell's last expression lets the notebook render it as a table.
titanic_data.describe()

In [None]:
# Number of missing values per feature column, shown as the cell's output.
titanic_data.count_missing_values()

In [None]:
# Fill the missing feature values; this modifies titanic_data.X in place.
titanic_data.replace_missing_values()

In [None]:
# Save the features together with the label column to a new CSV file.
titanic_data.write_csv('titanic_clean.csv')