Implementação da classe Dataset

In [12]:
#importação de bibliotecas
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [13]:
class Dataset:
    """Tabular dataset split into a feature matrix ``X`` and a label vector ``y``.

    Typical use: call ``set_label_name`` with the name of the label column,
    then ``read_csv``/``read_tsv`` to load a file. The remaining methods
    inspect, clean and write back the loaded data.
    """

    def __init__(self):
        """Create an empty dataset with no features, no labels and no names."""
        # Feature matrix: one column per feature.
        self.X = pd.DataFrame()
        # Label vector. The explicit dtype avoids the pandas warning about the
        # default dtype of an empty Series.
        self.y = pd.Series(dtype="object")
        # Column names of X, refreshed by read_csv.
        self.feature_names = []
        # Name of the column that read_csv splits off as the label.
        self.label_name = ''

    # Getters and setters
    def set_X(self, X):
        """Replace the feature matrix (feature_names is not touched)."""
        self.X = X

    def set_y(self, y):
        """Replace the label vector."""
        self.y = y

    def set_feature_names(self, feature_names):
        """Replace the stored list of feature names."""
        self.feature_names = feature_names

    def set_label_name(self, label_name):
        """Set the name of the label column used by read_csv/read_tsv."""
        self.label_name = label_name

    def get_X(self):
        """Return the feature matrix."""
        return self.X

    def get_y(self):
        """Return the label vector."""
        return self.y

    def get_feature_names(self):
        """Return the stored list of feature names."""
        return self.feature_names

    def get_label_name(self):
        """Return the name of the label column."""
        return self.label_name

    # Reading a dataset from CSV/TSV
    def read_csv(self, filepath, delimiter=','):
        """Load a delimited file and split it into features and label.

        Args:
            filepath: Path (or file-like object) passed on to ``pandas.read_csv``.
            delimiter: Field separator; defaults to a comma.

        Raises:
            KeyError: If the configured label name is not a column of the file
                (for example because ``set_label_name`` was never called).
        """
        df = pd.read_csv(filepath, delimiter=delimiter)
        # Fail early with an actionable message instead of the opaque
        # "not found in axis" KeyError that DataFrame.drop would raise.
        if self.label_name not in df.columns:
            raise KeyError(
                f"Label column {self.label_name!r} not found in {filepath!r}; "
                "call set_label_name() with one of the file's columns before reading."
            )
        self.X = df.drop(columns=self.label_name)
        self.y = df[self.label_name]
        self.feature_names = list(self.X.columns)

    def read_tsv(self, filepath):
        """Load a tab-separated file; see ``read_csv``."""
        self.read_csv(filepath, delimiter='\t')

    # Writing a dataset to CSV/TSV
    def write_csv(self, filepath, delimiter=','):
        """Write the features followed by the label column to a delimited file.

        Args:
            filepath: Destination passed on to ``DataFrame.to_csv``.
            delimiter: Field separator; defaults to a comma.
        """
        labels = self.y
        # A label Series without a name would be written under the header "0";
        # use the configured label name instead when one is available.
        if isinstance(labels, pd.Series) and labels.name is None and self.label_name:
            labels = labels.rename(self.label_name)
        df = pd.concat([self.X, labels], axis=1)
        df.to_csv(filepath, index=False, sep=delimiter)

    def write_tsv(self, filepath):
        """Write a tab-separated file; see ``write_csv``."""
        self.write_csv(filepath, delimiter='\t')

    # Statistics about the variables
    def describe(self):
        """Return ``DataFrame.describe()`` of the feature matrix."""
        return self.X.describe()

    # Missing-value counts
    def count_missing_values(self):
        """Return the number of null entries in each feature column."""
        return self.X.isnull().sum()

    # Missing-value imputation
    def replace_missing_values(self):
        """Fill null entries of the feature matrix, column by column.

        Numeric columns are filled with the column mean; every other column is
        filled with its most frequent value (the smallest one on ties, as
        returned first by ``Series.mode``). Columns without null entries are
        left untouched, so their dtype is preserved. Columns that contain only
        null entries have nothing to impute from and are left unchanged.
        """
        for column in self.X.columns:
            values = self.X[column]
            if not values.isnull().any():
                # Nothing to fill; skipping also keeps integer columns integer.
                continue
            if pd.api.types.is_numeric_dtype(values):
                fill_value = values.mean()
            else:
                modes = values.mode(dropna=True)
                fill_value = modes.iloc[0] if not modes.empty else np.nan
            if pd.isna(fill_value):
                # Column is entirely null: no statistic is available.
                continue
            self.X[column] = values.fillna(fill_value)


Exemplo de aplicação com o dataset Titanic fornecido em DAA

In [14]:
# Build the dataset. The label name is set before reading because read_csv
# uses it to separate the label column from the feature columns.
titanic_data = Dataset()
titanic_data.set_label_name('Survived')
titanic_data.read_csv('titanic.csv')
# Keep local references to the loaded parts for the inspection cells below.
X = titanic_data.get_X()
y = titanic_data.get_y()
feature_names = titanic_data.get_feature_names()
label_name = titanic_data.get_label_name()

  self.y = pd.Series()


In [15]:
# Preview the first rows of the feature matrix.
print("Matriz X:")
print(X.head())

Matriz X:
   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


In [16]:
# Preview the first entries of the label vector.
print("Vetor y:")
print(y.head())

Vetor y:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [17]:
# List the feature column names collected by read_csv.
print("Nomes das features:")
print(feature_names)

Nomes das features:
['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [18]:
# Show the label column name configured via set_label_name.
print("Nome da label:")
print(label_name)

Nome da label:
Survived


In [19]:
# Summary statistics of the feature matrix (DataFrame.describe on X).
print(titanic_data.describe())

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [20]:
# Count null entries per feature column before imputation.
print(titanic_data.count_missing_values())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [21]:
# Fill the null feature values in place: numeric columns get the column mean,
# text columns get their most frequent value.
titanic_data.replace_missing_values()

Verificar se os valores nulos foram substituídos

In [22]:
# Re-count null entries per feature column to confirm the imputation.
print(titanic_data.count_missing_values())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [23]:
# Write the features together with the label column to a new CSV file.
titanic_data.write_csv('titanic_clean.csv')