In [1]:
import numpy as np
import pandas as pd

class VarianceThreshold:
    """Select the features of a DataFrame whose variance exceeds a threshold.

    Numeric columns are used as-is; categorical columns are one-hot encoded
    with ``pd.get_dummies`` first, so every indicator column is judged on its
    own variance.

    Parameters
    ----------
    threshold : float, default 0.0
        Encoded columns are kept only when their variance is strictly greater
        than this value.

    Attributes
    ----------
    variances_ : pandas.Series or None
        Population variance (``ddof=0``, missing values skipped) of every
        encoded column, indexed by encoded column name. ``None`` until
        ``fit`` has been called.
    selected_features_ : numpy.ndarray or None
        Integer positions, within the encoded column layout learned by
        ``fit``, of the columns that pass the threshold. ``None`` until
        ``fit`` or ``transform`` has been called.
    """

    # Column dtypes that get one-hot encoded. These are the dtypes that
    # pd.get_dummies converts by default, so nothing it would encode is dropped.
    _CATEGORICAL_DTYPES = ('object', 'string', 'category')

    def __init__(self, threshold=0.0):
        self.threshold = threshold
        self.variances_ = None
        self.selected_features_ = None

    def _encode(self, X):
        """Return the numeric columns of X followed by its one-hot encoded categorical columns."""
        num_features = X.select_dtypes(include=np.number)
        cat_features = X.select_dtypes(include=list(self._CATEGORICAL_DTYPES))
        cat_features_encoded = pd.get_dummies(cat_features)
        return pd.concat([num_features, cat_features_encoded], axis=1)

    def _selected_positions(self):
        """Return the positions of the fitted columns whose variance is above the threshold."""
        return np.where(np.asarray(self.variances_ > self.threshold))[0]

    def fit(self, X):
        """Learn the variance of every encoded feature of X.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; may mix numeric and categorical columns.

        Returns
        -------
        VarianceThreshold
            This instance, to allow call chaining.
        """
        X_encoded = self._encode(X)

        # Population variance per column. The resulting Series index doubles as
        # the record of the encoded column layout that transform must reproduce.
        self.variances_ = X_encoded.var(axis=0, ddof=0)
        self.selected_features_ = self._selected_positions()
        return self

    def transform(self, X):
        """Encode X and keep only the features that passed the threshold in ``fit``.

        The encoded frame is aligned to the column layout learned by ``fit``
        before selecting, so the result always has the same columns regardless
        of which category levels happen to be present in X. Encoded columns not
        seen during ``fit`` are dropped; fitted columns missing from X
        (typically a category level that does not occur in X) are added filled
        with 0.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to reduce.

        Returns
        -------
        pandas.DataFrame
            The selected encoded columns of X.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.variances_ is None:
            raise RuntimeError(
                "VarianceThreshold instance is not fitted yet; call fit() before transform()."
            )

        X_encoded = self._encode(X)

        # Positional selection is only meaningful when the columns line up with
        # the ones the variances were computed for.
        fitted_columns = self.variances_.index
        if not X_encoded.columns.equals(fitted_columns):
            X_encoded = X_encoded.reindex(columns=fitted_columns, fill_value=0)

        # Recomputed here so a threshold changed after fit() is still honoured.
        self.selected_features_ = self._selected_positions()
        X_selected = X_encoded.iloc[:, self.selected_features_]
        return X_selected

    def fit_transform(self, X):
        """Fit on X, then return the reduced version of X."""
        self.fit(X)
        X_selected = self.transform(X)
        return X_selected

In [3]:
# Load titanic.csv into a DataFrame.
df = pd.read_csv('titanic.csv')

# Build a selector with a 0.2 cutoff, then fit it and filter the same frame,
# keeping only the encoded columns whose variance is above the cutoff.
# The cutoff is compared against the raw column values; no scaling is applied here.
selector = VarianceThreshold(threshold=0.2)
X_selected = selector.fit_transform(df)

# Show the names of the columns that survived the filter.
print('Features selecionadas:', X_selected.columns, sep='\n')

Features selecionadas:
Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_S'],
      dtype='object')
