# 🛠️ Engenharia e Seleção de Features

Este notebook demonstra:
- Seleção de features com `SelectKBest`
- Transformações com `StandardScaler` e `OneHotEncoder`
- Criação de novas features
- Extração de features com PCA


In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

# Synthetic binary-classification dataset: 200 rows, 10 features (5 informative),
# with a fixed seed so the generated values are reproducible.
X, y = make_classification(
    n_samples=200,
    n_features=10,
    n_informative=5,
    random_state=42,
)

# Wrap the array in a DataFrame with columns named feature_0..feature_N and
# attach the labels as the `target` column.
df = pd.DataFrame(X).add_prefix("feature_").assign(target=y)
df.head()


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,target
0,-0.346347,0.615105,-1.599136,-0.025077,-1.835385,-0.071393,-0.0799,-0.088343,2.149471,-0.406723,1
1,1.353891,-2.552883,0.56995,-0.83809,-1.517482,0.915475,0.464008,0.635397,-0.537256,0.721466,0
2,0.136336,2.473938,2.155743,-0.054096,-1.777367,-1.255051,-1.21136,0.836324,5.026974,-1.131453,1
3,-1.471314,-1.610811,1.811375,-1.426168,-0.222879,-1.767294,1.903552,0.328652,-1.092684,0.623549,0
4,0.767585,0.107784,-0.353016,-1.634055,-3.142936,-1.774286,-0.370667,1.313069,4.881878,-1.036576,1


In [2]:
from sklearn.feature_selection import SelectKBest, f_classif

# Separate the feature matrix from the label column.
X = df.drop(columns="target")
y = df["target"]

# Score every feature against the target with f_classif and keep the top 5.
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit(X, y).transform(X)

# Show the selected columns (boolean support mask -> column labels).
selected_columns = X.columns[selector.get_support()]
print("Features selecionadas:", selected_columns.tolist())


Features selecionadas: ['feature_0', 'feature_3', 'feature_4', 'feature_5', 'feature_6']


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Standardize the selected features (zero mean, unit variance per column).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[selected_columns])

# Simulate a categorical feature. A seeded generator is used so that the column
# (and everything derived from it) is identical on every Restart & Run All.
df['categoria'] = np.random.default_rng(42).choice(['A', 'B', 'C'], size=len(df))

# One-hot encode the categorical column into dense 0/1 indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[['categoria']])
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['categoria']),
    index=df.index,  # keep rows aligned with df when concatenating column-wise
)

# Append the indicator columns and drop the raw categorical column, without
# mutating in place so the cell produces the same result when re-run.
df_encoded = pd.concat([df, encoded_df], axis=1).drop(columns=['categoria'])
df_encoded.head()


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,target,categoria_A,categoria_B,categoria_C
0,-0.346347,0.615105,-1.599136,-0.025077,-1.835385,-0.071393,-0.0799,-0.088343,2.149471,-0.406723,1,0.0,1.0,0.0
1,1.353891,-2.552883,0.56995,-0.83809,-1.517482,0.915475,0.464008,0.635397,-0.537256,0.721466,0,1.0,0.0,0.0
2,0.136336,2.473938,2.155743,-0.054096,-1.777367,-1.255051,-1.21136,0.836324,5.026974,-1.131453,1,0.0,0.0,1.0
3,-1.471314,-1.610811,1.811375,-1.426168,-0.222879,-1.767294,1.903552,0.328652,-1.092684,0.623549,0,1.0,0.0,0.0
4,0.767585,0.107784,-0.353016,-1.634055,-3.142936,-1.774286,-0.370667,1.313069,4.881878,-1.036576,1,1.0,0.0,0.0


In [4]:
# New feature: interaction term, the element-wise product of feature_0 and feature_1.
df_encoded['nova_feature'] = df_encoded['feature_0'].mul(df_encoded['feature_1'])
df_encoded.loc[:, ['feature_0', 'feature_1', 'nova_feature']].head()


Unnamed: 0,feature_0,feature_1,nova_feature
0,-0.346347,0.615105,-0.21304
1,1.353891,-2.552883,-3.456326
2,0.136336,2.473938,0.337287
3,-1.471314,-1.610811,2.370009
4,0.767585,0.107784,0.082734


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Feature matrix for PCA: every column of the engineered frame except the label.
X_final = df_encoded.drop(columns='target')

# PCA centers the data but does not rescale it, so columns with a larger variance
# would dominate the components. Standardize all columns first so each one
# contributes on a comparable scale.
X_final_scaled = StandardScaler().fit_transform(X_final)

# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_final_scaled)

# Scatter plot of the 2-D projection, colored by the class label.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=df_encoded['target'], cmap='coolwarm')
ax.set_title("PCA - Redução de Dimensionalidade")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()
