# Feature Engineering
## Masters in AI & ML Project

This notebook focuses on feature creation, selection, and transformation.

In [None]:
# Import libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries imported above — consider narrowing it
# to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

print('âœ“ Libraries imported successfully')

## 1. Load Processed Data

In [None]:
# Load your dataset
# Placeholder location of the real project data. It is not read in this cell;
# swap the synthetic block below for `pd.read_csv(data_path)` when available.
data_path = '../data/raw/your_dataset.csv'

# For demonstration: a seeded synthetic classification dataset.
# `make_classification` is imported with the other libraries at the top of
# the notebook so that all dependencies are visible in one place.
X, y = make_classification(
    n_samples=1000,
    n_features=15,
    n_informative=10,
    n_redundant=3,
    n_repeated=2,
    random_state=42,
)

# Derive the column names from the generated matrix so the feature count is
# stated only once (in the call above).
df = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(X.shape[1])])
df['Target'] = y

print(f'Dataset loaded: {df.shape}')
df.head()

## 2. Create New Features

In [None]:
# Feature Engineering Examples
# Customize based on your domain and data

# Smallest denominator magnitude allowed when building ratio features.
RATIO_EPS = 1e-6

# Example 1: Interaction features
df['Feature_1x2'] = df['Feature_1'] * df['Feature_2']

# Example 2: Polynomial features
df['Feature_1_squared'] = df['Feature_1'] ** 2

# Example 3: Ratio features
# Adding a positive epsilon only protects against zero when the denominator
# is non-negative: a value close to -RATIO_EPS would be shifted *onto* zero.
# Instead, clamp the magnitude to at least RATIO_EPS while keeping the sign,
# and leave every other value untouched.
denominator = df['Feature_2'].to_numpy()
signed_floor = np.where(denominator < 0, -RATIO_EPS, RATIO_EPS)
safe_denominator = np.where(np.abs(denominator) < RATIO_EPS, signed_floor, denominator)
df['Feature_1_2_ratio'] = df['Feature_1'] / safe_denominator

# Example 4: Binning continuous features
df['Feature_1_binned'] = pd.cut(df['Feature_1'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

# List the engineered columns explicitly. Detecting them by substring
# (e.g. any name containing "x") would also match unrelated columns in a
# real dataset.
new_feature_cols = ['Feature_1x2', 'Feature_1_squared', 'Feature_1_2_ratio', 'Feature_1_binned']

print(f'\nâœ“ New features created. Dataset shape: {df.shape}')
print(f'\nNew columns: {new_feature_cols}')

## 3. Feature Selection

In [None]:
# Split the frame into a numeric feature matrix (X) and the label vector (y)
target_col = 'Target'

# Non-numeric columns are excluded first; the label is then dropped from
# what remains.
numeric_columns = df.select_dtypes(include=[np.number])
X = numeric_columns.drop(columns=[target_col])
y = df[target_col]

n_candidate_features = X.shape[1]
class_counts = y.value_counts().to_dict()
print(f'Features for selection: {n_candidate_features}')
print(f'Target distribution: {class_counts}')

In [None]:
# Method 1: Statistical Test (ANOVA F-value)
# Cap k at the number of available columns: depending on the scikit-learn
# version, asking SelectKBest for more features than exist either raises or
# only emits a warning (which this notebook suppresses).
top_k = min(10, X.shape[1])
selector = SelectKBest(score_func=f_classif, k=top_k)
selector.fit(X, y)

# Get feature scores, best first
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'F-Score': selector.scores_
}).sort_values('F-Score', ascending=False)

top_by_f_score = feature_scores.head(top_k)

print(f'\nðŸ“Š Top {top_k} Features by F-Score:')
print(top_by_f_score)

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_by_f_score['Feature'], top_by_f_score['F-Score'])
ax.set_xlabel('F-Score')
ax.set_title(f'Top {top_k} Features by ANOVA F-Score')
ax.invert_yaxis()  # highest score at the top of the chart
fig.tight_layout()
plt.show()

In [None]:
# Method 2: Mutual Information
# Seeded so the estimated scores are the same on every run
mi_scores = mutual_info_classif(X, y, random_state=42)

# One row per feature, ranked from highest to lowest score
mi_table = pd.DataFrame({'Feature': X.columns, 'MI-Score': mi_scores})
mi_feature_scores = mi_table.sort_values('MI-Score', ascending=False)

top_by_mi = mi_feature_scores.head(10)

print('\nðŸ“Š Top 10 Features by Mutual Information:')
print(top_by_mi)

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_by_mi['Feature'], top_by_mi['MI-Score'])
ax.set_xlabel('Mutual Information Score')
ax.set_title('Top 10 Features by Mutual Information')
ax.invert_yaxis()  # highest score at the top of the chart
fig.tight_layout()
plt.show()

## 4. Feature Scaling

In [None]:
# Fit both scalers on the full feature matrix:
#   StandardScaler -> zero mean, unit variance
#   MinMaxScaler   -> 0-1 range
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
X_scaled_standard = scaler_standard.fit_transform(X)
X_scaled_minmax = scaler_minmax.fit_transform(X)

# Compare scaling methods on the first five columns, one panel per variant
preview_labels = X.columns[:5]
panels = [
    ('Original Features', X.iloc[:, :5].values),
    ('StandardScaler', X_scaled_standard[:, :5]),
    ('MinMaxScaler', X_scaled_minmax[:, :5]),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (panel_title, panel_values) in zip(axes, panels):
    ax.boxplot(panel_values)
    ax.set_title(panel_title)
    ax.set_xticklabels(preview_labels, rotation=45)

plt.tight_layout()
plt.show()

print('\nâœ“ Feature scaling completed')

## 5. Dimensionality Reduction (PCA)

In [None]:
# Apply PCA
# No component limit is passed, so the whole variance spectrum is available
pca = PCA()
X_pca = pca.fit_transform(X_scaled_standard)

# Per-component and running-total share of explained variance
explained_var = pca.explained_variance_ratio_
cumulative_var = np.cumsum(explained_var)
component_numbers = range(1, len(explained_var) + 1)

# Visualize: scree plot on the left, cumulative curve on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(component_numbers, explained_var, 'bo-')
ax1.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Scree Plot',
)
ax1.grid(alpha=0.3)

ax2.plot(component_numbers, cumulative_var, 'ro-')
ax2.axhline(y=0.95, color='g', linestyle='--', label='95% Variance')
ax2.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
    title='Cumulative Explained Variance',
)
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Find number of components for 95% variance:
# argmax on a boolean array gives the index of the first True entry,
# and +1 converts that 0-based index into a component count.
reaches_target = cumulative_var >= 0.95
n_components_95 = np.argmax(reaches_target) + 1
print(f'\nðŸ“Š Components needed for 95% variance: {n_components_95} out of {len(explained_var)}')

In [None]:
# Project the samples onto PC1/PC2 and colour each point by its target value
pc1_share = explained_var[0]
pc2_share = explained_var[1]

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.6)
ax.set_xlabel(f'PC1 ({pc1_share:.2%} variance)')
ax.set_ylabel(f'PC2 ({pc2_share:.2%} variance)')
ax.set_title('First Two Principal Components')
fig.colorbar(scatter, ax=ax, label='Target')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Save Processed Features

In [None]:
# Create final feature set
# Select top features based on your analysis (here: the ten best F-scores;
# `head` simply returns fewer rows if fewer features were scored)
top_features = feature_scores['Feature'].head(10).tolist()

# Copy so later edits to df_final never write back into df
df_final = df[top_features + [target_col]].copy()

print(f'\nâœ“ Final dataset shape: {df_final.shape}')
print(f'\nSelected features: {top_features}')

# Save to processed directory
output_path = '../data/processed/processed_features.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(output_path, index=False)
print(f'\nâœ“ Processed features saved to: {output_path}')

## 7. Summary

In [None]:
# Recap of the key numbers produced by the cells above
print('\n' + '='*60)
print('FEATURE ENGINEERING SUMMARY')
print('='*60)
# X was rebuilt in the feature-selection section from every numeric column
# except the target, so it contains the engineered numeric features as well
# as the original ones. Label the count accordingly instead of "Original".
print(f'\nðŸ“Š Candidate Features (original + engineered): {X.shape[1]}')
print(f'ðŸŽ¯ Selected Features: {len(top_features)}')
print(f'ðŸ“‰ Dimensionality Reduction: {n_components_95} components for 95% variance')

print('\n' + '='*60)
print('NEXT STEPS')
print('='*60)
print('1. Use processed features for model training')
print('2. Experiment with different feature combinations')
print('3. Consider domain-specific feature engineering')
print('4. Test model performance with selected features')
print('\nâœ“ Feature Engineering Complete!')