# Economic Downturn Detector: Feature Engineering

This notebook prepares the data for modeling: it engineers features (including handling missing values and creating lag variables), normalizes them, reduces dimensionality with PCA, and saves the processed datasets.

In [None]:
# Import notebook utilities
from notebook_utils import (
    # Setup functions
    setup_notebook, load_data, display_data_info, save_figure,
    
    # Import from econ_downturn package
    engineer_features, normalize_data, apply_pca
)

# Import other libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display

# Set up the notebook environment using the project helper imported from notebook_utils.
# NOTE(review): what exactly gets configured (paths, plotting style, display options)
# is defined in notebook_utils and is not visible here — check there before relying on it.
setup_notebook()

## 1. Load the Merged Dataset

First, let's load the merged dataset created in the data exploration notebook.

In [None]:
# Load the merged dataset using the utility function.
# use_cached=True presumably reuses a previously saved copy instead of rebuilding
# it from the raw sources — TODO confirm against notebook_utils.load_data.
merged_data = load_data(use_cached=True)

# Display summary information about the loaded dataset (format is defined by the helper)
display_data_info(merged_data)

## 2. Feature Engineering

Let's perform feature engineering on the dataset to prepare it for modeling.

In [None]:
# Build the modeling features from the merged dataset with the package helper
data_with_features = engineer_features(merged_data)

# Report the resulting dimensions; note the column count includes every column
# of the engineered frame, not only predictors
print(
    f"Data shape after feature engineering: {data_with_features.shape}",
    f"Number of features: {data_with_features.shape[1]}",
    sep="\n",
)

# Preview the leading rows of the engineered data
features_preview = data_with_features.head()
display(features_preview)

## 3. Normalize the Data

Let's normalize the features to ensure they are on the same scale for modeling.

In [None]:
# Scale the engineered features with the package helper; it returns the scaled
# data plus a second object that is kept as `scaler`.
# NOTE(review): confirm normalize_data leaves the 'recession' target unscaled,
# since that column is later used as the class label.
data_normalized, scaler = normalize_data(data_with_features)

print(f"Data shape after normalization: {data_normalized.shape}")

# Preview the leading rows of the normalized data
normalized_preview = data_normalized.head()
display(normalized_preview)

## 4. Apply Principal Component Analysis (PCA)

Let's apply PCA to reduce dimensionality and handle multicollinearity.

In [None]:
# Separate features and target so the recession label is not fed into PCA
X = data_normalized.drop(columns=['recession'])
y = data_normalized['recession']

# Apply PCA through the package helper.
# NOTE(review): n_components=0.95 presumably means "keep enough components to
# explain 95% of the variance" (scikit-learn's convention) — confirm in apply_pca.
X_pca, pca, explained_variance = apply_pca(X, n_components=0.95)

# Create a DataFrame with PCA components, keeping the original row index
pca_cols = [f'PC{i+1}' for i in range(X_pca.shape[1])]
X_pca_df = pd.DataFrame(X_pca, columns=pca_cols, index=X.index)

# Add back the recession indicator (aligned on the shared index)
X_pca_df['recession'] = y

print(f"Data shape after PCA: {X_pca_df.shape}")
print(f"Number of PCA components: {X_pca.shape[1]}")
print(f"Cumulative explained variance: {explained_variance:.4f}")

# Plot per-component and cumulative explained variance on an explicit figure/axes
variance_ratio = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratio) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(component_numbers, variance_ratio, label='Individual')
ax.plot(component_numbers, np.cumsum(variance_ratio), 'r-', label='Cumulative')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('Explained Variance by Principal Components')
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()

# Save BEFORE showing: with the inline backend the figure is closed once it has
# been displayed, so plt.gcf() afterwards would return a new, empty figure and
# the saved image would be blank. Passing `fig` explicitly avoids that.
save_figure(fig, "pca_explained_variance.png")

plt.show()

## 5. Save the Processed Data

Let's save the processed datasets for modeling.

In [None]:
# Get output paths from the package configuration.
# NOTE(review): this import is kept in this cell so the cell works on its own;
# consider moving it to the import cell at the top of the notebook.
from econ_downturn import get_output_paths
output_paths = get_output_paths()
output_dir = output_paths['data_dir']

# Make sure the target directory exists so to_csv does not fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

# Destination files for each processing stage
data_path = os.path.join(output_dir, 'data_with_features.csv')
normalized_path = os.path.join(output_dir, 'data_normalized.csv')
pca_path = os.path.join(output_dir, 'data_pca.csv')

# Write each dataset (index included, as to_csv does by default) and report where it went
datasets_to_save = (
    ("dataset with features", data_with_features, data_path),
    ("normalized dataset", data_normalized, normalized_path),
    ("PCA dataset", X_pca_df, pca_path),
)
for description, frame, path in datasets_to_save:
    frame.to_csv(path)
    print(f"Saved {description} to {path}")

## 6. Next Steps

Based on the feature engineering, the next steps would be:

1. Apply Multiple Discriminant Analysis (MDA) to identify the most significant predictors of recessions
2. Evaluate the model's performance in classifying recessionary and non-recessionary periods
3. Interpret the results and identify the most important economic indicators for predicting recessions