In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from scipy.stats import skew

In [3]:

"""
Feature Engineering for rainfall trend analysis in Eastern Nepal.
This script loads preprocessed data, creates new features, transforms features,
encodes categorical variables, selects the best features, and optionally applies
dimensionality reduction.
"""



# Define file paths
PREPROCESSED_PATH = '../Data/Preprocessed'
OUTPUT_PATH = '../Outputs'

# Ensure output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

def load_data():
    """Load preprocessed training data."""
    try:
        data = pd.read_csv(os.path.join(PREPROCESSED_PATH, 'train_data.csv'))
        data['date'] = pd.to_datetime(data['date'])
        print("Preprocessed data loaded successfully.")
        return data
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Error: {e}. Check if train_data.csv exists in {PREPROCESSED_PATH}")

def create_new_features(data):
    """Create new features for trend analysis."""
    print("\n--- Creating New Features ---")
    # Yearly total rainfall per station
    data['yearly_rainfall'] = data.groupby(['station_name_x', 'year'])['rainfall_sum'].transform('sum')
    
    # Monthly total rainfall per station
    data['monthly_rainfall'] = data.groupby(['station_name_x', 'year', 'month'])['rainfall_sum'].transform('sum')
    
    # Extreme rainfall indicator (>50 mm daily, typical threshold for heavy rain)
    data['extreme_rainfall'] = (data['rainfall_sum'] > 50).astype(int)
    
    # Previous day's rainfall (lagged feature)
    data = data.sort_values(['station_name_x', 'date'])
    data['prev_day_rainfall'] = data.groupby('station_name_x')['rainfall_sum'].shift(1)
    data['prev_day_rainfall'] = data['prev_day_rainfall'].fillna(0)
    
    # Rolling mean of rainfall (7-day window)
    data['rolling_mean_7d'] = data.groupby('station_name_x')['rainfall_sum'].rolling(window=7, min_periods=1).mean().reset_index(level=0, drop=True)
    
    # Day of year for seasonal patterns
    data['day_of_year'] = data['date'].dt.dayofyear
    
    print("New features created: yearly_rainfall, monthly_rainfall, extreme_rainfall, prev_day_rainfall, rolling_mean_7d, day_of_year")
    return data

def transform_features(data):
    """Transform numerical features to handle skewness and scale."""
    print("\n--- Transforming Features ---")
    numerical_cols = ['rainfall_sum', 'yearly_rainfall', 'monthly_rainfall', 'prev_day_rainfall', 'rolling_mean_7d', 'ele(meter)', 'lat(deg)', 'lon(deg)', 'day_of_year']
    
    # Log-transform skewed features
    for col in numerical_cols:
        if col in data.columns:
            # Check skewness
            skewness = skew(data[col].dropna())
            if skewness > 1:  # Highly positively skewed
                data[f'log_{col}'] = np.log1p(data[col])
                print(f"Log-transformed {col} (skewness: {skewness:.2f})")
    
    # Standardize numerical features
    scaler = StandardScaler()
    scaled_cols = [col for col in numerical_cols if col in data.columns] + [f'log_{col}' for col in numerical_cols if f'log_{col}' in data.columns]
    if scaled_cols:
        data[scaled_cols] = scaler.fit_transform(data[scaled_cols])
        print("Standardized numerical features:", scaled_cols)
    
    return data

def encode_categorical_features(data):
    """Encode categorical features."""
    print("\n--- Encoding Categorical Features ---")
    categorical_cols = ['station_name_x', 'district']
    encoders = {}
    
    for col in categorical_cols:
        if col in data.columns:
            le = LabelEncoder()
            data[f'{col}_encoded'] = le.fit_transform(data[col])
            encoders[col] = le
            print(f"Encoded {col} into {col}_encoded")
    
    # Save encoders for future use
    import pickle
    with open(os.path.join(OUTPUT_PATH, 'label_encoders.pkl'), 'wb') as f:
        pickle.dump(encoders, f)
    
    return data

def select_best_features(data, target='rainfall_sum', k=10):
    """Select the best features using statistical tests."""
    print("\n--- Selecting Best Features ---")
    # Features to consider (exclude target and non-numeric columns)
    feature_cols = [col for col in data.columns if col not in [target, 'date', 'station_name_x', 'district'] and data[col].dtype in ['int64', 'float64']]
    
    if not feature_cols:
        print("No valid features for selection.")
        return data, []
    
    X = data[feature_cols].fillna(0)
    y = data[target].fillna(0)
    
    # Use SelectKBest with f_regression
    selector = SelectKBest(score_func=f_regression, k=min(k, len(feature_cols)))
    selector.fit(X, y)
    
    # Get selected features
    selected_features = X.columns[selector.get_support()].tolist()
    scores = pd.DataFrame({'Feature': feature_cols, 'Score': selector.scores_})
    scores = scores.sort_values(by='Score', ascending=False)
    
    print("Selected features:", selected_features)
    print("\nFeature scores:\n", scores)
    
    # Save feature scores
    scores.to_csv(os.path.join(OUTPUT_PATH, 'feature_scores.csv'), index=False)
    
    return data, selected_features

def dimensionality_reduction(data, selected_features, n_components=3):
    """Apply PCA for dimensionality reduction (optional)."""
    print("\n--- Dimensionality Reduction (PCA) ---")
    if not selected_features:
        print("No features selected for PCA.")
        return data
    
    X = data[selected_features].fillna(0)
    
    # Apply PCA
    pca = PCA(n_components=min(n_components, len(selected_features)))
    pca_result = pca.fit_transform(X)
    
    # Add PCA components to data
    for i in range(pca_result.shape[1]):
        data[f'pca_component_{i+1}'] = pca_result[:, i]
    
    # Save explained variance
    explained_variance = pca.explained_variance_ratio_
    print(f"PCA explained variance ratios: {explained_variance}")
    with open(os.path.join(OUTPUT_PATH, 'pca_results.txt'), 'w') as f:
        f.write(f"Explained Variance Ratios: {explained_variance}\n")
        f.write(f"Total Explained Variance: {sum(explained_variance):.4f}\n")
    
    return data

def save_data(data):
    """Save feature-engineered data."""
    output_file = os.path.join(PREPROCESSED_PATH, 'feature_engineered_data.csv')
    data.to_csv(output_file, index=False)
    print(f"Feature-engineered data saved to {output_file}")

def main():
    """Main function to execute feature engineering steps."""
    # Load data
    data = load_data()
    
    # Create new features
    data = create_new_features(data)
    
    # Transform features
    data = transform_features(data)
    
    # Encode categorical features
    data = encode_categorical_features(data)
    
    # Select best features
    data, selected_features = select_best_features(data)
    
    # Dimensionality reduction (optional)
    data = dimensionality_reduction(data, selected_features)
    
    # Save feature-engineered data
    save_data(data)
    
    print("Feature engineering completed successfully.")

if __name__ == "__main__":
    main()

Preprocessed data loaded successfully.

--- Creating New Features ---
New features created: yearly_rainfall, monthly_rainfall, extreme_rainfall, prev_day_rainfall, rolling_mean_7d, day_of_year

--- Transforming Features ---
Log-transformed rainfall_sum (skewness: 5.89)
Log-transformed monthly_rainfall (skewness: 2.04)
Log-transformed prev_day_rainfall (skewness: 5.89)
Log-transformed rolling_mean_7d (skewness: 3.34)
Standardized numerical features: ['rainfall_sum', 'yearly_rainfall', 'monthly_rainfall', 'prev_day_rainfall', 'rolling_mean_7d', 'ele(meter)', 'lat(deg)', 'lon(deg)', 'day_of_year', 'log_rainfall_sum', 'log_monthly_rainfall', 'log_prev_day_rainfall', 'log_rolling_mean_7d']

--- Encoding Categorical Features ---
Encoded station_name_x into station_name_x_encoded

--- Selecting Best Features ---
Selected features: ['gsid', 'yearly_rainfall', 'monthly_rainfall', 'extreme_rainfall', 'prev_day_rainfall', 'rolling_mean_7d', 'log_rainfall_sum', 'log_monthly_rainfall', 'log_prev_da