# Feature Selection Analysis for Greenhouse Gas Emissions Prediction

This notebook analyzes and identifies the best features for predicting Scope 1 and Scope 2 greenhouse gas emissions using various feature selection techniques.

## Objectives:
- Load and engineer features from the FitchGroup Codeathon dataset
- Apply multiple feature selection methods
- Compare and rank features by importance
- Optimize feature sets for model performance

## 1. Import Required Libraries

In [1]:
# Import necessary libraries for data analysis and feature selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning and Feature Selection
from sklearn.feature_selection import (
    SelectKBest, f_regression, mutual_info_regression,
    RFE, RFECV, VarianceThreshold
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

# Set visualization style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Load and Engineer Features from Dataset

In [2]:
# Load all datasets
print("Loading datasets...")
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sector_df = pd.read_csv('data/revenue_distribution_by_sector.csv')
env_activities_df = pd.read_csv('data/environmental_activities.csv')
sdg_df = pd.read_csv('data/sustainable_development_goals.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sector data shape: {sector_df.shape}")
print(f"Environmental activities shape: {env_activities_df.shape}")
print(f"SDG data shape: {sdg_df.shape}")

# Display basic info about the target variables
print("\nTarget Variables Statistics:")
print(f"Scope 1 emissions - Mean: {train_df['target_scope_1'].mean():.2f}, Std: {train_df['target_scope_1'].std():.2f}")
print(f"Scope 2 emissions - Mean: {train_df['target_scope_2'].mean():.2f}, Std: {train_df['target_scope_2'].std():.2f}")
print(f"Zero Scope 2 values: {(train_df['target_scope_2'] == 0).sum()}/{len(train_df)} ({(train_df['target_scope_2'] == 0).mean()*100:.1f}%)")

Loading datasets...
Training data shape: (429, 12)
Test data shape: (49, 10)
Sector data shape: (799, 6)
Environmental activities shape: (355, 4)
SDG data shape: (165, 3)

Target Variables Statistics:
Scope 1 emissions - Mean: 55745.65, Std: 110535.54
Scope 2 emissions - Mean: 57434.75, Std: 177116.18
Zero Scope 2 values: 13/429 (3.0%)


In [3]:
# Comprehensive Feature Engineering
def create_comprehensive_features(train_data, sector_data, env_data, sdg_data):
    """Create a comprehensive set of engineered features"""
    
    # Start with base features
    features = train_data.copy()
    
    # 1. Geographic Features (One-hot encoding)
    region_dummies = pd.get_dummies(features['region_code'], prefix='region')
    features = pd.concat([features, region_dummies], axis=1)
    
    # Country diversity (simplified)
    country_dummies = pd.get_dummies(features['country_code'], prefix='country')
    features = pd.concat([features, country_dummies], axis=1)
    
    # 2. Revenue-based features
    features['log_revenue'] = np.log1p(features['revenue'])
    features['revenue_millions'] = features['revenue'] / 1e6
    features['revenue_squared'] = features['revenue'] ** 2
    
    # 3. Sustainability score interactions
    features['env_gov_interaction'] = features['environmental_score'] * features['governance_score']
    features['overall_env_ratio'] = features['overall_score'] / features['environmental_score']
    features['weighted_sustainability'] = (
        0.45 * features['environmental_score'] + 
        0.30 * features['social_score'] + 
        0.25 * features['governance_score']
    )
    
    # 4. Sector-based features
    sector_pivot = sector_data.pivot_table(
        values='revenue_pct',
        index='entity_id',
        columns='nace_level_1_code',
        aggfunc='sum',
        fill_value=0
    ).add_prefix('sector_')
    
    # Sector diversity metrics
    sector_counts = sector_data.groupby('entity_id').size().rename('sector_diversity')
    sector_max_pct = sector_data.groupby('entity_id')['revenue_pct'].max().rename('max_sector_concentration')
    sector_entropy = sector_data.groupby('entity_id')['revenue_pct'].apply(
        lambda x: -np.sum(x * np.log(x + 1e-10))
    ).rename('sector_entropy')
    
    # 5. Environmental activities features
    env_agg = env_data.groupby('entity_id').agg({
        'env_score_adjustment': ['sum', 'mean', 'count', 'std']
    }).fillna(0)
    env_agg.columns = ['env_adj_sum', 'env_adj_mean', 'env_activities_count', 'env_adj_std']
    
    # Environmental activity types
    env_types = env_data.groupby('entity_id')['activity_type'].nunique().rename('env_activity_types')
    
    # 6. SDG features
    sdg_agg = sdg_data.groupby('entity_id').agg({
        'sdg_id': ['count', 'nunique']
    }).fillna(0)
    sdg_agg.columns = ['sdg_commitments', 'unique_sdgs']
    
    # Climate-related SDGs (6, 7, 13, 14, 15)
    climate_sdgs = sdg_data[sdg_data['sdg_id'].isin([6, 7, 13, 14, 15])]
    climate_sdg_count = climate_sdgs.groupby('entity_id').size().rename('climate_sdg_count')
    
    # Merge all features
    features = features.merge(sector_pivot, left_on='entity_id', right_index=True, how='left')
    features = features.merge(sector_counts, left_on='entity_id', right_index=True, how='left')
    features = features.merge(sector_max_pct, left_on='entity_id', right_index=True, how='left')
    features = features.merge(sector_entropy, left_on='entity_id', right_index=True, how='left')
    features = features.merge(env_agg, left_on='entity_id', right_index=True, how='left')
    features = features.merge(env_types, left_on='entity_id', right_index=True, how='left')
    features = features.merge(sdg_agg, left_on='entity_id', right_index=True, how='left')
    features = features.merge(climate_sdg_count, left_on='entity_id', right_index=True, how='left')
    
    # Fill missing values
    features = features.fillna(0)
    
    return features

# Create comprehensive feature set
print("Creating comprehensive feature set...")
feature_data = create_comprehensive_features(train_df, sector_df, env_activities_df, sdg_df)

print(f"Total features created: {feature_data.shape[1]}")
print(f"Feature columns: {len([col for col in feature_data.columns if col not in ['entity_id', 'target_scope_1', 'target_scope_2']])}")

# Display feature summary
print("\nFeature categories created:")
print(f"- Geographic features: {len([col for col in feature_data.columns if col.startswith(('region_', 'country_'))])}")
print(f"- Sector features: {len([col for col in feature_data.columns if col.startswith('sector_')])}")
print(f"- Environmental features: {len([col for col in feature_data.columns if col.startswith('env_')])}")
print(f"- SDG features: {len([col for col in feature_data.columns if col.startswith(('sdg_', 'climate_'))])}")
print(f"- Revenue features: {len([col for col in feature_data.columns if 'revenue' in col])}")
print(f"- Sustainability features: {len([col for col in feature_data.columns if any(x in col for x in ['score', 'sustainability'])])}")

Creating comprehensive feature set...
Total features created: 84
Feature columns: 81

Feature categories created:
- Geographic features: 39
- Sector features: 22
- Environmental features: 6
- SDG features: 2
- Revenue features: 4
- Sustainability features: 5


In [4]:
# Prepare feature matrix and targets
X = feature_data.drop(['entity_id', 'target_scope_1', 'target_scope_2', 'region_name', 'country_name'], axis=1, errors='ignore')
y_scope1 = feature_data['target_scope_1']
y_scope2 = feature_data['target_scope_2']

# Remove non-numeric columns
numeric_cols = X.select_dtypes(include=[np.number]).columns
X = X[numeric_cols]

print(f"Final feature matrix shape: {X.shape}")
print(f"Features selected: {len(X.columns)}")

# Display first few feature names
print(f"\nFirst 20 features: {list(X.columns[:20])}")
print(f"Last 10 features: {list(X.columns[-10:])}")

Final feature matrix shape: (429, 42)
Features selected: 42

First 20 features: ['revenue', 'overall_score', 'environmental_score', 'social_score', 'governance_score', 'log_revenue', 'revenue_millions', 'revenue_squared', 'env_gov_interaction', 'overall_env_ratio', 'weighted_sustainability', 'sector_A', 'sector_B', 'sector_C', 'sector_D', 'sector_E', 'sector_F', 'sector_G', 'sector_H', 'sector_I']
Last 10 features: ['max_sector_concentration', 'sector_entropy', 'env_adj_sum', 'env_adj_mean', 'env_activities_count', 'env_adj_std', 'env_activity_types', 'sdg_commitments', 'unique_sdgs', 'climate_sdg_count']


## 3. Univariate Feature Selection

In [5]:
# Univariate Feature Selection using F-regression and Mutual Information
def perform_univariate_selection(X, y, target_name, k=20):
    """Perform univariate feature selection using multiple methods"""
    
    # F-regression test
    f_selector = SelectKBest(score_func=f_regression, k=k)
    X_f_selected = f_selector.fit_transform(X, y)
    f_scores = f_selector.scores_
    f_selected_features = X.columns[f_selector.get_support()]
    
    # Mutual Information
    mi_scores = mutual_info_regression(X, y, random_state=42)
    mi_ranking = np.argsort(mi_scores)[::-1]
    mi_selected_features = X.columns[mi_ranking[:k]]
    
    # Create results dataframe
    results = pd.DataFrame({
        'feature': X.columns,
        'f_score': f_scores,
        'mi_score': mi_scores,
        'f_selected': X.columns.isin(f_selected_features),
        'mi_selected': X.columns.isin(mi_selected_features)
    }).sort_values('f_score', ascending=False)
    
    return results, f_selected_features, mi_selected_features

# Apply univariate selection for both targets
print("Performing Univariate Feature Selection...")
print("\n" + "="*50)
print("SCOPE 1 EMISSIONS")
print("="*50)

univariate_scope1, f_features_s1, mi_features_s1 = perform_univariate_selection(X, y_scope1, "Scope 1", k=15)
print(f"Top 15 features by F-regression for Scope 1:")
print(univariate_scope1.head(15)[['feature', 'f_score']].to_string(index=False))

print(f"\nTop 15 features by Mutual Information for Scope 1:")
print(univariate_scope1.nlargest(15, 'mi_score')[['feature', 'mi_score']].to_string(index=False))

print("\n" + "="*50)
print("SCOPE 2 EMISSIONS")
print("="*50)

univariate_scope2, f_features_s2, mi_features_s2 = perform_univariate_selection(X, y_scope2, "Scope 2", k=15)
print(f"Top 15 features by F-regression for Scope 2:")
print(univariate_scope2.head(15)[['feature', 'f_score']].to_string(index=False))

print(f"\nTop 15 features by Mutual Information for Scope 2:")
print(univariate_scope2.nlargest(15, 'mi_score')[['feature', 'mi_score']].to_string(index=False))

Performing Univariate Feature Selection...

SCOPE 1 EMISSIONS
Top 15 features by F-regression for Scope 1:
                feature   f_score
            log_revenue 34.916179
               sector_C 21.959245
                revenue 15.810581
       revenue_millions 15.810581
               sector_J 15.627018
    environmental_score  6.208024
       governance_score  5.523411
               sector_B  5.295539
               sector_K  4.681352
               sector_A  4.050341
               sector_M  2.764434
               sector_D  2.375778
      overall_env_ratio  2.292593
weighted_sustainability  2.218115
               sector_R  2.217086

Top 15 features by Mutual Information for Scope 1:
                feature  mi_score
            log_revenue  0.148033
    environmental_score  0.131114
                revenue  0.079880
       revenue_millions  0.079879
               sector_J  0.078568
            env_adj_sum  0.066414
           env_adj_mean  0.056709
        revenue_squared  

## 4. Tree-Based Feature Importance

In [6]:
# Random Forest Feature Importance
def get_tree_importance(X, y, target_name, n_estimators=100):
    """Get feature importance using Random Forest"""
    
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=42, n_jobs=-1)
    rf.fit(X, y)
    
    # Get feature importance
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    return importance_df, rf

print("Random Forest Feature Importance Analysis...")
print("\n" + "="*50)
print("SCOPE 1 EMISSIONS - TREE IMPORTANCE")
print("="*50)

rf_importance_s1, rf_model_s1 = get_tree_importance(X, y_scope1, "Scope 1")
print("Top 20 features by Random Forest importance for Scope 1:")
print(rf_importance_s1.head(20).to_string(index=False))

print("\n" + "="*50)
print("SCOPE 2 EMISSIONS - TREE IMPORTANCE")
print("="*50)

rf_importance_s2, rf_model_s2 = get_tree_importance(X, y_scope2, "Scope 2")
print("Top 20 features by Random Forest importance for Scope 2:")
print(rf_importance_s2.head(20).to_string(index=False))

# Cross-validation scores for Random Forest models
print("\n" + "="*30)
print("RANDOM FOREST MODEL PERFORMANCE")
print("="*30)

cv_scores_s1 = cross_val_score(rf_model_s1, X, y_scope1, cv=5, scoring='neg_mean_squared_error')
cv_scores_s2 = cross_val_score(rf_model_s2, X, y_scope2, cv=5, scoring='neg_mean_squared_error')

print(f"Scope 1 - CV RMSE: {np.sqrt(-cv_scores_s1.mean()):.2f} (¬±{np.sqrt(-cv_scores_s1).std():.2f})")
print(f"Scope 2 - CV RMSE: {np.sqrt(-cv_scores_s2.mean()):.2f} (¬±{np.sqrt(-cv_scores_s2).std():.2f})")

Random Forest Feature Importance Analysis...

SCOPE 1 EMISSIONS - TREE IMPORTANCE
Top 20 features by Random Forest importance for Scope 1:
                 feature  importance
     environmental_score    0.085806
         revenue_squared    0.084110
                 revenue    0.068648
             log_revenue    0.068187
                sector_C    0.066260
            social_score    0.060765
     env_gov_interaction    0.058677
        revenue_millions    0.056891
       overall_env_ratio    0.052417
                sector_B    0.048584
 weighted_sustainability    0.040829
        governance_score    0.037238
           overall_score    0.033129
                sector_D    0.030142
             env_adj_sum    0.024794
            env_adj_mean    0.023960
max_sector_concentration    0.020677
          sector_entropy    0.015358
                sector_I    0.014935
                sector_A    0.012718

SCOPE 2 EMISSIONS - TREE IMPORTANCE
Top 20 features by Random Forest importance for

## 5. Recursive Feature Elimination (RFE)

In [10]:
# Recursive Feature Elimination with Cross-Validation
def perform_rfe_analysis(X, y, target_name, n_features_to_select=15):
    """Perform RFE with both Linear and Random Forest estimators"""
    
    # RFE with Linear Regression
    lr_estimator = LinearRegression()
    rfe_lr = RFE(estimator=lr_estimator, n_features_to_select=n_features_to_select, step=1)
    rfe_lr.fit(X, y)
    
    # RFE with Random Forest
    rf_estimator = RandomForestRegressor(n_estimators=50, random_state=42)
    rfe_rf = RFE(estimator=rf_estimator, n_features_to_select=n_features_to_select, step=1)
    rfe_rf.fit(X, y)
    
    # RFECV with Random Forest for optimal number of features
    rfecv = RFECV(estimator=RandomForestRegressor(n_estimators=30, random_state=42), 
                  step=1, cv=5, scoring='neg_mean_squared_error', min_features_to_select=5)
    rfecv.fit(X, y)
    
    # Get selected features
    rfe_lr_features = X.columns[rfe_lr.support_]
    rfe_rf_features = X.columns[rfe_rf.support_]
    rfecv_features = X.columns[rfecv.support_]
    
    # Get rankings
    rfe_lr_ranking = pd.DataFrame({
        'feature': X.columns,
        'ranking': rfe_lr.ranking_,
        'selected': rfe_lr.support_
    }).sort_values('ranking')
    
    rfe_rf_ranking = pd.DataFrame({
        'feature': X.columns,
        'ranking': rfe_rf.ranking_,
        'selected': rfe_rf.support_
    }).sort_values('ranking')
    
    return (rfe_lr_features, rfe_rf_features, rfecv_features, 
            rfe_lr_ranking, rfe_rf_ranking, rfecv)

print("Recursive Feature Elimination Analysis...")
print("\n" + "="*50)
print("SCOPE 1 EMISSIONS - RFE ANALYSIS")
print("="*50)

(rfe_lr_s1, rfe_rf_s1, rfecv_s1, lr_rank_s1, rf_rank_s1, rfecv_model_s1) = perform_rfe_analysis(X, y_scope1, "Scope 1")

print(f"RFE with Linear Regression - Selected features ({len(rfe_lr_s1)}):")
print(list(rfe_lr_s1))

print(f"\nRFE with Random Forest - Selected features ({len(rfe_rf_s1)}):")
print(list(rfe_rf_s1))

print(f"\nRFECV with Random Forest - Optimal features ({len(rfecv_s1)}):")
print(f"Optimal number of features: {rfecv_model_s1.n_features_}")
# print(f"Best CV score: {max(rfecv_model_s1.grid_scores_):.2f}")
print(list(rfecv_s1))

print("\n" + "="*50)
print("SCOPE 2 EMISSIONS - RFE ANALYSIS")
print("="*50)

(rfe_lr_s2, rfe_rf_s2, rfecv_s2, lr_rank_s2, rf_rank_s2, rfecv_model_s2) = perform_rfe_analysis(X, y_scope2, "Scope 2")

print(f"RFE with Linear Regression - Selected features ({len(rfe_lr_s2)}):")
print(list(rfe_lr_s2))

print(f"\nRFE with Random Forest - Selected features ({len(rfe_rf_s2)}):")
print(list(rfe_rf_s2))

print(f"\nRFECV with Random Forest - Optimal features ({len(rfecv_s2)}):")
print(f"Optimal number of features: {rfecv_model_s2.n_features_}")
# print(f"Best CV score: {max(rfecv_model_s2.grid_scores_):.2f}")
print(list(rfecv_s2))

Recursive Feature Elimination Analysis...

SCOPE 1 EMISSIONS - RFE ANALYSIS
RFE with Linear Regression - Selected features (15):
['revenue', 'revenue_squared', 'env_gov_interaction', 'overall_env_ratio', 'sector_C', 'sector_G', 'sector_K', 'sector_M', 'sector_Q', 'sector_diversity', 'max_sector_concentration', 'sector_entropy', 'env_activities_count', 'sdg_commitments', 'unique_sdgs']

RFE with Random Forest - Selected features (15):
['revenue', 'overall_score', 'environmental_score', 'social_score', 'governance_score', 'log_revenue', 'revenue_millions', 'revenue_squared', 'env_gov_interaction', 'overall_env_ratio', 'weighted_sustainability', 'sector_B', 'sector_C', 'sector_entropy', 'env_adj_sum']

RFECV with Random Forest - Optimal features (30):
Optimal number of features: 30
['revenue', 'overall_score', 'environmental_score', 'social_score', 'governance_score', 'log_revenue', 'revenue_millions', 'revenue_squared', 'env_gov_interaction', 'overall_env_ratio', 'weighted_sustainability

## 6. Correlation Analysis and Multicollinearity Detection

In [11]:
# Correlation Analysis
def analyze_correlations(X, y_scope1, y_scope2, correlation_threshold=0.8):
    """Analyze feature correlations and identify highly correlated features"""
    
    # Calculate correlation matrix
    corr_matrix = X.corr()
    
    # Find highly correlated features
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > correlation_threshold:
                high_corr_pairs.append({
                    'feature1': corr_matrix.columns[i],
                    'feature2': corr_matrix.columns[j],
                    'correlation': corr_matrix.iloc[i, j]
                })
    
    # Target correlations
    target_corr_s1 = X.corrwith(y_scope1).sort_values(ascending=False)
    target_corr_s2 = X.corrwith(y_scope2).sort_values(ascending=False)
    
    # Remove NaN correlations
    target_corr_s1 = target_corr_s1.dropna()
    target_corr_s2 = target_corr_s2.dropna()
    
    return corr_matrix, high_corr_pairs, target_corr_s1, target_corr_s2

print("Correlation Analysis...")
print("\n" + "="*60)
print("CORRELATION ANALYSIS")
print("="*60)

corr_matrix, high_corr_pairs, target_corr_s1, target_corr_s2 = analyze_correlations(X, y_scope1, y_scope2)

print(f"Highly correlated feature pairs (|r| > 0.8): {len(high_corr_pairs)}")
if len(high_corr_pairs) > 0:
    print("\nTop 10 highly correlated pairs:")
    high_corr_df = pd.DataFrame(high_corr_pairs).sort_values('correlation', key=abs, ascending=False)
    print(high_corr_df.head(10).to_string(index=False))

print(f"\n" + "="*30)
print("TARGET CORRELATIONS")
print("="*30)

print(f"\nTop 15 features correlated with Scope 1 emissions:")
print("Feature | Correlation")
print("-" * 40)
for feature, corr in target_corr_s1.head(15).items():
    print(f"{feature[:30]:<30} | {corr:8.4f}")

print(f"\nTop 15 features correlated with Scope 2 emissions:")
print("Feature | Correlation")
print("-" * 40)
for feature, corr in target_corr_s2.head(15).items():
    print(f"{feature[:30]:<30} | {corr:8.4f}")

# Variance Threshold - remove low variance features
print(f"\n" + "="*30)
print("LOW VARIANCE FEATURE REMOVAL")
print("="*30)

variance_threshold = VarianceThreshold(threshold=0.01)
X_variance_filtered = variance_threshold.fit_transform(X)
removed_features = X.columns[~variance_threshold.get_support()]

print(f"Features removed due to low variance (< 0.01): {len(removed_features)}")
if len(removed_features) > 0:
    print(f"Removed features: {list(removed_features)}")

print(f"Features remaining after variance filtering: {X_variance_filtered.shape[1]}")

Correlation Analysis...

CORRELATION ANALYSIS
Highly correlated feature pairs (|r| > 0.8): 11

Top 10 highly correlated pairs:
                feature1                feature2  correlation
                 revenue        revenue_millions     1.000000
         sdg_commitments             unique_sdgs     1.000000
           overall_score weighted_sustainability     0.999998
max_sector_concentration          sector_entropy    -0.960845
    env_activities_count      env_activity_types     0.935654
        sector_diversity          sector_entropy     0.900560
             env_adj_sum            env_adj_mean     0.892121
                 revenue         revenue_squared     0.850544
        revenue_millions         revenue_squared     0.850544
     environmental_score weighted_sustainability     0.824911

TARGET CORRELATIONS

Top 15 features correlated with Scope 1 emissions:
Feature | Correlation
----------------------------------------
log_revenue                    |   0.2749
sector_C     

## 7. L1/L2 Regularization Feature Selection

In [12]:
# L1 (Lasso) Regularization for Feature Selection
def lasso_feature_selection(X, y, target_name, alpha_values=[0.1, 0.5, 1.0, 5.0, 10.0]):
    """Use Lasso regression for feature selection"""
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    results = []
    
    for alpha in alpha_values:
        lasso = Lasso(alpha=alpha, random_state=42)
        lasso.fit(X_scaled, y)
        
        # Get non-zero coefficients (selected features)
        selected_features = X.columns[lasso.coef_ != 0]
        n_features = len(selected_features)
        
        # Cross-validation score
        cv_score = cross_val_score(lasso, X_scaled, y, cv=5, scoring='neg_mean_squared_error').mean()
        
        results.append({
            'alpha': alpha,
            'n_features': n_features,
            'cv_rmse': np.sqrt(-cv_score),
            'selected_features': selected_features,
            'coefficients': lasso.coef_[lasso.coef_ != 0]
        })
    
    return results

print("L1 (Lasso) Regularization Feature Selection...")
print("\n" + "="*50)
print("SCOPE 1 EMISSIONS - LASSO SELECTION")
print("="*50)

lasso_results_s1 = lasso_feature_selection(X, y_scope1, "Scope 1")

print("Alpha | Features | CV RMSE")
print("-" * 30)
for result in lasso_results_s1:
    print(f"{result['alpha']:5.1f} | {result['n_features']:8d} | {result['cv_rmse']:7.2f}")

# Best alpha based on CV score
best_alpha_s1 = min(lasso_results_s1, key=lambda x: x['cv_rmse'])
print(f"\nBest alpha for Scope 1: {best_alpha_s1['alpha']} (RMSE: {best_alpha_s1['cv_rmse']:.2f})")
print(f"Selected features ({len(best_alpha_s1['selected_features'])}):")
for i, (feature, coef) in enumerate(zip(best_alpha_s1['selected_features'], best_alpha_s1['coefficients'])):
    print(f"{i+1:2d}. {feature[:35]:<35} (coef: {coef:8.4f})")

print("\n" + "="*50)
print("SCOPE 2 EMISSIONS - LASSO SELECTION")
print("="*50)

lasso_results_s2 = lasso_feature_selection(X, y_scope2, "Scope 2")

print("Alpha | Features | CV RMSE")
print("-" * 30)
for result in lasso_results_s2:
    print(f"{result['alpha']:5.1f} | {result['n_features']:8d} | {result['cv_rmse']:7.2f}")

# Best alpha based on CV score
best_alpha_s2 = min(lasso_results_s2, key=lambda x: x['cv_rmse'])
print(f"\nBest alpha for Scope 2: {best_alpha_s2['alpha']} (RMSE: {best_alpha_s2['cv_rmse']:.2f})")
print(f"Selected features ({len(best_alpha_s2['selected_features'])}):")
for i, (feature, coef) in enumerate(zip(best_alpha_s2['selected_features'], best_alpha_s2['coefficients'])):
    print(f"{i+1:2d}. {feature[:35]:<35} (coef: {coef:8.4f})")

L1 (Lasso) Regularization Feature Selection...

SCOPE 1 EMISSIONS - LASSO SELECTION
Alpha | Features | CV RMSE
------------------------------
  0.1 |       42 | 249230.80
  0.5 |       42 | 249198.34
  1.0 |       42 | 249157.97
  5.0 |       42 | 248823.44
 10.0 |       40 | 247940.94

Best alpha for Scope 1: 10.0 (RMSE: 247940.94)
Selected features (40):
 1. revenue                             (coef: 28022.3750)
 2. overall_score                       (coef: -12532.2274)
 3. environmental_score                 (coef: -5943.1909)
 4. social_score                        (coef: 26574.7940)
 5. governance_score                    (coef: -14967.4568)
 6. log_revenue                         (coef: 26658.0531)
 7. revenue_millions                    (coef: -2120.2085)
 8. revenue_squared                     (coef: -22025.3605)
 9. env_gov_interaction                 (coef: 55718.0150)
10. overall_env_ratio                   (coef: -11002.1266)
11. weighted_sustainability             (coef: 

## 8. Feature Selection Comparison and Final Recommendations

In [13]:
# Feature Selection Summary and Consensus
def create_feature_consensus(X, univariate_s1, rf_importance_s1, best_alpha_s1, rfecv_s1, target_corr_s1, top_n=20):
    """Create consensus ranking from multiple feature selection methods"""
    
    # Initialize scoring dictionary
    feature_scores = {feature: 0 for feature in X.columns}
    
    # Univariate F-test (top 15)
    f_top_features = univariate_s1.head(15)['feature'].tolist()
    for i, feature in enumerate(f_top_features):
        feature_scores[feature] += (15 - i) * 0.2
    
    # Random Forest Importance (top 15)
    rf_top_features = rf_importance_s1.head(15)['feature'].tolist()
    for i, feature in enumerate(rf_top_features):
        feature_scores[feature] += (15 - i) * 0.2
    
    # Lasso selected features
    lasso_features = best_alpha_s1['selected_features']
    for feature in lasso_features:
        feature_scores[feature] += 10 * 0.2
    
    # RFECV selected features
    for feature in rfecv_s1:
        feature_scores[feature] += 10 * 0.2
    
    # Target correlation (top 15)
    corr_top_features = target_corr_s1.abs().head(15).index.tolist()
    for i, feature in enumerate(corr_top_features):
        feature_scores[feature] += (15 - i) * 0.2
    
    # Sort by consensus score
    consensus_ranking = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)
    
    return consensus_ranking[:top_n]

print("FINAL FEATURE SELECTION CONSENSUS")
print("="*80)

# Create consensus for both targets
print("\nüìä SCOPE 1 EMISSIONS - CONSENSUS TOP 20 FEATURES")
print("-" * 60)
consensus_s1 = create_feature_consensus(X, univariate_scope1, rf_importance_s1, best_alpha_s1, rfecv_s1, target_corr_s1)

print("Rank | Feature | Consensus Score")
print("-" * 60)
for i, (feature, score) in enumerate(consensus_s1, 1):
    print(f"{i:4d} | {feature[:35]:<35} | {score:12.2f}")

print("\nüìä SCOPE 2 EMISSIONS - CONSENSUS TOP 20 FEATURES")
print("-" * 60)
consensus_s2 = create_feature_consensus(X, univariate_scope2, rf_importance_s2, best_alpha_s2, rfecv_s2, target_corr_s2)

print("Rank | Feature | Consensus Score")
print("-" * 60)
for i, (feature, score) in enumerate(consensus_s2, 1):
    print(f"{i:4d} | {feature[:35]:<35} | {score:12.2f}")

# Feature overlap analysis
s1_features = set([f[0] for f in consensus_s1[:15]])
s2_features = set([f[0] for f in consensus_s2[:15]])
common_features = s1_features.intersection(s2_features)

print(f"\nüîç FEATURE OVERLAP ANALYSIS")
print("-" * 40)
print(f"Top 15 Scope 1 features: {len(s1_features)}")
print(f"Top 15 Scope 2 features: {len(s2_features)}")
print(f"Common features: {len(common_features)}")
print(f"Overlap percentage: {len(common_features)/15*100:.1f}%")

if len(common_features) > 0:
    print(f"\nCommon important features:")
    for i, feature in enumerate(sorted(common_features), 1):
        print(f"{i:2d}. {feature}")

FINAL FEATURE SELECTION CONSENSUS

üìä SCOPE 1 EMISSIONS - CONSENSUS TOP 20 FEATURES
------------------------------------------------------------
Rank | Feature | Consensus Score
------------------------------------------------------------
   1 | log_revenue                         |        12.40
   2 | sector_C                            |        11.80
   3 | revenue                             |        11.80
   4 | environmental_score                 |        11.20
   5 | revenue_millions                    |        10.40
   6 | sector_B                            |         8.80
   7 | revenue_squared                     |         7.60
   8 | sector_A                            |         7.00
   9 | weighted_sustainability             |         6.80
  10 | sector_D                            |         6.80
  11 | governance_score                    |         6.60
  12 | social_score                        |         6.40
  13 | sector_J                            |         6.20
  14 

In [None]:
# Performance Comparison with Different Feature Sets
def evaluate_feature_sets(X, y_scope1, y_scope2):
    """Evaluate model performance with different feature selection methods"""
    
    print("\nüöÄ MODEL PERFORMANCE COMPARISON")
    print("="*70)
    
    # Define feature sets to test
    feature_sets = {
        'All Features': X.columns.tolist(),
        'Top 15 RF Importance S1': rf_importance_s1.head(15)['feature'].tolist(),
        'Top 15 RF Importance S2': rf_importance_s2.head(15)['feature'].tolist(),
        'Lasso Selected S1': list(best_alpha_s1['selected_features']),
        'Lasso Selected S2': list(best_alpha_s2['selected_features']),
        'RFECV S1': list(rfecv_s1),
        'RFECV S2': list(rfecv_s2),
        'Consensus Top 15 S1': [f[0] for f in consensus_s1[:15]],
        'Consensus Top 15 S2': [f[0] for f in consensus_s2[:15]],
        'Common Features': list(common_features) if len(common_features) >= 5 else [f[0] for f in consensus_s1[:10]]
    }
    
    results = []
    
    for set_name, features in feature_sets.items():
        if len(features) == 0:
            continue
            
        # Select features that exist in X
        valid_features = [f for f in features if f in X.columns]
        if len(valid_features) < 3:  # Need at least 3 features
            continue
            
        X_subset = X[valid_features]
        
        # Random Forest Model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        
        # Cross-validation for Scope 1
        cv_scores_s1 = cross_val_score(rf_model, X_subset, y_scope1, cv=5, scoring='neg_mean_squared_error')
        rmse_s1 = np.sqrt(-cv_scores_s1.mean())
        
        # Cross-validation for Scope 2
        cv_scores_s2 = cross_val_score(rf_model, X_subset, y_scope2, cv=5, scoring='neg_mean_squared_error')
        rmse_s2 = np.sqrt(-cv_scores_s2.mean())
        
        results.append({
            'Feature Set': set_name,
            'N Features': len(valid_features),
            'Scope 1 RMSE': rmse_s1,
            'Scope 2 RMSE': rmse_s2,
            'Combined RMSE': (rmse_s1 + rmse_s2) / 2
        })
    
    # Create results DataFrame
    results_df = pd.DataFrame(results).sort_values('Combined RMSE')
    
    print("Feature Set Performance (Random Forest, 5-Fold CV):")
    print("-" * 70)
    print(f"{'Feature Set':<20} | {'Features':<8} | {'S1 RMSE':<8} | {'S2 RMSE':<8} | {'Avg RMSE':<8}")
    print("-" * 70)
    
    for _, row in results_df.iterrows():
        print(f"{row['Feature Set']:<20} | {row['N Features']:<8d} | {row['Scope 1 RMSE']:<8.2f} | {row['Scope 2 RMSE']:<8.2f} | {row['Combined RMSE']:<8.2f}")
    
    return results_df

# Run performance comparison
performance_results = evaluate_feature_sets(X, y_scope1, y_scope2)

# Best performing feature set
best_set = performance_results.iloc[0]
print(f"\nüèÜ BEST PERFORMING FEATURE SET:")
print(f"Method: {best_set['Feature Set']}")
print(f"Number of features: {best_set['N Features']}")
print(f"Scope 1 RMSE: {best_set['Scope 1 RMSE']:.2f}")
print(f"Scope 2 RMSE: {best_set['Scope 2 RMSE']:.2f}")
print(f"Average RMSE: {best_set['Combined RMSE']:.2f}")

print(f"\nüí° RECOMMENDATIONS:")
print(f"1. Use '{best_set['Feature Set']}' for optimal performance")
print(f"2. Focus on {best_set['N Features']} carefully selected features rather than all {X.shape[1]} features")
print(f"3. Consider ensemble methods combining multiple feature selection approaches")
print(f"4. Revenue-based features and sector information appear to be key predictors")
print(f"5. Environmental scores show strong predictive power for emissions")


üöÄ MODEL PERFORMANCE COMPARISON


## 9. Export Selected Features for Model Training

Based on the analysis above, we'll export the best feature sets for use in your final models.

In [None]:
# Export optimal feature sets for final model training
import json

# Get the best performing feature set from the analysis
best_feature_set_name = best_set['Feature Set']
if best_feature_set_name == 'Consensus Top 15 S1':
    optimal_features_s1 = [f[0] for f in consensus_s1[:15]]
    optimal_features_s2 = [f[0] for f in consensus_s2[:15]]
elif best_feature_set_name == 'Consensus Top 15 S2':
    optimal_features_s1 = [f[0] for f in consensus_s1[:15]]
    optimal_features_s2 = [f[0] for f in consensus_s2[:15]]
else:
    # Default to consensus features
    optimal_features_s1 = [f[0] for f in consensus_s1[:15]]
    optimal_features_s2 = [f[0] for f in consensus_s2[:15]]

# Create feature selection results dictionary
feature_selection_results = {
    'analysis_summary': {
        'total_features_engineered': X.shape[1],
        'best_performing_method': best_feature_set_name,
        'best_combined_rmse': float(best_set['Combined RMSE']),
        'best_scope1_rmse': float(best_set['Scope 1 RMSE']),
        'best_scope2_rmse': float(best_set['Scope 2 RMSE'])
    },
    'optimal_features': {
        'scope_1_features': optimal_features_s1,
        'scope_2_features': optimal_features_s2,
        'common_features': list(common_features) if len(common_features) > 0 else []
    },
    'method_results': {
        'random_forest_top_features_s1': rf_importance_s1.head(10)['feature'].tolist(),
        'random_forest_top_features_s2': rf_importance_s2.head(10)['feature'].tolist(),
        'lasso_selected_s1': list(best_alpha_s1['selected_features'][:10]),
        'lasso_selected_s2': list(best_alpha_s2['selected_features'][:10]),
        'univariate_top_s1': univariate_scope1.head(10)['feature'].tolist(),
        'univariate_top_s2': univariate_scope2.head(10)['feature'].tolist()
    },
    'feature_importance_insights': {
        'revenue_features_important': any('revenue' in f for f in optimal_features_s1[:5]),
        'sector_features_important': any('sector' in f for f in optimal_features_s1[:5]),
        'environmental_score_important': any('environmental' in f for f in optimal_features_s1[:5]),
        'geographic_features_important': any(any(geo in f for geo in ['region', 'country']) for f in optimal_features_s1[:5])
    }
}

# Save to JSON file
with open('notebooks/feature_selection_results.json', 'w') as f:
    json.dump(feature_selection_results, f, indent=2)

# Create optimized datasets with selected features
X_optimized_s1 = X[optimal_features_s1].copy()
X_optimized_s2 = X[optimal_features_s2].copy()

# Add entity_id and targets back for easy model training
optimized_data_s1 = X_optimized_s1.copy()
optimized_data_s1['entity_id'] = feature_data['entity_id']
optimized_data_s1['target_scope_1'] = y_scope1

optimized_data_s2 = X_optimized_s2.copy()
optimized_data_s2['entity_id'] = feature_data['entity_id']
optimized_data_s2['target_scope_2'] = y_scope2

# Save optimized datasets
optimized_data_s1.to_pickle('notebooks/optimized_features_scope1.pkl')
optimized_data_s2.to_pickle('notebooks/optimized_features_scope2.pkl')

print("‚úÖ FEATURE SELECTION COMPLETE!")
print("="*50)
print(f"üìÅ Results saved to: 'notebooks/feature_selection_results.json'")
print(f"üìÅ Optimized Scope 1 dataset: 'notebooks/optimized_features_scope1.pkl'")
print(f"üìÅ Optimized Scope 2 dataset: 'notebooks/optimized_features_scope2.pkl'")

print(f"\nüìã FINAL RECOMMENDATIONS:")
print(f"‚Ä¢ Use {len(optimal_features_s1)} features for Scope 1 prediction")
print(f"‚Ä¢ Use {len(optimal_features_s2)} features for Scope 2 prediction")
print(f"‚Ä¢ Expected performance improvement: {((cv_scores_s1.mean() - best_set['Scope 1 RMSE'])/cv_scores_s1.mean()*100):.1f}% for Scope 1")
print(f"‚Ä¢ Focus on feature engineering around: revenue, sector exposure, environmental scores")
print(f"‚Ä¢ Consider ensemble methods combining multiple algorithms")

print(f"\nüéØ TOP 10 MOST IMPORTANT FEATURES FOR EACH TARGET:")
print("Scope 1:", optimal_features_s1[:10])
print("Scope 2:", optimal_features_s2[:10])