In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


# Base tables: one CSV for training rows and one for test rows.
train_df = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Auxiliary tables that the feature-engineering cell joins onto the
# training rows by entity_id.
sector_df = pd.read_csv('./data/revenue_distribution_by_sector.csv')
env_activities_df = pd.read_csv('./data/environmental_activities.csv')
sdg_df = pd.read_csv('./data/sustainable_development_goals.csv')


In [42]:
def create_comprehensive_features(train_data, sector_data, env_data, sdg_data):
    """Build an engineered feature table on top of ``train_data``.

    The input frames are not modified: ``train_data`` and ``env_data`` are
    copied before any column is added or cleaned. Each feature section is
    skipped when its source frame is empty or lacks the column(s) checked by
    that section's guard, so the function can be called with empty
    DataFrames for any of the auxiliary tables.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Base rows. Optional columns used when present: ``region_code``,
        ``country_code``, ``revenue``, ``environmental_score``,
        ``social_score``, ``governance_score``, ``overall_score``. An
        ``entity_id`` column is required whenever any auxiliary table is
        merged.
    sector_data : pandas.DataFrame
        Rows with ``entity_id``, ``nace_level_1_code`` and ``revenue_pct``.
    env_data : pandas.DataFrame
        Rows with ``entity_id``, ``activity_type`` and
        ``env_score_adjustment``.
    sdg_data : pandas.DataFrame
        Rows with ``entity_id`` and ``sdg_id``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``train_data`` (original columns kept, including the raw
        categorical codes) with the engineered columns appended. Missing
        values in the *newly created* columns are filled with 0; original
        columns are left untouched.
    """
    # Work on a copy so the caller's frame is never mutated.
    features = train_data.copy()

    # 1. Geographic features (one-hot encoding; the raw code columns are kept)
    if 'region_code' in features.columns:
        region_dummies = pd.get_dummies(features['region_code'], prefix='region')
        features = pd.concat([features, region_dummies], axis=1)

    if 'country_code' in features.columns:
        country_dummies = pd.get_dummies(features['country_code'], prefix='country')
        features = pd.concat([features, country_dummies], axis=1)

    # 2. Revenue-based features
    if 'revenue' in features.columns:
        features['log_revenue'] = np.log1p(features['revenue'])
        features['revenue_millions'] = features['revenue'] / 1e6
        features['revenue_squared'] = features['revenue'] ** 2

    # 3. Sustainability score interactions
    if 'environmental_score' in features.columns and 'governance_score' in features.columns:
        features['env_gov_interaction'] = features['environmental_score'] * features['governance_score']

    if 'overall_score' in features.columns and 'environmental_score' in features.columns:
        # A zero environmental score is replaced by NaN before dividing, and
        # the resulting NaN ratio is then reported as 0.
        features['overall_env_ratio'] = features['overall_score'] / features['environmental_score'].replace(0, np.nan)
        features['overall_env_ratio'] = features['overall_env_ratio'].fillna(0)

    if all(score in features.columns for score in ['environmental_score', 'social_score', 'governance_score']):
        # NOTE(review): the 0.45 / 0.30 / 0.25 weights are hard-coded here;
        # their origin is not visible in this notebook - confirm the source.
        features['weighted_sustainability'] = (
            0.45 * features['environmental_score'] +
            0.30 * features['social_score'] +
            0.25 * features['governance_score']
        )

    # 4. Sector-based features
    if not sector_data.empty and 'entity_id' in sector_data.columns:
        # One column per NACE level-1 code holding the summed revenue share.
        sector_pivot = sector_data.pivot_table(
            values='revenue_pct',
            index='entity_id',
            columns='nace_level_1_code',
            aggfunc='sum',
            fill_value=0
        ).add_prefix('sector_')

        # Sector diversity metrics: row count and largest single share.
        sector_counts = sector_data.groupby('entity_id').size().rename('sector_diversity')
        sector_max_pct = sector_data.groupby('entity_id')['revenue_pct'].max().rename('max_sector_concentration')

        def calculate_entropy(x):
            # Entropy (natural log) of the normalised shares; the epsilon
            # keeps both the division and the log finite for zero inputs.
            x_normalized = x / (x.sum() + 1e-10)
            return -np.sum(x_normalized * np.log(x_normalized + 1e-10))

        sector_entropy = sector_data.groupby('entity_id')['revenue_pct'].apply(
            calculate_entropy
        ).rename('sector_entropy')

        # Merge sector features
        sector_features = [sector_pivot, sector_counts, sector_max_pct, sector_entropy]
        for sector_feat in sector_features:
            features = features.merge(sector_feat, left_on='entity_id', right_index=True, how='left')

    # 5. Environmental activities features
    if not env_data.empty and 'entity_id' in env_data.columns:

        # Clean activity_type (surrounding whitespace, non-breaking spaces)
        # on a copy so the caller's frame is not modified.
        env_data = env_data.copy()
        env_data['activity_type'] = (
            env_data['activity_type']
            .astype(str)
            .str.strip()
            .str.replace('\xa0', '', regex=False)
        )

        # Aggregations; std is NaN for single-row groups, hence the fillna.
        env_agg = env_data.groupby('entity_id').agg({
            'env_score_adjustment': ['sum', 'mean', 'count', 'std']
        }).fillna(0)
        env_agg.columns = ['env_adj_sum', 'env_adj_mean', 'env_activities_count', 'env_adj_std']

        features = features.merge(env_agg, left_on='entity_id', right_index=True, how='left')

        # Unique activity types
        env_types = env_data.groupby('entity_id')['activity_type'].nunique().rename('env_activity_types')
        features = features.merge(env_types, left_on='entity_id', right_index=True, how='left')

        # Sums and counts split by sign: positive adjustments feed the
        # "harmful" columns and negative adjustments the "beneficial" ones.
        env_harmful = env_data[env_data['env_score_adjustment'] > 0].groupby('entity_id')['env_score_adjustment'].sum().rename('env_adj_harmful_sum')
        env_beneficial = env_data[env_data['env_score_adjustment'] < 0].groupby('entity_id')['env_score_adjustment'].sum().rename('env_adj_beneficial_sum')
        env_num_harmful = (env_data['env_score_adjustment'] > 0).groupby(env_data['entity_id']).sum().rename('env_num_harmful')
        env_num_beneficial = (env_data['env_score_adjustment'] < 0).groupby(env_data['entity_id']).sum().rename('env_num_beneficial')

        env_net = env_harmful.add(env_beneficial, fill_value=0).rename('env_adj_net')

        for env_feat in [env_harmful, env_beneficial, env_net, env_num_harmful, env_num_beneficial]:
            features = features.merge(env_feat, left_on='entity_id', right_index=True, how='left')

        # One-hot encode activity_type and aggregate per entity. This must
        # stay inside the guard: it depends on env_data having been
        # validated and cleaned above.
        activity_dummies = pd.get_dummies(
            env_data['activity_type'],
            prefix='env_acttype',
            dtype=int
        )

        # Add entity_id back to align
        activity_dummies = pd.concat([env_data[['entity_id']], activity_dummies], axis=1)

        # Sum per entity (so multiple activities add up)
        activity_counts = activity_dummies.groupby('entity_id').sum()

        # Avoid column overlap when merging
        non_overlapping_cols = [col for col in activity_counts.columns if col not in features.columns]
        activity_counts = activity_counts[non_overlapping_cols]

        features = features.merge(activity_counts, left_on='entity_id', right_index=True, how='left')

    # 6. SDG features (both columns are checked up front because the
    # aggregation below reads sdg_id)
    if not sdg_data.empty and {'entity_id', 'sdg_id'}.issubset(sdg_data.columns):
        sdg_agg = sdg_data.groupby('entity_id').agg({
            'sdg_id': ['count', 'nunique']
        }).fillna(0)
        sdg_agg.columns = ['sdg_commitments', 'unique_sdgs']

        # Climate-related SDGs (6, 7, 13, 14, 15)
        climate_sdgs = sdg_data[sdg_data['sdg_id'].isin([6, 7, 13, 14, 15])]
        climate_sdg_count = climate_sdgs.groupby('entity_id').size().rename('climate_sdg_count')
        features = features.merge(climate_sdg_count, left_on='entity_id', right_index=True, how='left')

        features = features.merge(sdg_agg, left_on='entity_id', right_index=True, how='left')

    # Fill missing values only in the columns created by this function;
    # NaNs in the original train_data columns are preserved.
    original_columns = set(train_data.columns)
    new_columns = [col for col in features.columns if col not in original_columns]
    features[new_columns] = features[new_columns].fillna(0)

    return features



# Run the feature-engineering function on the training rows and the three
# auxiliary tables loaded earlier in the notebook.
print("Creating comprehensive feature set...")
feature_data = create_comprehensive_features(train_df, sector_df, env_activities_df, sdg_df)

# Overall size: every column, then every column except the id and the two
# target_scope_* columns.
print(f"Total features created: {feature_data.shape[1]}")
print(f"Feature columns: {sum(col not in ['entity_id', 'target_scope_1', 'target_scope_2'] for col in feature_data.columns)}")

# Per-category counts. Categories are identified purely by column-name
# prefix or substring, so a column is counted wherever its name matches.
print("\nFeature categories created:")
print(f"- Geographic features: {sum(col.startswith(('region_', 'country_')) for col in feature_data.columns)}")
print(f"- Sector features: {sum(col.startswith('sector_') for col in feature_data.columns)}")
print(f"- Environmental features: {sum(col.startswith('env_') for col in feature_data.columns)}")
print(f"- SDG features: {sum(col.startswith(('sdg_', 'climate_')) for col in feature_data.columns)}")
print(f"- Revenue features: {sum('revenue' in col for col in feature_data.columns)}")
print(f"- Sustainability features: {sum(any(x in col for x in ['score', 'sustainability']) for col in feature_data.columns)}")

# Persist the engineered table (written with the DataFrame index included).
feature_data.to_csv('output_for_enviornmental_FeatureEngineering.csv')

Creating comprehensive feature set...
Total features created: 97
Feature columns: 94

Feature categories created:
- Geographic features: 39
- Sector features: 22
- Environmental features: 19
- SDG features: 2
- Revenue features: 4
- Sustainability features: 5


#### Create and export the feature-engineered dataset for environmental_activities, then use this dataset to find correlations and test hypotheses
* move to EDAonEnvAct.ipynb