In [1]:
# Import Libraries & Requirements
!pip install -r requirements.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os



In [2]:
# Load CSV Data Set
environmental_activities_df = pd.read_csv('data/environmental_activities.csv')
revenue_distribution_by_sector_df = pd.read_csv('data/revenue_distribution_by_sector.csv')
sustainable_development_goals_df = pd.read_csv('data/sustainable_development_goals.csv')
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
train_df.head()

Unnamed: 0,entity_id,region_code,region_name,country_code,country_name,revenue,overall_score,environmental_score,social_score,governance_score,target_scope_1,target_scope_2
0,1782,WEU,Western Europe,GB,United Kingdom of Great Britain and Northern I...,352806000.0,2.988,3.9,1.75,2.833,60.0,0.0
1,3918,NAM,Northern America,US,United States of America,1513700000.0,2.77,3.004,2.942,2.143,265.0,0.0
2,10299,WEU,Western Europe,FR,France,1560000000.0,2.501,2.979,2.56,1.571,1136.0,0.0
3,2324,NAM,Northern America,US,United States of America,12385110000.0,3.207,3.776,3.0,2.429,1468.0,0.0
4,1206,WEU,Western Europe,ES,Spain,2980000000.0,1.998,2.138,1.785,2.0,1802.0,0.0


In [4]:
train_df.shape

(429, 12)

In [5]:
train_df.columns

Index(['entity_id', 'region_code', 'region_name', 'country_code',
       'country_name', 'revenue', 'overall_score', 'environmental_score',
       'social_score', 'governance_score', 'target_scope_1', 'target_scope_2'],
      dtype='object')

In [6]:
train_df.dtypes

entity_id                int64
region_code             object
region_name             object
country_code            object
country_name            object
revenue                float64
overall_score          float64
environmental_score    float64
social_score           float64
governance_score       float64
target_scope_1         float64
target_scope_2         float64
dtype: object

In [7]:
test_df.head()

Unnamed: 0,entity_id,region_code,region_name,country_code,country_name,revenue,overall_score,environmental_score,social_score,governance_score
0,1076,WEU,Western Europe,NL,Netherlands,1670000000.0,3.17,3.94,2.692,2.357
1,2067,WEU,Western Europe,GB,United Kingdom of Great Britain and Northern I...,588000000.0,2.976,4.0,2.014,2.286
2,910,WEU,Western Europe,DE,Germany,1218100000.0,2.835,3.258,2.955,1.929
3,4082,WEU,Western Europe,DE,Germany,5037500000.0,2.861,3.36,2.95,1.857
4,4102,WEU,Western Europe,SE,Sweden,1415400000.0,2.95,3.55,2.9,1.929


In [8]:
test_df.shape

(49, 10)

In [9]:
environmental_activities_df.head()

Unnamed: 0,entity_id,activity_type,activity_code,env_score_adjustment
0,2709,Transportation,M.70.4.P,0.05
1,107,Operation,MTH002,-0.1
2,10045,Operation,MTH002,-0.1
3,2709,End-use,J.58.16.B,-0.05
4,2677,Operation,J.58.20.P,0.1


In [10]:
environmental_activities_df.shape

(355, 4)

In [11]:
environmental_activities_df.columns

Index(['entity_id', 'activity_type', 'activity_code', 'env_score_adjustment'], dtype='object')

In [12]:
environmental_activities_df.dtypes

entity_id                 int64
activity_type            object
activity_code            object
env_score_adjustment    float64
dtype: object

In [13]:
revenue_distribution_by_sector_df.head()

Unnamed: 0,entity_id,nace_level_1_code,nace_level_1_name,nace_level_2_code,nace_level_2_name,revenue_pct
0,1735,A,"Agriculture, Forestry And Fishing",1,"Crop and animal production, hunting and relate...",0.031105
1,1195,A,"Agriculture, Forestry And Fishing",1,"Crop and animal production, hunting and relate...",0.362906
2,4092,A,"Agriculture, Forestry And Fishing",1,"Crop and animal production, hunting and relate...",0.22219
3,3592,A,"Agriculture, Forestry And Fishing",1,"Crop and animal production, hunting and relate...",0.063879
4,3169,A,"Agriculture, Forestry And Fishing",1,"Crop and animal production, hunting and relate...",0.42281


In [14]:
revenue_distribution_by_sector_df.shape

(799, 6)

In [15]:
revenue_distribution_by_sector_df.columns

Index(['entity_id', 'nace_level_1_code', 'nace_level_1_name',
       'nace_level_2_code', 'nace_level_2_name', 'revenue_pct'],
      dtype='object')

In [16]:
revenue_distribution_by_sector_df.dtypes

entity_id              int64
nace_level_1_code     object
nace_level_1_name     object
nace_level_2_code      int64
nace_level_2_name     object
revenue_pct          float64
dtype: object

In [17]:
sustainable_development_goals_df.head

<bound method NDFrame.head of      entity_id  sdg_id                                 sdg_name
0           29       9  Industry, Innovation and Infrastructure
1           46      12   Responsible Consumption and Production
2           46       7              Affordable and Clean Energy
3           63       3                Good Health and Wellbeing
4          106       9  Industry, Innovation and Infrastructure
..         ...     ...                                      ...
160      10451       3                Good Health and Wellbeing
161      10704       9  Industry, Innovation and Infrastructure
162      10727       3                Good Health and Wellbeing
163      10760       3                Good Health and Wellbeing
164      10764       3                Good Health and Wellbeing

[165 rows x 3 columns]>

In [18]:
sustainable_development_goals_df.shape

(165, 3)

In [19]:
sustainable_development_goals_df.columns

Index(['entity_id', 'sdg_id', 'sdg_name'], dtype='object')

In [20]:
sustainable_development_goals_df.dtypes

entity_id     int64
sdg_id        int64
sdg_name     object
dtype: object

In [21]:
# Data Cleaning - Check null values

def analyze_missing_values(df, dataset_name):
    """
    Comprehensive missing value analysis for a dataframe
    
    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe to analyze
    dataset_name : str
        Name of the dataset for display purposes
    
    Returns:
    --------
    pandas DataFrame with missing value statistics
    """
    
    print(f"\n{'='*60}")
    print(f"MISSING VALUE ANALYSIS: {dataset_name}")
    print(f"{'='*60}\n")
    
    # Calculate missing values
    missing_count = df.isnull().sum()
    missing_pct = (df.isnull().sum() / len(df)) * 100
    dtypes = df.dtypes
    
    # Create summary dataframe
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'Data_Type': dtypes.values,
        'Missing_Count': missing_count.values,
        'Missing_Percentage': missing_pct.values,
        'Non_Missing_Count': len(df) - missing_count.values
    })
    
    # Sort by missing percentage descending
    missing_df = missing_df.sort_values('Missing_Percentage', ascending=False)
    
    # Filter to show only columns with missing values
    missing_cols = missing_df[missing_df['Missing_Count'] > 0]
    
    if len(missing_cols) == 0:
        print("âœ“ No missing values found in this dataset!\n")
    else:
        print(f"âš  Found {len(missing_cols)} columns with missing values:\n")
        print(missing_cols.to_string(index=False))
        print(f"\nTotal missing values: {missing_count.sum():,}")
        print(f"Percentage of dataset with any missing value: {(df.isnull().any(axis=1).sum() / len(df) * 100):.2f}%")
    
    return missing_df

In [22]:
# Analyze train dataset
train_missing = analyze_missing_values(train_df, "TRAIN_DF")
test_missing = analyze_missing_values(test_df, "TEST_DF")
environmental_activities_missing = analyze_missing_values(environmental_activities_df, "ENVIRONMENTAL_DF")
revenue_distribution_by_sector_missing = analyze_missing_values(revenue_distribution_by_sector_df, "TRAIN_DF")
sustainable_development_goals_missing = analyze_missing_values(sustainable_development_goals_df, "TRAIN_DF")


MISSING VALUE ANALYSIS: TRAIN_DF

âœ“ No missing values found in this dataset!


MISSING VALUE ANALYSIS: TEST_DF

âœ“ No missing values found in this dataset!


MISSING VALUE ANALYSIS: ENVIRONMENTAL_DF

âœ“ No missing values found in this dataset!


MISSING VALUE ANALYSIS: TRAIN_DF

âœ“ No missing values found in this dataset!


MISSING VALUE ANALYSIS: TRAIN_DF

âœ“ No missing values found in this dataset!



In [24]:
def analyze_duplicates(df, dataset_name):
    """
    Comprehensive duplicate row analysis for a dataframe
    
    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe to analyze
    dataset_name : str
        Name of the dataset for display purposes
    
    Returns:
    --------
    pandas DataFrame with duplicate statistics
    """
    
    print(f"\n{'='*60}")
    print(f"DUPLICATE ROW ANALYSIS: {dataset_name}")
    print(f"{'='*60}\n")
    
    # Count full-row duplicates
    duplicate_mask = df.duplicated(keep=False)
    total_duplicates = duplicate_mask.sum()
    unique_duplicate_rows = df[duplicate_mask].drop_duplicates()
    
    if total_duplicates == 0:
        print("âœ“ No duplicate rows found in this dataset!\n")
        return pd.DataFrame()  # empty result
    
    # Count how many times each duplicated row appears
    duplicate_summary = (
        df[df.duplicated(keep=False)]
        .value_counts()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )
    
    print(f"âš  Found {len(unique_duplicate_rows)} unique duplicated rows.")
    print(f"Total duplicated entries (including repeats): {total_duplicates}\n")
    
    print("Most common duplicate patterns:")
    display(duplicate_summary.head(10))
    
    return duplicate_summary


In [25]:
# Run the duplicate-value method on each dataset
train_duplicates = analyze_duplicates(train_df, "TRAIN_DF")
test_duplicates = analyze_duplicates(test_df, "TEST_DF")
environmental_activities_duplicates = analyze_duplicates(environmental_activities_df, "ENVIRONMENTAL_DF")
revenue_distribution_by_sector_duplicates = analyze_duplicates(revenue_distribution_by_sector_df, "REVENUE_DF")
sustainable_development_goals_duplicates = analyze_duplicates(sustainable_development_goals_df, "SDG_DF")


DUPLICATE ROW ANALYSIS: TRAIN_DF

âœ“ No duplicate rows found in this dataset!


DUPLICATE ROW ANALYSIS: TEST_DF

âœ“ No duplicate rows found in this dataset!


DUPLICATE ROW ANALYSIS: ENVIRONMENTAL_DF

âœ“ No duplicate rows found in this dataset!


DUPLICATE ROW ANALYSIS: REVENUE_DF

âœ“ No duplicate rows found in this dataset!


DUPLICATE ROW ANALYSIS: SDG_DF

âœ“ No duplicate rows found in this dataset!



In [26]:
# Feature Engineering - revenue_distribution_by_sector_df CHECKING REVENUE DISTRIBUTION COVERAGE

# Get unique entity_ids from each dataset
train_entities = set(train_df['entity_id'].unique())
test_entities = set(test_df['entity_id'].unique())
revenue_entities = set(revenue_distribution_by_sector_df['entity_id'].unique())

print(f"Unique entities in train: {len(train_entities):,}")
print(f"Unique entities in test: {len(test_entities):,}")
print(f"Unique entities in revenue_distribution: {len(revenue_entities):,}")

# Check coverage
train_covered = train_entities & revenue_entities
test_covered = test_entities & revenue_entities

train_missing = train_entities - revenue_entities
test_missing = test_entities - revenue_entities

print("\n" + "="*60)
print("COVERAGE ANALYSIS")
print("="*60)

print(f"\nTRAIN dataset:")
print(f"  Entities WITH revenue data: {len(train_covered):,} ({len(train_covered)/len(train_entities)*100:.2f}%)")
print(f"  Entities WITHOUT revenue data: {len(train_missing):,} ({len(train_missing)/len(train_entities)*100:.2f}%)")

print(f"\nTEST dataset:")
print(f"  Entities WITH revenue data: {len(test_covered):,} ({len(test_covered)/len(test_entities)*100:.2f}%)")
print(f"  Entities WITHOUT revenue data: {len(test_missing):,} ({len(test_missing)/len(test_entities)*100:.2f}%)")

# Check if there are entities in revenue table that aren't in train/test
orphan_entities = revenue_entities - train_entities - test_entities
print(f"\nEntities in revenue table but NOT in train/test: {len(orphan_entities):,}")

print("\n" + "="*60)
print("IMPLICATIONS")
print("="*60)

if len(train_missing) > 0 or len(test_missing) > 0:
    print("\nâš  IMPORTANT: Some entities don't have revenue distribution data!")
    print("\nWhen merging, you have TWO options:\n")
    
    print("OPTION 1: Left join + Fill missing values with defaults")
    print("  - Use how='left' when merging")
    print("  - Fill NaN values with appropriate defaults:")
    print("    â€¢ Numeric features (HHI, percentages): 0 or median")
    print("    â€¢ Categorical (dominant sector): 'UNKNOWN' or most common")
    print("  - Create a 'missing_revenue_data' flag feature")
    
    print("\nOPTION 2: Inner join (only use entities with complete data)")
    print("  - Use how='inner' when merging")
    print("  - Loses entities without revenue data")
    print(f"  - Would lose {len(train_missing)} train and {len(test_missing)} test samples")
    
    print("\nðŸ’¡ RECOMMENDATION:")
    print("  Use OPTION 1 (left join + imputation) because:")
    print("  - You need to predict ALL test entities")
    print("  - Missing revenue data itself might be a signal")
    print("  - Can't afford to lose test samples")
else:
    print("\nâœ“ Perfect coverage! All entities have revenue distribution data.")


Unique entities in train: 429
Unique entities in test: 49
Unique entities in revenue_distribution: 478

COVERAGE ANALYSIS

TRAIN dataset:
  Entities WITH revenue data: 429 (100.00%)
  Entities WITHOUT revenue data: 0 (0.00%)

TEST dataset:
  Entities WITH revenue data: 49 (100.00%)
  Entities WITHOUT revenue data: 0 (0.00%)

Entities in revenue table but NOT in train/test: 0

IMPLICATIONS

âœ“ Perfect coverage! All entities have revenue distribution data.


In [27]:
# Feature Engineering - sustainable_development_goals CHECKING SUSTAINABLE DISTRIBUTION COVERAGE

# Get unique entity_ids from each dataset
sustain_entities = set(sustainable_development_goals_df['entity_id'].unique())

print(f"Unique entities in train: {len(train_entities):,}")
print(f"Unique entities in test: {len(test_entities):,}")
print(f"Unique entities in sustain_distribution: {len(sustain_entities):,}")

# Check coverage
train_covered = train_entities & sustain_entities
test_covered = test_entities & sustain_entities

train_missing = train_entities - sustain_entities
test_missing = test_entities - sustain_entities

print("\n" + "="*60)
print("COVERAGE ANALYSIS")
print("="*60)

print(f"\nTRAIN dataset:")
print(f"  Entities WITH sustainable_development_goals data: {len(train_covered):,} ({len(train_covered)/len(train_entities)*100:.2f}%)")
print(f"  Entities WITHOUT sustainable_development_goals data: {len(train_missing):,} ({len(train_missing)/len(train_entities)*100:.2f}%)")

print(f"\nTEST dataset:")
print(f"  Entities WITH sustainable_development_goals data: {len(test_covered):,} ({len(test_covered)/len(test_entities)*100:.2f}%)")
print(f"  Entities WITHOUT sustainable_development_goals data: {len(test_missing):,} ({len(test_missing)/len(test_entities)*100:.2f}%)")

# Check if there are entities in sustain table that aren't in train/test
orphan_entities = sustain_entities - train_entities - test_entities
print(f"\nEntities in sustain table but NOT in train/test: {len(orphan_entities):,}")

print("\n" + "="*60)
print("IMPLICATIONS")
print("="*60)

if len(train_missing) > 0 or len(test_missing) > 0:
    print("\nâš  IMPORTANT: Some entities don't have sustain distribution data!")
else:
    print("\nâœ“ Perfect coverage! All entities have sustain distribution data.")


Unique entities in train: 429
Unique entities in test: 49
Unique entities in sustain_distribution: 130

COVERAGE ANALYSIS

TRAIN dataset:
  Entities WITH sustainable_development_goals data: 118 (27.51%)
  Entities WITHOUT sustainable_development_goals data: 311 (72.49%)

TEST dataset:
  Entities WITH sustainable_development_goals data: 12 (24.49%)
  Entities WITHOUT sustainable_development_goals data: 37 (75.51%)

Entities in sustain table but NOT in train/test: 0

IMPLICATIONS

âš  IMPORTANT: Some entities don't have sustain distribution data!


In [28]:
print("\n" + "="*60)
print("ISOLATED DATASET OF ENTITIES IN TRAIN DATASET with NO sustainable_development_goals data ")
print("="*60)
Train_To_sustain_distribution_unmatched = (
    train_df
    .merge(sustainable_development_goals_df, on="entity_id", how="left", indicator=True)
    .query("_merge == 'left_only'")
    .drop(columns=["_merge"])
)

print(Train_To_sustain_distribution_unmatched.shape)
Train_To_sustain_distribution_unmatched.head()


ISOLATED DATASET OF ENTITIES IN TRAIN DATASET with NO sustainable_development_goals data 
(311, 14)


Unnamed: 0,entity_id,region_code,region_name,country_code,country_name,revenue,overall_score,environmental_score,social_score,governance_score,target_scope_1,target_scope_2,sdg_id,sdg_name
1,3918,NAM,Northern America,US,United States of America,1513700000.0,2.77,3.004,2.942,2.143,265.0,0.0,,
2,10299,WEU,Western Europe,FR,France,1560000000.0,2.501,2.979,2.56,1.571,1136.0,0.0,,
3,2324,NAM,Northern America,US,United States of America,12385110000.0,3.207,3.776,3.0,2.429,1468.0,0.0,,
7,1418,NAM,Northern America,US,United States of America,3588600000.0,2.77,3.083,3.0,1.929,2659.0,0.0,,
9,1494,NAM,Northern America,US,United States of America,14000000000.0,3.383,3.022,4.9,2.214,4319.0,0.0,,


In [29]:
# Feature Engineering - environmental_activities_df
# CHECKING ENVIRONMENT DISTRIBUTION COVERAGE

# Get unique entity_ids from each dataset
environment_entities = set(environmental_activities_df['entity_id'].unique())

print(f"Unique entities in train: {len(train_entities):,}")
print(f"Unique entities in test: {len(test_entities):,}")
print(f"Unique entities in environment_distribution: {len(environment_entities):,}")

# Check coverage
train_covered = train_entities & environment_entities
test_covered = test_entities & environment_entities

train_missing = train_entities - environment_entities
test_missing = test_entities - environment_entities

print("\n" + "="*60)
print("COVERAGE ANALYSIS")
print("="*60)

print(f"\nTRAIN dataset:")
print(f"  Entities WITH environment data: {len(train_covered):,} ({len(train_covered)/len(train_entities)*100:.2f}%)")
print(f"  Entities WITHOUT environment data: {len(train_missing):,} ({len(train_missing)/len(train_entities)*100:.2f}%)")

print(f"\nTEST dataset:")
print(f"  Entities WITH environment data: {len(test_covered):,} ({len(test_covered)/len(test_entities)*100:.2f}%)")
print(f"  Entities WITHOUT environment data: {len(test_missing):,} ({len(test_missing)/len(test_entities)*100:.2f}%)")

# Check if there are entities in environment table that aren't in train/test
orphan_entities = environment_entities - train_entities - test_entities
print(f"\nEntities in environment table but NOT in train/test: {len(orphan_entities):,}")

print("\n" + "="*60)
print("IMPLICATIONS")
print("="*60)

if len(train_missing) > 0 or len(test_missing) > 0:
    print("\nâš  IMPORTANT: Some entities don't have environment distribution data!")
    print("\nWhen merging, you have TWO options:\n")
    
    print("OPTION 1: Left join + Fill missing values with defaults")
    print("  - Use how='left' when merging")
    print("  - Fill NaN values with appropriate defaults:")
    print("    â€¢ Numeric features (HHI, percentages): 0 or median")
    print("    â€¢ Categorical (dominant sector): 'UNKNOWN' or most common")
    print("  - Create a 'missing_environment_data' flag feature")
    
    print("\nOPTION 2: Inner join (only use entities with complete data)")
    print("  - Use how='inner' when merging")
    print("  - Loses entities without environment data")
    print(f"  - Would lose {len(train_missing)} train and {len(test_missing)} test samples")
    
    print("\nðŸ’¡ RECOMMENDATION:")
    print("  Use OPTION 1 (left join + imputation) because:")
    print("  - You need to predict ALL test entities")
    print("  - Missing environment data itself might be a signal")
    print("  - Can't afford to lose test samples")
else:
    print("\nâœ“ Perfect coverage! All entities have environment distribution data.")

Unique entities in train: 429
Unique entities in test: 49
Unique entities in environment_distribution: 260

COVERAGE ANALYSIS

TRAIN dataset:
  Entities WITH environment data: 237 (55.24%)
  Entities WITHOUT environment data: 192 (44.76%)

TEST dataset:
  Entities WITH environment data: 23 (46.94%)
  Entities WITHOUT environment data: 26 (53.06%)

Entities in environment table but NOT in train/test: 0

IMPLICATIONS

âš  IMPORTANT: Some entities don't have environment distribution data!

When merging, you have TWO options:

OPTION 1: Left join + Fill missing values with defaults
  - Use how='left' when merging
  - Fill NaN values with appropriate defaults:
    â€¢ Numeric features (HHI, percentages): 0 or median
    â€¢ Categorical (dominant sector): 'UNKNOWN' or most common
  - Create a 'missing_environment_data' flag feature

OPTION 2: Inner join (only use entities with complete data)
  - Use how='inner' when merging
  - Loses entities without environment data
  - Would lose 192 train

In [30]:
print("\n" + "="*60)
print("ISOLATED DATASET OF ENTITIES IN TRAIN DATASET with NO environmental_activities data ")
print("="*60)
Train_To_environmental_activities_unmatched = (
    train_df
    .merge(environmental_activities_df, on="entity_id", how="left", indicator=True)
    .query("_merge == 'left_only'")
    .drop(columns=["_merge"])
)

print(Train_To_environmental_activities_unmatched.shape)
Train_To_environmental_activities_unmatched.head()


ISOLATED DATASET OF ENTITIES IN TRAIN DATASET with NO environmental_activities data 
(192, 15)


Unnamed: 0,entity_id,region_code,region_name,country_code,country_name,revenue,overall_score,environmental_score,social_score,governance_score,target_scope_1,target_scope_2,activity_type,activity_code,env_score_adjustment
4,2324,NAM,Northern America,US,United States of America,12385110000.0,3.207,3.776,3.0,2.429,1468.0,0.0,,,
11,1494,NAM,Northern America,US,United States of America,14000000000.0,3.383,3.022,4.9,2.214,4319.0,0.0,,,
13,1331,NAM,Northern America,US,United States of America,2360000000.0,2.499,2.65,2.45,2.286,10725.0,0.0,,,
14,10153,WEU,Western Europe,GB,United Kingdom of Great Britain and Northern I...,1686002000.0,3.5,4.65,2.25,2.929,57313.0,0.0,,,
17,2862,NAM,Northern America,US,United States of America,203366600.0,3.039,3.258,2.9,2.81,6.11,1.76,,,


Hypothesis Analysis

In [32]:
print("="*60)
print("HYPOTHESIS VALIDATION")
print("="*60)

# First, merge revenue features to get sector info
train_with_sectors = train_df.merge(
    revenue_distribution_by_sector_df.groupby('entity_id').first()[['nace_level_1_code', 'nace_level_1_name']], 
    on='entity_id', 
    how='left'
)

# 1. HYPOTHESIS: Sector drives emissions
print("\n1. SECTOR ANALYSIS")
print("-" * 60)
sector_stats = train_with_sectors.groupby('nace_level_1_name').agg({
    'target_scope_1': ['mean', 'median', 'count'],
    'target_scope_2': ['mean', 'median']
}).round(2)
print(sector_stats)

# 2. HYPOTHESIS: Geography matters for Scope 2
print("\n2. GEOGRAPHIC ANALYSIS (Scope 2)")
print("-" * 60)
geo_stats = train_df.groupby('country_name').agg({
    'target_scope_2': ['mean', 'median', 'count']
}).round(2).sort_values(('target_scope_2', 'mean'), ascending=False).head(10)
print(geo_stats)

# 3. HYPOTHESIS: Scope 1 and 2 have different drivers
print("\n3. CORRELATION ANALYSIS")
print("-" * 60)
corr_scope = train_df['target_scope_1'].corr(train_df['target_scope_2'])
print(f"Correlation between Scope 1 and Scope 2: {corr_scope:.3f}")

# 4. HYPOTHESIS: Revenue scales emissions
print("\n4. REVENUE SCALING")
print("-" * 60)
rev_corr_1 = train_df['revenue'].corr(train_df['target_scope_1'])
rev_corr_2 = train_df['revenue'].corr(train_df['target_scope_2'])
print(f"Revenue vs Scope 1 correlation: {rev_corr_1:.3f}")
print(f"Revenue vs Scope 2 correlation: {rev_corr_2:.3f}")

HYPOTHESIS VALIDATION

1. SECTOR ANALYSIS
------------------------------------------------------------
                                                   target_scope_1             \
                                                             mean     median   
nace_level_1_name                                                              
Accommodation And Food Service Activities                84343.50   46376.00   
Administrative And Support Service Activities             2965.61    2965.61   
Agriculture, Forestry And Fishing                       136547.71   64080.00   
Construction                                             69572.73   30240.00   
Education                                                13226.60    2096.00   
Electricity, Gas, Steam And Air Conditioning Su...      109916.86   38064.00   
Financial And Insurance Activities                       15379.00   12537.00   
Financial and insurance activities                        4998.77     611.00   
Human Health And 