## 1. Environment & Imports

In [None]:
import pandas as pd
import numpy as np
import json, os, joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# Notebook-wide plotting defaults: seaborn grid style and a default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 5)
# Status marker is a literal check-mark emoji (the garbled cp1252 rendering is repaired).
print('✅ Environment ready.')

## 2. Load Data

In [None]:
DATA_DIR = 'data'

# Resolve every input CSV relative to the data directory.
test_path, sect_path, env_path, sdg_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        'test.csv',
        'revenue_distribution_by_sector.csv',
        'environmental_activities.csv',
        'sustainable_development_goals.csv',
    )
)

# The first three tables are required; a missing file raises here.
test = pd.read_csv(test_path)
sect = pd.read_csv(sect_path)
env = pd.read_csv(env_path)

# The SDG table is optional: fall back to an empty frame when the file is absent.
if os.path.exists(sdg_path):
    sdg = pd.read_csv(sdg_path)
else:
    sdg = pd.DataFrame()

print(f'Test Shape: {test.shape}; Sector rows: {sect.shape[0]}; Env rows: {env.shape[0]}')

## 3. Feature Engineering Helpers (Mirrors Training)

In [None]:
def engineer_features(df_main, df_sect, df_env, df_sdg):
    """Build the per-entity feature frame used by the models.

    Args:
        df_main: One row per entity. Columns read here: ``entity_id``,
            ``revenue``, ``region_code``, ``social_score``,
            ``environmental_score``.
        df_sect: Sector revenue breakdown. Columns read here: ``entity_id``,
            ``nace_level_1_code``, ``revenue_pct``.
        df_env: Environmental activities. Columns read here: ``entity_id``,
            ``activity_code``, ``env_score_adjustment``.
        df_sdg: Accepted for signature compatibility; not read by this
            function.

    Returns:
        A copy of ``df_main`` (inputs are not modified) with these columns
        added: ``high_intensity_revenue_share``, ``env_activity_count``,
        ``env_net_adj``, ``log_revenue``, ``log_rev_x_intensity``, one
        ``log_rev_x_region_<REGION>`` column per known region, and
        ``soc_env_gap``.
    """
    df = df_main.copy()

    # Share of revenue coming from the sectors flagged as high-emission.
    # A masked group-sum replaces the per-group ``apply(lambda ...)``: it
    # yields the same totals, stays vectorized, and still returns a Series
    # (so ``to_frame`` works) when ``df_sect`` has no rows.
    high_emission_sectors = ['B', 'C', 'D', 'H']
    is_high_intensity = df_sect['nace_level_1_code'].isin(high_emission_sectors)
    intensity_feat = (
        df_sect['revenue_pct']
        .where(is_high_intensity, 0)
        .groupby(df_sect['entity_id'])
        .sum()
        .rename('high_intensity_revenue_share')
        .to_frame()
    )

    # Per-entity count of activities and net environmental score adjustment.
    env_feats = df_env.groupby('entity_id').agg(
        env_activity_count=('activity_code', 'count'),
        env_net_adj=('env_score_adjustment', 'sum'),
    )

    # Both aggregates are indexed by entity_id; left joins keep every row of
    # the main table.
    df = (
        df.merge(intensity_feat, on='entity_id', how='left')
          .merge(env_feats, on='entity_id', how='left')
    )

    # Entities absent from the sector / activity tables get zeros.
    for col in ('high_intensity_revenue_share', 'env_activity_count', 'env_net_adj'):
        df[col] = df[col].fillna(0)

    df['log_revenue'] = np.log1p(df['revenue'])
    df['log_rev_x_intensity'] = df['log_revenue'] * df['high_intensity_revenue_share']

    # Interaction of log-revenue with a one-hot indicator per known region;
    # rows with any other region code get 0 in every interaction column.
    known_regions = ['ANZ', 'CAR', 'EA', 'EEU', 'LATAM', 'NAM', 'WEU']
    for region in known_regions:
        in_region = (df['region_code'] == region).astype(int)
        df[f'log_rev_x_region_{region}'] = df['log_revenue'] * in_region

    df['soc_env_gap'] = df['social_score'] - df['environmental_score']
    return df

def add_target_encoding(df, df_sect, mapping):
    """Attach ``sector_implied_target`` based on each entity's top-revenue sector.

    For every entity the sector row with the largest ``revenue_pct`` is kept,
    its ``nace_level_1_code`` is looked up in ``mapping``, and the result is
    stored as ``sector_implied_target``. Where no sector row exists or the
    code is missing from ``mapping``, the entity's ``log_revenue`` is used
    instead. The helper sector-code column is removed before returning.
    """
    # Highest revenue share first, so the first row kept per entity is its
    # dominant sector.
    ranked = df_sect.sort_values('revenue_pct', ascending=False)
    dominant = ranked.drop_duplicates('entity_id')[['entity_id', 'nace_level_1_code']]

    merged = df.merge(dominant, on='entity_id', how='left')

    encoded = merged['nace_level_1_code'].map(mapping)
    merged['sector_implied_target'] = encoded.fillna(merged['log_revenue'])

    return merged.drop(columns=['nace_level_1_code'])
# Status marker is a literal check-mark emoji (the garbled cp1252 rendering is repaired).
print('✅ Feature engineering helpers ready.')

## 4. Load Persisted Models & Artifacts

In [None]:
# Load the persisted estimators from disk.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted artifacts.
model_s1 = joblib.load('models/model_scope1_base.joblib')
model_s2 = joblib.load('models/model_scope2_base.joblib')
model_te_s1 = joblib.load('models/model_scope1_te.joblib')

# Read the JSON artifacts as UTF-8 explicitly so decoding does not depend on
# the platform's default text encoding.
with open('models/sector_mapping_s1.json', encoding='utf-8') as f:
    sector_map = json.load(f)
with open('models/blend_config.json', encoding='utf-8') as f:
    blend_cfg = json.load(f)

print('✅ Models & mappings loaded.')
# Bare expression: displays the blend configuration as the cell output.
blend_cfg

## 5. Engineer Features for Unseen Test

In [None]:
# Base feature frame for the unseen test entities.
X_test_base = engineer_features(
    df_main=test,
    df_sect=sect,
    df_env=env,
    df_sdg=sdg,
)
# Same frame plus the sector-implied target column for the target-encoded model.
X_test_te = add_target_encoding(
    df=X_test_base,
    df_sect=sect,
    mapping=sector_map,
)
print(f'Engineered feature shape (base): {X_test_base.shape}; with TE: {X_test_te.shape}')

## 6. Predict & Blend

In [None]:
# Raw model outputs; they are inverted with expm1 below, i.e. treated as
# log1p-scale predictions.
p_s1_base_log, p_s2_base_log = (
    fitted.predict(X_test_base) for fitted in (model_s1, model_s2)
)
p_s1_te_log = model_te_s1.predict(X_test_te)

# Invert log1p transformation
p_s1_base, p_s1_te, p_s2_base = map(
    np.expm1, (p_s1_base_log, p_s1_te_log, p_s2_base_log)
)

# Scope 1 is a weighted blend of the base and target-encoded models, with
# weights read from the blend config.
w_base, w_te = (
    blend_cfg['scope1_blend_weights'][weight_key]
    for weight_key in ('base', 'target_encoded')
)
final_s1 = w_base * p_s1_base + w_te * p_s1_te
final_s2 = p_s2_base  # base only per config

inference_submission = pd.DataFrame(
    {
        'entity_id': test['entity_id'],
        'target_scope_1': final_s1,
        'target_scope_2': final_s2,
    }
)
# Bare expression: previews the first rows as the cell output.
inference_submission.head()

## 7. Save Submission

In [None]:
# Write the predictions without the DataFrame index column.
inference_submission.to_csv('inference_submission.csv', index=False)
# Status marker is a literal floppy-disk emoji (the garbled cp1252 rendering is repaired).
print('💾 Saved predictions to inference_submission.csv')

## 8. Diagnostics (Feature Importance & Distribution)

In [None]:
def quick_numeric_names(df):
    """Return the names of the numeric columns of ``df``, leaving out ``entity_id``."""
    numeric_cols = df.select_dtypes(include=['number']).columns
    excluded = ['entity_id']
    return [col for col in numeric_cols if col not in excluded]
feat_names = quick_numeric_names(X_test_base)
# NOTE(review): names come from the raw numeric columns, while the importances
# come from the pipeline's final 'est' step. If earlier pipeline steps reorder
# or expand columns, the labels below may not line up with the importances;
# both sequences are simply truncated to a common length. Confirm against the
# training pipeline.
imps = model_s1.named_steps['est'].feature_importances_[:len(feat_names)]
imp_df = pd.DataFrame({'Feature': feat_names[:len(imps)], 'Importance': imps})
imp_df = imp_df.sort_values('Importance', ascending=False).head(10)

# Top-10 importances for the base Scope 1 model.
plt.figure(figsize=(8, 4))
sns.barplot(x='Importance', y='Feature', data=imp_df, palette='viridis')
plt.title('Top Scope 1 Predictors (Base)')
plt.show()

# Histograms of the final predictions for each scope.
sns.histplot(final_s1, bins=30, color='steelblue')
plt.title('Distribution: Predicted Scope 1')
plt.show()

sns.histplot(final_s2, bins=30, color='darkorange')
plt.title('Distribution: Predicted Scope 2')
plt.show()

## 9. Completion
Inference complete. Provide `inference_submission.csv` as your output.