In [7]:
# ==============================================================================
# Cell: Simple Baseline Model Comparison (Overall Sector - Subsets)
#
# Loads the ESG dataset, rebuilds the board-gender features, selects one shared
# feature set, then fits three simple baseline regressors (linear regression,
# decision tree, KNN) on the "All Board Types", "All-Men Board" and
# "Diverse Board" subsets and prints RMSE / MAE / R2 for each.
# ==============================================================================
print("--- Starting Baseline Model Comparison for Reviewer Response ---")

import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Configuration ---
# Print result tables in full: no column truncation and a wide console.
pd.options.display.width = 1000
pd.options.display.max_columns = None
# Silence all warnings so the cell output stays readable.
warnings.simplefilter('ignore')

# --- 1. Load Data ---
print("1. Loading Data...")
try:
    # low_memory=False reads the file in one pass instead of in chunks.
    df = pd.read_csv('/kaggle/input/esg-dataset/Environmental.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'Environmental.csv' not found. Please check the file path.")
    # Stop right here with a failure status. The bare `exit()` helper is meant
    # for interactive shells and reports success (status 0).
    # NOTE(review): under a Jupyter kernel `exit()` appears to only request a
    # kernel shutdown instead of halting the cell, so the code below would go
    # on to fail on an undefined `df` -- confirm against the kernel in use.
    raise SystemExit(1)
else:
    print(f"   Dataset loaded successfully. Shape: {df.shape}")

# --- 2. Feature Engineering (Identical to Proposed Method) ---
print("2. Replicating Feature Engineering...")
# Force both source columns to numeric; values that cannot be parsed become NaN.
df['corpgov_cganalyticboardfemale'] = pd.to_numeric(df['corpgov_cganalyticboardfemale'], errors='coerce')
df['corpgov_boardsize'] = pd.to_numeric(df['corpgov_boardsize'], errors='coerce')

# Calculate Blau Index
# The female column is divided by 100, i.e. it is treated as a percentage.
female_prop = df['corpgov_cganalyticboardfemale'] / 100
male_prop = 1 - female_prop
# Rows with a missing female share get a Blau index of 0. The filled Series is
# assigned back explicitly: `df[col].fillna(..., inplace=True)` operates on a
# temporary object and does not update `df` under pandas Copy-on-Write.
df['Blau_Index'] = (1 - (female_prop**2 + male_prop**2)).fillna(0)

# Calculate Critical Mass Dummies
# Estimated head count = share * board size, rounded. Rows where either input
# is NaN compare False everywhere and therefore get 0 in all three dummies.
num_female_directors = (female_prop * df['corpgov_boardsize']).round()
df['has_1_woman'] = (num_female_directors == 1).astype(int)
df['has_2_women'] = (num_female_directors == 2).astype(int)
df['has_3plus_women'] = (num_female_directors >= 3).astype(int)

# --- 3. Define Constants & Preprocessing Functions ---
# Column the models are trained to predict.
TARGET_COL_NAME = 'corpgov_tresgscore'

# Board-gender features (raw share plus the engineered columns).
GENDER_COLS = [
    'corpgov_cganalyticboardfemale',
    'Blau_Index',
    'has_1_woman',
    'has_2_women',
    'has_3plus_women',
]

# Every score column that could serve as a target; all are removed from the
# feature matrix before modeling.
ALL_POSSIBLE_TARGETS = [
    'corpgov_tresgscore',
    'corpgov_environmentpillarscore',
    'corpgov_socialpillarscore',
    'corpgov_governancepillarscore',
]

def preprocess_data(X_train, X_test, y_train, y_test):
    """Clean, impute and encode one train/test split.

    The target median, missing-value ratios, numeric medians and categorical
    modes are all computed on the training split and then applied to both
    splits. The inputs are copied, never modified.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame.
        y_train: Training target DataFrame containing ``TARGET_COL_NAME``.
        y_test: Test target DataFrame containing ``TARGET_COL_NAME``.

    Returns:
        Tuple ``(X_train_proc, X_test_proc, y_train_proc, y_test_proc)``. Both
        feature frames hold the same columns in the same order (the order of
        the surviving training columns).
    """
    X_train_proc, X_test_proc = X_train.copy(), X_test.copy()
    y_train_proc, y_test_proc = y_train.copy(), y_test.copy()

    # Target Imputation: use the training median; if the training target is
    # entirely missing, fall back to the median of the full dataset.
    median_val = y_train_proc[TARGET_COL_NAME].median()
    if pd.isna(median_val):
        median_val = df[TARGET_COL_NAME].median()
    y_train_proc[TARGET_COL_NAME] = y_train_proc[TARGET_COL_NAME].fillna(median_val)
    y_test_proc[TARGET_COL_NAME] = y_test_proc[TARGET_COL_NAME].fillna(median_val)

    # Drop columns logic: more than 40% missing in train, names containing
    # 'iden', object-typed 'year' columns, and columns with at most one
    # distinct value. Gender columns are always kept.
    missing_percent = X_train_proc.isnull().sum() / len(X_train_proc)
    cols_to_drop_missing = missing_percent[missing_percent > 0.4].index
    cols_to_drop_other = [
        col for col in X_train_proc.columns
        if ('iden' in str(col))
        or ('year' in str(col) and X_train_proc[col].dtype == 'object')
        or X_train_proc[col].nunique() <= 1
    ]
    cols_to_drop = list(set(cols_to_drop_missing) | set(cols_to_drop_other))
    cols_to_drop = [col for col in cols_to_drop if col not in GENDER_COLS]
    X_train_proc = X_train_proc.drop(columns=cols_to_drop)
    X_test_proc = X_test_proc.drop(columns=cols_to_drop, errors='ignore')

    # Numerical Imputation (median learned on train, applied to both splits)
    numerical_cols = X_train_proc.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X_train_proc.select_dtypes(exclude=np.number).columns.tolist()

    # Skip the imputer when no numeric column survived; it cannot be fitted on
    # a frame with zero columns.
    if numerical_cols:
        num_imputer = SimpleImputer(strategy='median')
        X_train_proc[numerical_cols] = num_imputer.fit_transform(X_train_proc[numerical_cols])
        X_test_proc[numerical_cols] = num_imputer.transform(X_test_proc[numerical_cols])

    # Categorical Encoding
    for col in categorical_cols:
        mode_val = X_train_proc[col].mode()[0]
        X_train_proc[col] = X_train_proc[col].fillna(mode_val)
        X_test_proc[col] = X_test_proc[col].fillna(mode_val)
        # The encoder is fitted on the labels of both splits so that
        # transforming the test split never meets an unknown label.
        le = LabelEncoder().fit(pd.concat([X_train_proc[col].astype(str), X_test_proc[col].astype(str)]).unique())
        X_train_proc[col] = le.transform(X_train_proc[col].astype(str))
        X_test_proc[col] = le.transform(X_test_proc[col].astype(str))

    # Keep the columns present in both splits, in training-column order.
    # Taking the order from a set intersection would make the column order
    # (and anything downstream that depends on it) vary between sessions.
    test_cols = set(X_test_proc.columns)
    shared_cols = [col for col in X_train_proc.columns if col in test_cols]
    return X_train_proc[shared_cols], X_test_proc[shared_cols], y_train_proc, y_test_proc

def get_top_features_combined(X_train, y_train_target, n_features=25):
    """Return the ``n_features`` best columns according to three rankers.

    Every column is scored by univariate F-statistics, by absolute Lasso
    coefficients (fitted on standardized inputs) and by random-forest feature
    importances. Each score is converted to a percentile rank, the three ranks
    are summed, and the columns with the largest sums are returned.

    Args:
        X_train: Preprocessed, fully numeric training features.
        y_train_target: Training target; flattened to one dimension.
        n_features: How many column names to return.

    Returns:
        List of column names, best first.
    """
    target = y_train_target.values.ravel()
    feature_names = X_train.columns

    # Ranker 1: random-forest feature importances.
    forest = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1, max_depth=10)
    forest.fit(X_train, target)
    forest_rank = pd.Series(forest.feature_importances_, index=feature_names).rank(pct=True)

    # Ranker 2: univariate F-scores; missing scores are treated as 0.
    kbest = SelectKBest(f_regression, k='all')
    kbest.fit(X_train, target)
    f_score_rank = pd.Series(kbest.scores_, index=feature_names).fillna(0).rank(pct=True)

    # Ranker 3: absolute Lasso coefficients on standardized features.
    standardized = StandardScaler().fit_transform(X_train)
    lasso_model = Lasso(alpha=0.1, max_iter=1000, random_state=42)
    lasso_model.fit(standardized, target)
    lasso_rank = pd.Series(np.abs(lasso_model.coef_), index=feature_names).fillna(0).rank(pct=True)

    total_rank = f_score_rank + lasso_rank + forest_rank
    return total_rank.nlargest(n_features).index.tolist()

# --- 4. Global Feature Selection (Done once for consistency) ---
print("3. Performing global feature selection (same as proposed method)...")
y_full = df[[TARGET_COL_NAME]]
X_full = df.drop(columns=ALL_POSSIBLE_TARGETS, errors='ignore')
# Only the training part of the 80/20 split is used to choose features.
X_train_full, _, y_train_full, _ = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# Preprocess Global
# preprocess_data needs a test split too; copies of the training frames are
# passed in that slot and the matching outputs are thrown away.
X_train_proc_full, _, y_train_proc_full, _ = preprocess_data(
    X_train_full, X_train_full.copy(), y_train_full, y_train_full.copy()
)
top_25_features = get_top_features_combined(
    X_train_proc_full, y_train_proc_full, n_features=25
)
# Gender columns first, then the selected features; duplicates are removed
# while the first occurrence keeps its position.
final_feature_set = list(dict.fromkeys([*GENDER_COLS, *top_25_features]))
print(f"   Identified {len(final_feature_set)} features for modeling.")

# --- 5. Main Analysis Loop ---
# For every scenario and board-type subset: split 80/20, preprocess, restrict
# to the globally selected features, standardize, fit each baseline model and
# print its test-set RMSE / MAE / R2.
print("\n4. Running Baseline Models for Overall Sector Subsets...")

# Define Simple Models
# The same estimator objects are refit for every subset inside the loop below.
baseline_models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5)
}

# Define Scenarios
scenarios = {"Overall": df}

for scenario_name, scenario_df in scenarios.items():
    
    # Create the 3 specific subsets
    # Rows whose female-share value is missing match neither the == 0 nor the
    # > 0 filter, so they appear only in "All Board Types".
    subsets = {
        "All Board Types": scenario_df.copy(),
        "All-Men Board": scenario_df[scenario_df['corpgov_cganalyticboardfemale'] == 0].copy(),
        "Diverse Board": scenario_df[scenario_df['corpgov_cganalyticboardfemale'] > 0].copy()
    }
    
    for subset_name, subset_df in subsets.items():
        print(f"\nProcessing: {scenario_name} - {subset_name}")
        print(f"   Data Shape: {subset_df.shape}")
        
        # Subsets with fewer than 50 rows are not modeled.
        if subset_df.shape[0] < 50:
            print("   Not enough data to model. Skipping.")
            continue
            
        # 1. Split
        X = subset_df.drop(columns=ALL_POSSIBLE_TARGETS, errors='ignore')
        y = subset_df[[TARGET_COL_NAME]]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # 2. Preprocess
        # NOTE(review): preprocess_data also fills missing *test* targets with
        # the training median, so the metrics below are computed partly on
        # imputed targets -- confirm this is intended.
        X_train_proc, X_test_proc, y_train_proc, y_test_proc = preprocess_data(X_train, X_test, y_train, y_test)
        
        # 3. Select Features
        # Keep only the globally selected features that survived preprocessing
        # in this particular subset.
        available_features = [f for f in final_feature_set if f in X_train_proc.columns and f in X_test_proc.columns]
        X_train_selected = X_train_proc[available_features]
        X_test_selected = X_test_proc[available_features]
        
        # 4. Scale (Required for Linear Regression & KNN)
        # The scaler is fitted on the training split only and reused on test.
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_selected), columns=available_features)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test_selected), columns=available_features)
        
        # 5. Train & Evaluate
        subset_results = []
        for model_name, model in baseline_models.items():
            # Fit
            model.fit(X_train_scaled, y_train_proc.values.ravel())
            
            # Predict
            y_pred = model.predict(X_test_scaled)
            
            # Metrics
            rmse = np.sqrt(mean_squared_error(y_test_proc, y_pred))
            mae = mean_absolute_error(y_test_proc, y_pred)
            r2 = r2_score(y_test_proc, y_pred)
            
            subset_results.append({
                'Model': model_name,
                'RMSE': rmse,
                'MAE': mae,
                'R2 Score': r2
            })
            
        # 6. Show Results
        # One row per model, best (lowest RMSE) first.
        results_df = pd.DataFrame(subset_results).set_index('Model').sort_values('RMSE')
        print(f"--- Results for {scenario_name} - {subset_name} ---")
        print(results_df.round(4))
        print("-" * 60)

print("\n--- Baseline Model Comparison Complete ---")

--- Starting Baseline Model Comparison for Reviewer Response ---
1. Loading Data...
   Dataset loaded successfully. Shape: (21496, 509)
2. Replicating Feature Engineering...
3. Performing global feature selection (same as proposed method)...
   Identified 30 features for modeling.

4. Running Baseline Models for Overall Sector Subsets...

Processing: Overall - All Board Types
   Data Shape: (21496, 513)
--- Results for Overall - All Board Types ---
                     RMSE     MAE  R2 Score
Model                                      
Linear Regression  1.5041  1.0392    0.9899
Decision Tree      1.7706  1.2520    0.9860
KNN                2.7960  1.9564    0.9652
------------------------------------------------------------

Processing: Overall - All-Men Board
   Data Shape: (3181, 513)
--- Results for Overall - All-Men Board ---
                     RMSE     MAE  R2 Score
Model                                      
Linear Regression  1.3432  0.9339    0.9939
Decision Tree      1.7051 