In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_csv('D:/DSS5201/esg_data.csv')
data = data.drop(columns=['CompanyName', 'Industry', 'Sector'])  # Remove non-numeric columns

# Handle missing values (imputation with median)
imputer = SimpleImputer(strategy='median')  # Using median instead of mean
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Features and target variables
X = data.drop(columns=['ESG_Score', 'E_Score', 'S_Score', 'G_Score'])
y = data[['ESG_Score', 'E_Score', 'S_Score', 'G_Score']]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Keep 95% variance
X_scaled = pca.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define hyperparameter grids for Random Forest and XGBoost
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize models
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
ridge = Ridge()
lasso = Lasso()

# ---------------------------------------------------
# 1. Random Forest Model Training & Evaluation for G_Score and E_Score only
best_rf_models = {}

for target in ['G_Score', 'E_Score']:  # Only G_Score and E_Score for Random Forest
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
    grid_search_rf.fit(X_train, y_train[target])
    
    print(f"Random Forest - {target} Best Parameters:", grid_search_rf.best_params_)
    best_rf_models[target] = grid_search_rf.best_estimator_
    
    # Train and evaluate the model
    y_pred_rf = best_rf_models[target].predict(X_test)
    mse_rf = mean_squared_error(y_test[target], y_pred_rf)
    r2_rf = r2_score(y_test[target], y_pred_rf)
    
    print(f'Random Forest - {target} MSE: {mse_rf}')
    print(f'Random Forest - {target} R²: {r2_rf}')
    
    # Save the model
    joblib.dump(best_rf_models[target], f'best_rf_model_{target}.pkl')

# ---------------------------------------------------
# 2. XGBoost Model Training & Evaluation for All Targets
best_xgb_models = {}

for target in y.columns:
    grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
    grid_search_xgb.fit(X_train, y_train[target])
    
    print(f"XGBoost - {target} Best Parameters:", grid_search_xgb.best_params_)
    best_xgb_models[target] = grid_search_xgb.best_estimator_
    
    # Train and evaluate the model
    y_pred_xgb = best_xgb_models[target].predict(X_test)
    mse_xgb = mean_squared_error(y_test[target], y_pred_xgb)
    r2_xgb = r2_score(y_test[target], y_pred_xgb)
    
    print(f'XGBoost - {target} MSE: {mse_xgb}')
    print(f'XGBoost - {target} R²: {r2_xgb}')
    
    # Save the model
    joblib.dump(best_xgb_models[target], f'best_xgb_model_{target}.pkl')

# ---------------------------------------------------
# 3. Fusion Model: Weighted Average of Random Forest & XGBoost Predictions
weights_rf = 0.5  # Adjust weight for Random Forest
weights_xgb = 0.5  # Adjust weight for XGBoost

y_pred_fusion = {}
for target in y.columns:
    # Only use Random Forest for G_Score and E_Score
    rf_model = best_rf_models.get(target, None)
    xgb_model = best_xgb_models[target]
    
    # Ensure that the Random Forest model is only included if it exists
    if rf_model:
        y_pred_fusion[target] = weights_rf * rf_model.predict(X_test) + weights_xgb * xgb_model.predict(X_test)
    else:
        y_pred_fusion[target] = weights_xgb * xgb_model.predict(X_test)
    
    # Evaluate the fusion model
    mse_fusion = mean_squared_error(y_test[target], y_pred_fusion[target])
    r2_fusion = r2_score(y_test[target], y_pred_fusion[target])
    
    print(f'Fusion Model - {target} MSE: {mse_fusion}')
    print(f'Fusion Model - {target} R²: {r2_fusion}')

# ---------------------------------------------------
# 4. Voting Regressor: Combine Models Using Voting
# Include only models that are available for the ESG_Score
voting_estimators = []

# Add Random Forest for ESG_Score if it's trained
if 'ESG_Score' in best_rf_models:
    voting_estimators.append(('rf', best_rf_models['ESG_Score']))

# Always add XGBoost for ESG_Score
voting_estimators.append(('xgb', best_xgb_models['ESG_Score']))

# Also add Ridge and Lasso as additional models
voting_estimators.append(('ridge', ridge))
voting_estimators.append(('lasso', lasso))

# Create the Voting Regressor with the selected models
voting_model = VotingRegressor(estimators=voting_estimators)

# Train the Voting Regressor only on ESG_Score for now
voting_model.fit(X_train, y_train['ESG_Score'])  # Train only on ESG_Score for voting model

y_pred_voting = voting_model.predict(X_test)
mse_voting = mean_squared_error(y_test['ESG_Score'], y_pred_voting)
r2_voting = r2_score(y_test['ESG_Score'], y_pred_voting)

print(f'Voting Regressor - ESG_Score MSE: {mse_voting}')
print(f'Voting Regressor - ESG_Score R²: {r2_voting}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Random Forest - G_Score Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest - G_Score MSE: 0.12345463708504927
Random Forest - G_Score R²: 0.49971242704295404
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Random Forest - E_Score Best Parameters: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest - E_Score MSE: 0.48063745192218976
Random Forest - E_Score R²: 0.2153040947254108
Fitting 5 folds for each of 162 candidates, totalling 810 fits
XGBoost - ESG_Score Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.9}
XGBoost - ESG_Score MSE: 0.5777116418163732
XGBoost - ESG_Score R²: 0.1309139614085023
Fitting 5 folds for each of 162 candidates, totalling 810 fits
XGBoost - E_Score Best Parameters: {

: 

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 1. Load Data
# Original financial data
financial_data = pd.DataFrame({
    "CompanyName": ["Johnson & Johnson", "Johnson & Johnson", "Johnson & Johnson", "Boston Scientific Corporation", 
                    "Boston Scientific Corporation", "Boston Scientific Corporation", "UnitedHealth Group Inc.", 
                    "UnitedHealth Group Inc.", "UnitedHealth Group Inc."],
    "Industry": ["Drug Manufacturers - Major", "Drug Manufacturers - Major", "Drug Manufacturers - Major", 
                 "Medical Appliances & Equipment", "Medical Appliances & Equipment", "Medical Appliances & Equipment", 
                 "Health Care Plans", "Health Care Plans", "Health Care Plans"],
    "Sector": ["Healthcare"] * 9,
    "Year": [2022, 2021, 2020, 2022, 2021, 2020, 2022, 2021, 2020],
    "AnnualEarning_Billion": [15.83, 21.35, 22.96, 1.99, 1.14, 1.08, 29.11, 26.34, 22.31],
    "MarketCap_Billion": [461.84, 450.35, 414.3, 84.69, 66.27, 60.53, 495.37, 472.94, 332.73],
    "Revenue": [85.16, 91.15, 93.78, 14.24, 12.68, 11.89, 367.53, 322.13, 285.27],
    "Expenses": [69.33, 69.8, 70.82, 12.25, 11.54, 10.81, 338.42, 295.79, 262.96]
})

# Load ESG data
esg_data = pd.read_csv('D:/DSS5201/esg_data.csv')

# 2. Merge Datasets
combined_data = pd.merge(financial_data, esg_data[['CompanyName', 'Year', 'ESG_Score']], on=['CompanyName', 'Year'], how='inner')

# VaR Calculation
market_cap_returns = combined_data['MarketCap_Billion'].pct_change().dropna()
confidence_level = 0.95
var_95 = np.percentile(market_cap_returns, (1 - confidence_level) * 100)
print("Value at Risk (VaR) at 95% confidence level:", var_95)

# ESG-Adjusted Beta
initial_beta = 1.2
average_esg_score = combined_data['ESG_Score'].mean()
esg_adjusted_beta = initial_beta * (1 - average_esg_score / 100)
print("ESG Adjusted Beta:", esg_adjusted_beta)


Value at Risk (VaR) at 95% confidence level: -0.6208915258450635
ESG Adjusted Beta: 1.0187199999999998
