In [None]:
import pandas as pd
import os 

# Remove every pandas display limit so long/wide frames print in full
# (None = unlimited; expand_frame_repr=False keeps wide frames on one line block).
for _option, _value in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}.items():
    pd.set_option(_option, _value)

In [33]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import re

In [None]:
# Folder holding the data files. The original hard-coded location is kept as
# the default; set the BANA698_DATA_DIR environment variable to point the
# notebook at another machine's copy without editing this cell.
folder_loc = os.environ.get('BANA698_DATA_DIR', 'F:/class/BANA 698/week 8')

# Regular files directly inside folder_loc (sub-directories are skipped).
# Sorted because os.listdir returns entries in arbitrary order.
file_list = sorted(
    entry for entry in os.listdir(folder_loc)
    if os.path.isfile(os.path.join(folder_loc, entry))
)
# The loop variable is deliberately not called `file`, so listing the folder
# cannot clobber the module-level `file` name that holds the dataset filename.
for listed_name in file_list:
    print(listed_name)

In [35]:
# Dataset filename inside folder_loc (one-hot encoded, judging by the name).
file = 'Group1DatasetRaw.onehotted.csv'
# Build the full path first, then load the CSV into the working DataFrame.
dataset_path = os.path.join(folder_loc, file)
df = pd.read_csv(dataset_path)

In [36]:
# Uncomment the next line to print every column name in alphabetical order:
#for col in sorted(df.columns): print(col)

In [37]:
# Column the model predicts.
target = 'Life expectancy at birth, total (years)'

# Columns removed from the feature matrix, grouped by kind. The concatenation
# below keeps the original ordering.
# NOTE(review): presumably these are excluded because they are identifiers or
# direct mortality-based proxies of the target -- confirm with the author.
_life_expectancy_cols = [
    'Life expectancy at birth, female (years)',
    'Life expectancy at birth, male (years)',
    target,
]
_identifier_cols = [
    'CountryShortName',
    'Year',
]
_mortality_rate_cols = [
    'Death rate, crude (per 1,000 people)',
    'Lifetime risk of maternal death (%)',
    'Lifetime risk of maternal death (1 in: rate varies by country)',
    'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
    'Mortality rate, adult, female (per 1,000 female adults)',
    'Mortality rate, adult, male (per 1,000 male adults)',
    'Mortality rate, infant (per 1,000 live births)',
    'Mortality rate, infant, female (per 1,000 live births)',
    'Mortality rate, infant, male (per 1,000 live births)',
    'Mortality rate, neonatal (per 1,000 live births)',
    'Mortality rate, under-5 (per 1,000 live births)',
    'Mortality rate, under-5, female (per 1,000 live births)',
    'Mortality rate, under-5, male (per 1,000 live births)',
]
_death_count_cols = [
    'Number of deaths ages 10-14 years',
    'Number of deaths ages 15-19 years',
    'Number of deaths ages 20-24 years',
    'Number of deaths ages 5-9 years',
    'Number of infant deaths',
    'Number of maternal deaths',
    'Number of neonatal deaths',
    'Number of under-five deaths',
]
_death_probability_cols = [
    'Probability of dying among adolescents ages 10-14 years (per 1,000)',
    'Probability of dying among adolescents ages 15-19 years (per 1,000)',
    'Probability of dying among children ages 5-9 years (per 1,000)',
    'Probability of dying among youth ages 20-24 years (per 1,000)',
]

cols_to_exclude = (
    _life_expectancy_cols
    + _identifier_cols
    + _mortality_rate_cols
    + _death_count_cols
    + _death_probability_cols
)


In [38]:
# Keyword arguments unpacked into BaggingRegressor(**bag_params).
# The set of keys was obtained by running:
#   model = BaggingRegressor()
#   model.get_params()
bag_params = dict(
    bootstrap=True,
    bootstrap_features=False,
    estimator=DecisionTreeRegressor(),
    max_features=1.0,
    max_samples=1.0,
    n_estimators=10,
    n_jobs=-1,
    oob_score=False,
    random_state=1,
    verbose=0,
    warm_start=False,
)

In [39]:
def bagging_features(cols_to_exclude = cols_to_exclude, target = target, df = df, kfold_random_state = 1, n_neighbors = 5):
    """Cross-validate a bagged regressor and print ranked feature importances.

    Runs shuffled 5-fold cross-validation. In every fold, missing feature
    values are filled by a KNNImputer fitted on the training rows only, a
    BaggingRegressor built from the module-level ``bag_params`` is fitted, and
    MAE / MSE / R² are computed on the validation rows. Each fold's feature
    importance is the mean of ``feature_importances_`` over the ensemble's
    trees; the fold values are then averaged and printed as a ranked table.

    Note: the defaults for ``cols_to_exclude``, ``target`` and ``df`` are bound
    when this ``def`` statement executes, so re-run this cell after changing
    any of those module-level objects.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns dropped from ``df`` to form the feature matrix.
    target : str
        Column to predict. It is always removed from the feature matrix, even
        if ``cols_to_exclude`` does not list it, so it cannot leak into X.
    df : pandas.DataFrame
        Source data containing ``target`` and every column in ``cols_to_exclude``.
    kfold_random_state : int
        Seed for the shuffled KFold split.
    n_neighbors : int
        Number of neighbours used by the KNNImputer.

    Returns
    -------
    None
        All results are printed.
    """
    X = df.drop(columns = cols_to_exclude)
    # Guard against target leakage when a caller passes an exclusion list that
    # omits the target column.
    if target in X.columns:
        X = X.drop(columns = target)
    y = df[target]

    # keep_empty_features=True: a feature that is entirely missing in one
    # training fold stays in the output (filled with 0), so the imputed array
    # keeps the same columns as X and the DataFrame rebuild below cannot fail
    # on a column-count mismatch.
    imputer = KNNImputer(n_neighbors = n_neighbors, keep_empty_features = True)
    model = BaggingRegressor(**bag_params)

    cv = KFold(n_splits=5, shuffle=True, random_state = kfold_random_state)

    mae_scores = []
    mse_scores = []
    r2_scores = []
    feature_importance_list = []

    # cv.split yields positional row indices for the training and validation
    # parts of each fold, hence the .iloc lookups.
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the imputer on training rows only; validation rows are just
        # transformed, so no validation information reaches the imputer.
        X_train_imputed = imputer.fit_transform(X_train)
        X_val_imputed = imputer.transform(X_val)

        # Restore column labels so fit and predict see the same feature names.
        X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
        X_val_imputed = pd.DataFrame(X_val_imputed, columns=X_val.columns)

        model.fit(X_train_imputed, y_train)
        y_pred = model.predict(X_val_imputed)

        mae_scores.append(mean_absolute_error(y_val, y_pred))
        mse_scores.append(mean_squared_error(y_val, y_pred))
        r2_scores.append(r2_score(y_val, y_pred))

        # BaggingRegressor exposes no feature_importances_ itself, so average
        # the fitted trees' importances for this fold.
        feature_importance_fold = np.mean([tree.feature_importances_ for tree in model.estimators_], axis=0)
        feature_importance_list.append(feature_importance_fold)

    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)
    avg_r2 = np.mean(r2_scores)

    print(f'Average MAE: {avg_mae:.4f}')
    print(f'Average MSE: {avg_mse:.4f}')
    print(f'Average R²: {avg_r2:.4f}')

    # Average the per-fold importances, then rank features from most to least
    # important (Rank 1 = highest importance).
    avg_feature_importance = np.mean(feature_importance_list, axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': avg_feature_importance
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    feature_importance_df['Rank'] = feature_importance_df.index + 1
    feature_importance_df = feature_importance_df[['Feature', 'Importance', 'Rank']]

    print('\nSorted Feature Importance:')
    print(feature_importance_df.to_string(index=False))

In [40]:
# Run the cross-validation with every default argument; the metrics and the
# ranked feature-importance table are printed by the function.
bagging_features()

Average MAE: 0.6619
Average MSE: 1.1397
Average R²: 0.9836

Sorted Feature Importance:
                                                                                                              Feature   Importance  Rank
                                                                              Access to electricity (% of population) 5.354115e-01     1
                                                    People using at least basic sanitation services (% of population) 9.459721e-02     2
                                                 Current health expenditure per capita, PPP (current international $) 9.095075e-02     3
                                                                                            Region_Sub-Saharan Africa 5.485148e-02     4
                                                                  Current health expenditure per capita (current US$) 4.219162e-02     5
                                                                       Incidence of tubercu