In [None]:
import pandas as pd
import os 

# Show DataFrames in full: lift the limits on rows, columns, cell width and total width.
for _display_option in ('max_rows', 'max_columns', 'max_colwidth', 'width'):
    pd.set_option(f'display.{_display_option}', None)
# Keep wide frames on one block instead of wrapping them across several chunks.
pd.set_option('display.expand_frame_repr', False)

In [24]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor
import re

In [None]:
# Folder that holds the data files (absolute, machine-specific path).
folder_loc = 'F:/class/BANA 698/week 8'

# Keep only regular files (sub-directories are skipped), then print one name per line.
file_list = list(filter(
    lambda name: os.path.isfile(os.path.join(folder_loc, name)),
    os.listdir(folder_loc),
))
for file in file_list:
    print(file)

In [26]:
# Read the dataset CSV from the data folder into the working DataFrame.
file = 'Group1DatasetRaw.onehotted.csv'
csv_path = os.path.join(folder_loc, file)
df = pd.read_csv(csv_path)

In [27]:
#optional check: uncomment to list the raw column names before they are cleaned up below
#for col in sorted(df.columns): print(col)

In [28]:
#remove special chars and spaces from feature names bc lightgbm can't handle them
#order matters: symbols are first spelled out as words, then spaces become underscores,
#and finally any punctuation that is still left is dropped
_literal_replacements = (
    ('-', ' to '),
    ('=', 'equals'),
    ('/', ' divided by '),
    ('%', 'percent'),
    ('$', 'dollar'),
    ('&', 'and'),
    (' ', '_'),
)
_clean_columns = df.columns
for _old, _new in _literal_replacements:
    #regex=False on every literal swap: the default of str.replace differs between
    #pandas versions, and '$' would otherwise be read as a regex anchor
    _clean_columns = _clean_columns.str.replace(_old, _new, regex=False)
_clean_columns = _clean_columns.str.replace(r'[^\w\s]', '', regex=True).str.strip()
#assign once at the end so df is never left with half-cleaned names
df.columns = _clean_columns

In [29]:
#optional check: uncomment to list the column names after the clean-up above
#for col in sorted(df.columns): print(col)

In [30]:
# Column the model is trained to predict.
target = 'Life_expectancy_at_birth_total_years'

# Columns that are dropped from the feature matrix before modelling.
# NOTE(review): apart from the identifiers these are all life-expectancy / death /
# mortality measures, presumably excluded to avoid leaking the target - confirm.
cols_to_exclude = (
    # life-expectancy measures (includes the target itself)
    [
        'Life_expectancy_at_birth_female_years',
        'Life_expectancy_at_birth_male_years',
        'Life_expectancy_at_birth_total_years',
    ]
    # row identifiers
    + [
        'CountryShortName',
        'Year',
    ]
    # crude death rate and maternal-death risk
    + [
        'Death_rate_crude_per_1000_people',
        'Lifetime_risk_of_maternal_death_1_in_rate_varies_by_country',
        'Lifetime_risk_of_maternal_death_percent',
    ]
    # mortality rates
    + [
        'Mortality_rate_adult_female_per_1000_female_adults',
        'Mortality_rate_adult_male_per_1000_male_adults',
        'Mortality_rate_infant_female_per_1000_live_births',
        'Mortality_rate_infant_male_per_1000_live_births',
        'Mortality_rate_infant_per_1000_live_births',
        'Mortality_rate_neonatal_per_1000_live_births',
        'Mortality_rate_under_to_5_female_per_1000_live_births',
        'Mortality_rate_under_to_5_male_per_1000_live_births',
        'Mortality_rate_under_to_5_per_1000_live_births',
    ]
    # death counts
    + [
        'Number_of_deaths_ages_10_to_14_years',
        'Number_of_deaths_ages_15_to_19_years',
        'Number_of_deaths_ages_20_to_24_years',
        'Number_of_deaths_ages_5_to_9_years',
        'Number_of_infant_deaths',
        'Number_of_maternal_deaths',
        'Number_of_neonatal_deaths',
        'Number_of_under_to_five_deaths',
    ]
    # maternal mortality ratio and age-specific probabilities of dying
    + [
        'Maternal_mortality_ratio_modeled_estimate_per_100000_live_births',
        'Probability_of_dying_among_adolescents_ages_10_to_14_years_per_1000',
        'Probability_of_dying_among_adolescents_ages_15_to_19_years_per_1000',
        'Probability_of_dying_among_children_ages_5_to_9_years_per_1000',
        'Probability_of_dying_among_youth_ages_20_to_24_years_per_1000',
    ]
)

In [31]:
# Keyword arguments handed to LGBMRegressor, spelled out in full so every setting is visible.
lgbm_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=1,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
    verbose=0,
)
#the starting values above were taken from the model defaults, obtained by running:
#model = LGBMRegressor()
#model.get_params()

In [32]:
def lgbm_features(cols_to_exclude = cols_to_exclude, target = target, df = df, kfold_random_state = 1, n_neighbors = 5):
    """Cross-validate a LightGBM regressor and print metrics and ranked feature importances.

    For each of 5 shuffled folds, missing feature values are KNN-imputed (the
    imputer is fitted on the training rows only), an ``LGBMRegressor`` built
    from the module-level ``lgbm_params`` is fitted, and MAE / MSE / R² are
    computed on the held-out rows. The fold metrics and the fold feature
    importances are averaged and printed. Nothing is returned.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns of ``df`` that must not be used as features.
    target : str
        Name of the column to predict. It is always removed from the feature
        matrix, even when it is not listed in ``cols_to_exclude``.
    df : pandas.DataFrame
        Source data. The default is bound when this definition is executed,
        so re-run the definition after reloading ``df``.
    kfold_random_state : int
        Seed for the shuffling of the K-fold split.
    n_neighbors : int
        Number of neighbours used by the KNN imputer.

    Raises
    ------
    KeyError
        If a column in ``cols_to_exclude`` or ``target`` is not in ``df``.
    """
    # Always keep the target out of the features: a caller passing a custom
    # target or exclusion list would otherwise leak the answer into X.
    X = df.drop(columns = cols_to_exclude).drop(columns = [target], errors = 'ignore')
    y = df[target]

    imputer = KNNImputer(n_neighbors = n_neighbors)
    model = LGBMRegressor(**lgbm_params)

    cv = KFold(n_splits=5, shuffle=True, random_state = kfold_random_state)

    mae_scores = []
    mse_scores = []
    r2_scores = []
    feature_importance_list = []

    for train_idx, val_idx in cv.split(X):
        # train_idx / val_idx are integer row positions of the training and
        # validation rows of the current fold.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the imputer on the training rows only so that no information
        # from the validation rows leaks into the imputed values.
        train_values = imputer.fit_transform(X_train)
        val_values = imputer.transform(X_val)

        # KNNImputer drops features that are entirely missing in the rows it
        # was fitted on, so its output can be narrower than X_train. Label the
        # output with the columns that actually survived.
        if train_values.shape[1] == X_train.shape[1]:
            fold_columns = X_train.columns
        else:
            fold_columns = X_train.columns[X_train.notna().any(axis=0).to_numpy()]

        X_train_imputed = pd.DataFrame(train_values, columns=fold_columns)
        X_val_imputed = pd.DataFrame(val_values, columns=fold_columns)

        model.fit(X_train_imputed, y_train)
        y_pred = model.predict(X_val_imputed)

        mae_scores.append(mean_absolute_error(y_val, y_pred))
        mse_scores.append(mean_squared_error(y_val, y_pred))
        r2_scores.append(r2_score(y_val, y_pred))

        fold_importance = model.feature_importances_
        if len(fold_columns) != X.shape[1]:
            # Re-align to the full feature list so folds can be averaged;
            # a feature that was dropped in this fold counts as importance 0.
            fold_importance = (
                pd.Series(fold_importance, index=fold_columns)
                .reindex(X.columns, fill_value=0)
                .to_numpy()
            )
        feature_importance_list.append(fold_importance)

    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)
    avg_r2 = np.mean(r2_scores)

    print(f'Average MAE: {avg_mae:.4f}')
    print(f'Average MSE: {avg_mse:.4f}')
    print(f'Average R²: {avg_r2:.4f}')

    avg_feature_importance = np.mean(feature_importance_list, axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': avg_feature_importance
    })
    # Rank 1 is the feature with the highest average importance.
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    feature_importance_df['Rank'] = feature_importance_df.index + 1
    feature_importance_df = feature_importance_df[['Feature', 'Importance', 'Rank']]

    print('\nSorted Feature Importance:')
    print(feature_importance_df.to_string(index=False))

In [33]:
#run the 5-fold cross-validation with the default target, exclusion list and seeds;
#prints the averaged metrics followed by the ranked feature importances
lgbm_features()

Average MAE: 0.5289
Average MSE: 0.7219
Average R²: 0.9896

Sorted Feature Importance:
                                                                                                              Feature  Importance  Rank
                                                                          Incidence_of_tuberculosis_per_100000_people        56.0     1
                                               Insurance_and_financial_services_percent_of_commercial_service_imports        46.6     2
                                                                 Prevalence_of_undernourishment_percent_of_population        45.4     3
                                                                              Import_unit_value_index_2015_equals_100        45.0     4
                                                                                     Birth_rate_crude_per_1000_people        44.4     5
                                            Energy_intensity_level_of_primary_energy_MJ_divided_b