In [1]:
#this version uses groups of features instead of all features
import pandas as pd
import os

# Lift every pandas display limit so printed frames are never truncated or wrapped.
display_overrides = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}
for option_name, option_value in display_overrides.items():
    pd.set_option(option_name, option_value)

In [2]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor
import re

In [3]:
# Folder holding the input CSVs. The original hard-coded location stays the default,
# but it can be overridden without editing the notebook by setting the
# BANA698_DATA_DIR environment variable before starting the kernel.
folder_loc = os.environ.get('BANA698_DATA_DIR', 'F:/class/BANA 698/week 8')

# Plain files (no sub-folders) found directly in folder_loc.
# Sorted because os.listdir returns entries in arbitrary order.
file_list = sorted(f for f in os.listdir(folder_loc) if os.path.isfile(os.path.join(folder_loc, f)))
#for file in file_list: print(file)

In [4]:
# Read the modelling dataset (the file name suggests it is already one-hot encoded).
file = 'Group1DatasetRaw.onehotted.csv'
dataset_path = os.path.join(folder_loc, file)
df = pd.read_csv(dataset_path)

In [5]:
#for col in sorted(df.columns): print(col)

In [6]:
# Column the models are trained to predict.
target = 'Life expectancy at birth, total (years)'

# Columns that must never be offered to the model as features.
cols_to_exclude = [
    # the target itself and its directly related variants
    target,
    'Life expectancy at birth, female (years)',
    'Life expectancy at birth, male (years)',
    'CountryShortName',
    'Year',
    # perfect correlate determined by correlation_to_all_lister.ipynb
    # (its counterpart is 'Urban population (% of total population)')
    'Rural population (% of total population)',
]


In [7]:
#turn a raw indicator name into one without spaces or special chars, bc lightgbm can't handle them
def clean_col_name(col_name):
    """Return ``col_name`` rewritten with only word characters.

    A few symbols are spelled out ('-' -> ' to ', '=' -> 'equals',
    '/' -> ' divided by ', '%' -> 'percent', '$' -> 'dollars', '&' -> 'and'),
    every space then becomes '_', any remaining character that is neither a
    word character nor whitespace is deleted, and leading/trailing whitespace
    is stripped.
    """
    # Applied strictly in this order: the space -> '_' rule must come last so it
    # also converts the spaces introduced by ' to ' and ' divided by '.
    substitutions = (
        ('-', ' to '),
        ('=', 'equals'),
        ('/', ' divided by '),
        ('%', 'percent'),
        ('$', 'dollars'),
        ('&', 'and'),
        (' ', '_'),
    )
    cleaned = col_name
    for symbol, spelled_out in substitutions:
        cleaned = cleaned.replace(symbol, spelled_out)
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    return cleaned.strip()

In [8]:
# Pass the frame's column labels, the exclusion list and the target name through the
# same cleaner so all three keep referring to the same columns.
df.columns = df.columns.map(clean_col_name)
cols_to_exclude = list(map(clean_col_name, cols_to_exclude))
target = clean_col_name(target)

In [9]:
#print(target)
#for col in cols_to_exclude: print(col)
#for col in sorted(df.columns): print(col)

In [10]:
#reference copy of the default hyperparameters (don't edit)
lgbm_defaults = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=None,
    num_leaves=31,
    objective=None,
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)
#from:
#model = LGBMRegressor()
#model.get_params()

In [11]:
#hyperparameters actually used for the runs in this notebook
#(differs from the reference defaults in n_jobs, random_state and the extra verbose key)
lgbm_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=1,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
    verbose=0,
)

In [12]:
def lgbm_features(cols_to_exclude = cols_to_exclude, target = target, df = df, kfold_random_state = 1, n_neighbors = 5,
                  params = None, return_results = False):
    """Cross-validate a LightGBM regressor and report averaged metrics and feature importances.

    For each of 5 shuffled KFold splits, missing values are filled by a
    KNNImputer fitted on the training rows only, an LGBMRegressor is fitted on
    the imputed training rows, and MAE, MSE and R² are computed (R² on both the
    training and validation rows). The fold averages and a ranked table of the
    fold-averaged ``model.feature_importances_`` are printed.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns of ``df`` that must not be used as features.
    target : str
        Column of ``df`` to predict. It is always removed from the features,
        even when ``cols_to_exclude`` does not list it.
    df : pandas.DataFrame
        Data holding the target and candidate feature columns.
    kfold_random_state : int
        Seed for the KFold shuffle.
    n_neighbors : int
        Number of neighbours used by the KNNImputer.
    params : dict, optional
        Keyword arguments for LGBMRegressor. When None, the module-level
        ``lgbm_params`` is used (looked up at call time).
    return_results : bool
        When True, return the averaged metrics and the importance table in
        addition to printing them. Defaults to False so existing notebook
        cells keep returning None.

    Returns
    -------
    None, or when ``return_results`` is True a dict with keys ``'metrics'``
    (dict with ``'mae'``, ``'mse'``, ``'r2_train'``, ``'r2'``) and
    ``'feature_importance'`` (DataFrame with columns Feature, Importance, Rank).

    Raises
    ------
    KeyError
        If ``target`` or any name in ``cols_to_exclude`` is not a column of ``df``.
    ValueError
        If no feature columns remain after the exclusions.

    Notes
    -----
    The defaults for ``cols_to_exclude``, ``target`` and ``df`` are the
    module-level objects as they existed when this ``def`` was executed;
    re-run the defining cell after rebinding any of them.
    """
    if params is None:
        params = lgbm_params

    # Guard against target leakage: the target is dropped from X even if the
    # caller's exclusion list forgot it. dict.fromkeys de-duplicates in order.
    drop_cols = list(dict.fromkeys([*cols_to_exclude, target]))
    missing_cols = [col for col in drop_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f'Columns not found in df: {missing_cols}')

    X = df.drop(columns = drop_cols)
    y = df[target]
    if X.shape[1] == 0:
        raise ValueError('No feature columns remain after applying cols_to_exclude.')

    # NOTE(review): the upstream exclusion list contains CountryShortName and Year, so rows
    # are presumably country-year observations. A shuffled KFold can then put the same
    # country in both train and validation folds — consider a grouped split; TODO confirm.
    cv = KFold(n_splits=5, shuffle=True, random_state = kfold_random_state)

    mae_scores = []
    mse_scores = []
    r2_scores = []
    r2_train_scores = []
    feature_importance_list = []

    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the imputer on the training fold only so validation rows do not
        # influence the imputed training values.
        # NOTE(review): features are not scaled before KNN imputation, so large-magnitude
        # columns likely dominate the neighbour distances — consider scaling first.
        imputer = KNNImputer(n_neighbors = n_neighbors)
        X_train_imputed = imputer.fit_transform(X_train)
        X_val_imputed = imputer.transform(X_val)
        # Re-wrap the imputer output with the original column labels so the model
        # is fitted and queried with named features.
        X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
        X_val_imputed = pd.DataFrame(X_val_imputed, columns=X_val.columns)

        model = LGBMRegressor(**params)
        model.fit(X_train_imputed, y_train)
        y_train_pred = model.predict(X_train_imputed)
        y_pred = model.predict(X_val_imputed)

        mae_scores.append(mean_absolute_error(y_val, y_pred))
        mse_scores.append(mean_squared_error(y_val, y_pred))
        r2_train_scores.append(r2_score(y_train, y_train_pred))
        r2_scores.append(r2_score(y_val, y_pred))

        feature_importance_list.append(model.feature_importances_)

    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)
    avg_r2_train = np.mean(r2_train_scores)
    avg_r2 = np.mean(r2_scores)

    print(f'Average MAE: {avg_mae:.4f}')
    print(f'Average MSE: {avg_mse:.4f}')
    print(f'Average Training R²: {avg_r2_train:.4f}')
    print(f'Average Validation R²: {avg_r2:.4f}')

    # Average each feature's importance over the folds, then rank (1 = most important).
    avg_feature_importance = np.mean(feature_importance_list, axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': avg_feature_importance
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    feature_importance_df['Rank'] = feature_importance_df.index + 1
    feature_importance_df = feature_importance_df[['Feature', 'Importance', 'Rank']]

    print('\nSorted Feature Importance:')
    print(feature_importance_df.to_string(index=False))

    if return_results:
        return {
            'metrics': {
                'mae': avg_mae,
                'mse': avg_mse,
                'r2_train': avg_r2_train,
                'r2': avg_r2,
            },
            'feature_importance': feature_importance_df,
        }
    return None

In [13]:
#first run (baseline): called with the function's defaults, i.e. all features
#except the columns in the default cols_to_exclude list
lgbm_features()

Average MAE: 0.2799
Average MSE: 0.1968
Average Training R²: 0.9997
Average Validation R²: 0.9972

Sorted Feature Importance:
                                                                                                              Feature  Importance  Rank
                                                                   Mortality_rate_adult_female_per_1000_female_adults       202.8     1
                                                                                     Death_rate_crude_per_1000_people       196.4     2
                                                                       Mortality_rate_adult_male_per_1000_male_adults       153.6     3
                                                                  Mortality_rate_under_to_5_male_per_1000_live_births        34.0     4
                                                         Population_ages_15_to_19_female_percent_of_female_population        31.6     5
                                                          

In [14]:
#run only 1 subgroup of features at a time:
#load the field mapping and show which indicator categories it contains
file = 'Field Mapping List.csv'
mapping_path = os.path.join(folder_loc, file)
df2 = pd.read_csv(mapping_path)
indicator_categories = df2['Indicator Category'].unique()
print(indicator_categories)

['Environmental' 'Health' 'Socioeconomic']


In [15]:
# Clean the mapping's field names the same way as the dataset's column labels.
df2['Field Name'] = df2['Field Name'].map(clean_col_name)

In [16]:
def _exclusions_keeping_only(category):
    """Return the sorted, de-duplicated column names to drop so only ``category`` fields remain.

    Combines every 'Field Name' in df2 whose 'Indicator Category' differs from
    ``category`` with the global cols_to_exclude. The result is sorted so the
    printed lists are identical on every run (a bare set has no stable order
    across kernel sessions).
    """
    other_category_fields = df2.loc[df2['Indicator Category'] != category, 'Field Name'].tolist()
    return sorted(set(other_category_fields + cols_to_exclude))


feats_all = df.columns.tolist()
cols_to_exclude_except_enviro = _exclusions_keeping_only('Environmental')
cols_to_exclude_except_health = _exclusions_keeping_only('Health')
cols_to_exclude_except_socio = _exclusions_keeping_only('Socioeconomic')

# Counts are total columns minus excluded names, so they assume every excluded
# name is an actual column of df.
print(f"{len(feats_all)} total features")
print(f"{len(feats_all) - len(cols_to_exclude_except_enviro)} environmental features")
print(f"{len(feats_all) - len(cols_to_exclude_except_health)} health features")
print(f"{len(feats_all) - len(cols_to_exclude_except_socio)} socioeconomic features")

print(cols_to_exclude_except_enviro)
print(cols_to_exclude_except_health)
print(cols_to_exclude_except_socio)

286 total features
80 environmental features
73 health features
127 socioeconomic features
['Number_of_deaths_ages_20_to_24_years', 'Wage_and_salaried_workers_total_percent_of_total_employment_modeled_ILO_estimate', 'Employment_to_population_ratio_15_female_percent_modeled_ILO_estimate', 'Final_consumption_expenditure_annual_percent_growth', 'Population_ages_15_to_19_female_percent_of_female_population', 'Region_South_Asia', 'Out_to_of_to_pocket_expenditure_per_capita_PPP_current_international_dollars', 'Communications_computer_etc_percent_of_service_exports_BoP', 'Vulnerable_employment_male_percent_of_male_employment_modeled_ILO_estimate', 'Probability_of_dying_among_adolescents_ages_10_to_14_years_per_1000', 'Total_alcohol_consumption_per_capita_male_liters_of_pure_alcohol_projected_estimates_male_15_years_of_age', 'Mortality_rate_infant_per_1000_live_births', 'International_tourism_receipts_for_travel_items_current_USdollars', 'Domestic_private_health_expenditure_per_capita_PPP_curr

## Environmental Features

In [17]:
#environmental features only: every other category is passed as an exclusion
lgbm_features(cols_to_exclude = cols_to_exclude_except_enviro)

Average MAE: 0.6578
Average MSE: 1.1022
Average Training R²: 0.9977
Average Validation R²: 0.9840

Sorted Feature Importance:
                                                                                            Feature  Importance  Rank
                                                 Livestock_production_index_2014_to_2016_equals_100       102.6     1
                                                        Access_to_electricity_percent_of_population        78.8     2
                                                      Crop_production_index_2014_to_2016_equals_100        78.0     3
                                            Access_to_electricity_urban_percent_of_urban_population        77.2     4
                                                                    Arable_land_hectares_per_person        76.6     5
                         Energy_intensity_level_of_primary_energy_MJ_divided_by_dollars2017_PPP_GDP        74.8     6
                                                

In [18]:
#environmental run again, additionally dropping the high correlation environmental
#features listed by correlation_to_all_within_group_lister.ipynb
addl_cols_to_exclude = list(map(clean_col_name, [
    'Total greenhouse gas emissions including LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)',
    'Cereal production (metric tons)',
    'Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e)',
    'Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Power Industry (Energy) (Mt CO2e)',
    'Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Land under cereal production (hectares)',
    'Methane (CH4) emissions from Waste (Mt CO2e)',
    'Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Agriculture (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Industrial Processes (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e)',
    'Aquaculture production (metric tons)',
    'Total fisheries production (metric tons)',
    'Nitrous oxide (N2O) emissions from Power Industry (Energy) (Mt CO2e)',
    'Forest area (sq. km)',
    'Nitrous oxide (N2O) emissions from Waste (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Building (Energy) (Mt CO2e)',
    'Access to clean fuels and technologies for cooking (% of population)',
    'Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Transport (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Transport (Energy) (Mt CO2e)',
    'Land area (sq. km)',
    'Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)',
    'Nitrous oxide (N2O) emissions from Industrial Processes (Mt CO2e)',
    'Methane (CH4) emissions from Building (Energy) (Mt CO2e)',
    'Access to electricity (% of population)',
    'Rural population (% of total population)',
    'Total greenhouse gas emissions excluding LULUCF (Mt CO2e)',
    'Access to electricity, rural (% of rural population)',
]))
# Union of the category exclusions and the extra names; the set removes duplicates.
new_cols_to_exclude_except_enviro = list(set([*cols_to_exclude_except_enviro, *addl_cols_to_exclude]))
lgbm_features(cols_to_exclude = new_cols_to_exclude_except_enviro)

Average MAE: 0.6805
Average MSE: 1.1771
Average Training R²: 0.9973
Average Validation R²: 0.9831

Sorted Feature Importance:
                                                                                            Feature  Importance  Rank
                                            Access_to_electricity_urban_percent_of_urban_population       135.4     1
                                                 Livestock_production_index_2014_to_2016_equals_100       118.8     2
Water_productivity_total_constant_2015_USdollars_GDP_per_cubic_meter_of_total_freshwater_withdrawal       115.0     3
                                                                    Arable_land_hectares_per_person        96.8     4
                                 PM25_air_pollution_mean_annual_exposure_micrograms_per_cubic_meter        89.4     5
                                                      Crop_production_index_2014_to_2016_equals_100        89.2     6
                                                

## Health Features

In [19]:
#health features only: every other category is passed as an exclusion
lgbm_features(cols_to_exclude = cols_to_exclude_except_health)

Average MAE: 0.3041
Average MSE: 0.2230
Average Training R²: 0.9994
Average Validation R²: 0.9968

Sorted Feature Importance:
                                                                                                      Feature  Importance  Rank
                                                                             Death_rate_crude_per_1000_people       252.8     1
                                                           Mortality_rate_adult_female_per_1000_female_adults       250.2     2
                                                               Mortality_rate_adult_male_per_1000_male_adults       217.4     3
                                                                  Incidence_of_tuberculosis_per_100000_people        99.4     4
                           Prevalence_of_stunting_height_for_age_modeled_estimate_percent_of_children_under_5        70.2     5
                                                         Population_ages_15_to_64_percent_of_total_populat

In [20]:
#health run again, additionally dropping the high correlation health
#features listed by correlation_to_all_within_group_lister.ipynb
addl_cols_to_exclude = list(map(clean_col_name, [
    'Life expectancy at birth, female (years)',
    'Number of deaths ages 15-19 years',
    'Mortality rate, infant (per 1,000 live births)',
    'Life expectancy at birth, total (years)',
    'Number of deaths ages 10-14 years',
    'Mortality rate, infant, female (per 1,000 live births)',
    'Number of deaths ages 20-24 years',
    'Prevalence of anemia among children (% of children ages 6-59 months)',
    'Mortality rate, infant, male (per 1,000 live births)',
    'Population ages 0-14 (% of total population)',
    'Mortality rate, under-5 (per 1,000 live births)',
    'Birth rate, crude (per 1,000 people)',
    'Population ages 0-14, female (% of female population)',
    'Mortality rate, neonatal (per 1,000 live births)',
    'Mortality rate, under-5, female (per 1,000 live births)',
    'Life expectancy at birth, male (years)',
    'Number of deaths ages 5-9 years',
    'Current health expenditure per capita (current US$)',
    'Out-of-pocket expenditure per capita (current US$)',
    'Population ages 0-14, male (% of male population)',
    'Population ages 10-14, female (% of female population)',
    'Population ages 0-14, female',
    'Population ages 10-14, male (% of male population)',
    'Prevalence of anemia among non-pregnant women (% of women ages 15-49)',
    'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
    'People using at least basic sanitation services, rural (% of rural population)',
    'Number of infant deaths',
    'Current health expenditure per capita, PPP (current international $)',
    'Mortality rate, under-5, male (per 1,000 live births)',
    'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
    'Out-of-pocket expenditure (% of current health expenditure)',
    'Population ages 15-19, female (% of female population)',
    'Probability of dying among adolescents ages 15-19 years (per 1,000)',
    'Probability of dying among adolescents ages 10-14 years (per 1,000)',
    'Prevalence of anemia among pregnant women (%)',
    'Domestic general government health expenditure per capita (current US$)',
    'Domestic private health expenditure per capita (current US$)',
    'Lifetime risk of maternal death (%)',
    'Immunization, DPT (% of children ages 12-23 months)',
    'People using at least basic drinking water services, urban (% of urban population)',
    'Population ages 0-14, male',
    'Mortality rate, adult, female (per 1,000 female adults)',
    'Number of maternal deaths',
    'People using at least basic sanitation services, urban (% of urban population)',
    'Number of neonatal deaths',
    'Total alcohol consumption per capita, female (liters of pure alcohol, projected estimates, female 15+ years of age)',
]))
# Union of the category exclusions and the extra names; the set removes duplicates.
new_cols_to_exclude_except_health = list(set([*cols_to_exclude_except_health, *addl_cols_to_exclude]))
lgbm_features(cols_to_exclude = new_cols_to_exclude_except_health)

Average MAE: 0.3599
Average MSE: 0.3071
Average Training R²: 0.9992
Average Validation R²: 0.9956

Sorted Feature Importance:
                                                                                                  Feature  Importance  Rank
                                                           Mortality_rate_adult_male_per_1000_male_adults       334.2     1
                                                                         Death_rate_crude_per_1000_people       301.4     2
                                           Probability_of_dying_among_children_ages_5_to_9_years_per_1000       142.8     3
                                                     Population_ages_15_to_64_percent_of_total_population       133.2     4
                                                              Incidence_of_tuberculosis_per_100000_people       120.2     5
                                                 Population_ages_15_to_19_male_percent_of_male_population       118.0     6
      

## Socioeconomic Features

In [21]:
#socioeconomic features only: every other category is passed as an exclusion
lgbm_features(cols_to_exclude = cols_to_exclude_except_socio)

Average MAE: 0.6966
Average MSE: 1.1699
Average Training R²: 0.9981
Average Validation R²: 0.9832

Sorted Feature Importance:
                                                                                                              Feature  Importance  Rank
                                                                                    Pillar_1__to__Data_Use__to__Score        62.8     1
                                                              Official_exchange_rate_LCU_per_USdollars_period_average        62.6     2
                                                                              Import_unit_value_index_2015_equals_100        62.4     3
                                                                 Refugee_population_by_country_or_territory_of_origin        59.0     4
                                                                     Adjusted_savings_energy_depletion_percent_of_GNI        58.4     5
                                                          

In [22]:
#socioeconomic run again, additionally dropping the high correlation socioeconomic
#features listed by correlation_to_all_within_group_lister.ipynb
addl_cols_to_exclude = list(map(clean_col_name, [
    'General government final consumption expenditure (constant 2015 US$)',
    'Self-employed, female (% of female employment) (modeled ILO estimate)',
    'Self-employed, male (% of male employment) (modeled ILO estimate)',
    'Adjusted net national income (current US$)',
    'Final consumption expenditure (constant 2015 US$)',
    'Self-employed, total (% of total employment) (modeled ILO estimate)',
    'GDP, PPP (constant 2021 international $)',
    'GDP per capita, PPP (constant 2021 international $)',
    'GNI, PPP (current international $)',
    'Vulnerable employment, female (% of female employment) (modeled ILO estimate)',
    'Adjusted net national income per capita (current US$)',
    'GDP, PPP (current international $)',
    'Vulnerable employment, male (% of male employment) (modeled ILO estimate)',
    'Exports of goods, services and primary income (BoP, current US$)',
    'GDP per capita (constant 2015 US$)',
    'Unemployment, male (% of male labor force) (modeled ILO estimate)',
    'Unemployment, female (% of female labor force) (modeled ILO estimate)',
    'Vulnerable employment, total (% of total employment) (modeled ILO estimate)',
    'GDP (constant 2015 US$)',
    'GNI, Atlas method (current US$)',
    'Employment to population ratio, 15+, female (%) (modeled ILO estimate)',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)',
    'Domestic credit to private sector (% of GDP)',
    'GNI per capita, PPP (current international $)',
    'GDP per capita, PPP (current international $)',
    'Employment to population ratio, ages 15-24, male (%) (modeled ILO estimate)',
    'Employment to population ratio, ages 15-24, female (%) (modeled ILO estimate)',
    'Wage and salaried workers, female (% of female employment) (modeled ILO estimate)',
    'School enrollment, primary, female (% gross)',
    'Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)',
    'Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)',
    'School enrollment, primary, male (% gross)',
    'Share of youth not in education, employment or training, female (% of female youth population) (modeled ILO estimate)',
    'Communications, computer, etc. (% of service exports, BoP)',
    'Communications, computer, etc. (% of service imports, BoP)',
    'Labor force participation rate for ages 15-24, female (%) (modeled ILO estimate)',
    'Primary education, pupils',
    'Monetary Sector credit to private sector (% GDP)',
    'Labor force, female (% of total labor force)',
    'Labor force participation rate, female (% of female population ages 15-64) (modeled ILO estimate)',
    'Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)',
    'Labor force participation rate for ages 15-24, male (%) (modeled ILO estimate)',
    'Inflation, GDP deflator (annual %)',
    'GNI per capita, Atlas method (current US$)',
    'GNI (current US$)',
    'GDP growth (annual %)',
    'Employment to population ratio, ages 15-24, total (%) (modeled ILO estimate)',
    'Employment to population ratio, 15+, total (%) (modeled ILO estimate)',
    'Employment to population ratio, 15+, male (%) (modeled ILO estimate)',
    'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)',
]))
# Union of the category exclusions and the extra names; the set removes duplicates.
new_cols_to_exclude_except_socio = list(set([*cols_to_exclude_except_socio, *addl_cols_to_exclude]))
lgbm_features(cols_to_exclude = new_cols_to_exclude_except_socio)

Average MAE: 0.7531
Average MSE: 1.3487
Average Training R²: 0.9978
Average Validation R²: 0.9806

Sorted Feature Importance:
                                                                                                          Feature  Importance  Rank
                                 Wage_and_salaried_workers_total_percent_of_total_employment_modeled_ILO_estimate        83.4     1
                                                          Official_exchange_rate_LCU_per_USdollars_period_average        77.6     2
                                                                                 GDP_per_capita_current_USdollars        76.4     3
                                                             Refugee_population_by_country_or_territory_of_origin        76.4     4
                                                        Domestic_credit_to_private_sector_by_banks_percent_of_GDP        76.0     5
                                                                 Adjusted_savings_