In [1]:
import pandas as pd
import os

# Lift pandas' display limits (rows, columns, cell width, overall width) and
# stop wide frames from being wrapped onto several blocks of lines.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = None
pd.options.display.expand_frame_repr = False

In [2]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor
import re

In [3]:
# Folder the notebook reads its data from.
folder_loc = 'F:/class/BANA 698/week 8'

# Names (not full paths) of the regular files directly inside folder_loc;
# anything that is not a file (e.g. a sub-directory) is filtered out.
file_list = list(filter(
    lambda entry_name: os.path.isfile(os.path.join(folder_loc, entry_name)),
    os.listdir(folder_loc),
))
# Uncomment to list what was found:
#for file in file_list: print(file)

In [4]:
# Read the dataset CSV from the data folder into the working frame.
# (The file name suggests the categorical columns are already one-hot encoded.)
file = 'Group1DatasetRaw.onehotted.csv'
df = pd.read_csv(filepath_or_buffer=os.path.join(folder_loc, file))

In [5]:
#for col in sorted(df.columns): print(col)

In [6]:
#for col in sorted(df.columns): print(col)

In [7]:
# Column the model is trained to predict.
target = 'Life expectancy at birth, total (years)'

# Columns that are never offered to the model as predictors.
cols_to_exclude = [
    # the target itself and the columns directly related to it
    target,
    'Life expectancy at birth, female (years)',
    'Life expectancy at birth, male (years)',
    # identifier-like columns (presumably the country / year keys of each row)
    'CountryShortName',
    'Year',
    # perfect correlate determined by correlation_to_all_lister.ipynb
    # (its counterpart appears to be 'Urban population (% of total population)')
    'Rural population (% of total population)',
]


In [8]:
def clean_col_name(col_name):
    """Turn a raw column header into a name made of word characters only.

    Written because (per the notebook author) LightGBM cannot handle special
    characters and spaces in feature names.

    Steps, in order:
      1. trim surrounding whitespace, so padding in a header does not end up
         as leading/trailing underscores;
      2. spell out a handful of symbols (``-`` -> `` to ``, ``=`` -> ``equals``,
         ``/`` -> `` divided by ``, ``%`` -> ``percent``, ``$`` -> ``dollars``,
         ``&`` -> ``and``);
      3. replace every space with ``_``;
      4. delete every remaining character that is neither a word character
         nor whitespace;
      5. trim once more, in case step 4 exposed whitespace at an edge.

    Parameters
    ----------
    col_name : str
        Raw column name.

    Returns
    -------
    str
        The cleaned name. Distinct raw names can map to the same cleaned name;
        this function does not detect such collisions.

    Examples
    --------
    >>> clean_col_name('GDP per capita (current US$)')
    'GDP_per_capita_current_USdollars'
    >>> clean_col_name('Food production index (2014-2016 = 100)')
    'Food_production_index_2014_to_2016_equals_100'
    """
    # Order matters: the symbol replacements introduce spaces that the
    # space -> underscore step below must still see.
    substitutions = (
        ('-', ' to '),
        ('=', 'equals'),
        ('/', ' divided by '),
        ('%', 'percent'),
        ('$', 'dollars'),
        ('&', 'and'),
    )

    # Trim BEFORE spaces become underscores; trimming afterwards (as the
    # original did) can no longer remove leading/trailing blanks.
    col_name = col_name.strip()
    for symbol, spelled_out in substitutions:
        col_name = col_name.replace(symbol, spelled_out)
    col_name = col_name.replace(' ', '_')
    col_name = re.sub(r'[^\w\s]', '', col_name)
    return col_name.strip()

In [9]:
# Push the target name, the exclusion list and the frame's headers through the
# same cleaner so that name lookups still match after the rename.
target = clean_col_name(target)
cols_to_exclude = list(map(clean_col_name, cols_to_exclude))
df.columns = df.columns.map(clean_col_name)

In [10]:
#print(target)
#for col in cols_to_exclude: print(col)
#for col in sorted(df.columns): print(col)

In [11]:
# Reference copy of the library defaults, i.e. what
#     LGBMRegressor().get_params()
# returned. Keep as-is (don't edit); tune lgbm_params instead.
lgbm_defaults = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=None,
    num_leaves=31,
    objective=None,
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

In [12]:
# Hyperparameters actually used for the model ("in production").
# Compared with lgbm_defaults only three entries differ:
# n_jobs=-1, random_state=1 and the extra verbose=0.
lgbm_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=1,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
    verbose=0,
)

In [13]:
def lgbm_features(cols_to_exclude = cols_to_exclude, target = target, df = df, kfold_random_state = 1, n_neighbors = 5):
    """Cross-validate a LightGBM regressor and print its scores and feature importances.

    A shuffled 5-fold split is run over the rows of ``df``. Inside each fold a
    ``KNNImputer`` is fitted on the training rows only and then applied to both
    the training and the validation rows, an ``LGBMRegressor`` built from the
    module-level ``lgbm_params`` is fitted, and MAE, MSE and R² are recorded.
    The fold averages and a ranked table of the fold-averaged
    ``feature_importances_`` are printed.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns removed from ``df`` before modelling. ``target`` is always
        removed as well, whether or not it is listed here.
    target : str
        Name of the column to predict.
    df : pandas.DataFrame
        Frame holding both the predictors and the target.
    kfold_random_state : int
        Seed for the shuffled ``KFold`` split.
    n_neighbors : int
        Number of neighbours used by the ``KNNImputer``.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    KeyError
        If ``target`` or any excluded column is missing from ``df``.

    Notes
    -----
    * The defaults of ``cols_to_exclude``, ``target`` and ``df`` are captured
      when this ``def`` is executed; re-run the defining cell after changing
      those globals. ``lgbm_params`` is looked up at call time.
    * NOTE(review): folds are drawn row by row. If several rows belong to the
      same country (a 'CountryShortName' column is among the default
      exclusions), a grouped split may give a more honest validation score -
      confirm.
    * NOTE(review): by default ``KNNImputer`` drops columns that are entirely
      missing in the training fold, which would make the column re-labelling
      below fail - confirm no such columns exist.
    """
    y = df[target]

    # Always remove the target from the predictors, even when a caller-supplied
    # exclusion list forgets it; otherwise the model is handed the answer.
    drop_cols = list(dict.fromkeys([*cols_to_exclude, target]))
    X = df.drop(columns = drop_cols)

    cv = KFold(n_splits=5, shuffle=True, random_state = kfold_random_state)

    mae_scores = []
    mse_scores = []
    r2_scores = []
    r2_train_scores = []
    feature_importance_list = []

    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the imputer on the training rows only so that nothing from the
        # validation rows influences the values filled into the training set.
        imputer = KNNImputer(n_neighbors = n_neighbors)
        X_train_imputed = imputer.fit_transform(X_train)
        X_val_imputed = imputer.transform(X_val)
        # The imputer returns bare arrays; restore the column names (used as
        # feature names by the model) and keep the original row labels.
        X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
        X_val_imputed = pd.DataFrame(X_val_imputed, columns=X_val.columns, index=X_val.index)

        model = LGBMRegressor(**lgbm_params)
        model.fit(X_train_imputed, y_train)
        y_train_pred = model.predict(X_train_imputed)
        y_pred = model.predict(X_val_imputed)

        mae_scores.append(mean_absolute_error(y_val, y_pred))
        mse_scores.append(mean_squared_error(y_val, y_pred))
        # Training R² is tracked next to validation R² to expose overfitting.
        r2_train_scores.append(r2_score(y_train, y_train_pred))
        r2_scores.append(r2_score(y_val, y_pred))

        feature_importance_list.append(model.feature_importances_)

    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)
    avg_r2_train = np.mean(r2_train_scores)
    avg_r2 = np.mean(r2_scores)

    print(f'Average MAE: {avg_mae:.4f}')
    print(f'Average MSE: {avg_mse:.4f}')
    print(f'Average Training R²: {avg_r2_train:.4f}')
    print(f'Average Validation R²: {avg_r2:.4f}')

    # Average each feature's importance over the folds, then rank (1 = highest).
    avg_feature_importance = np.mean(feature_importance_list, axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': avg_feature_importance
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    feature_importance_df['Rank'] = feature_importance_df.index + 1
    feature_importance_df = feature_importance_df[['Feature', 'Importance', 'Rank']]

    print('\nSorted Feature Importance:')
    print(feature_importance_df.to_string(index=False))

In [14]:
# Baseline run: every argument is left at its default (the exclusion list,
# target and frame captured when lgbm_features was defined).
lgbm_features()

Average MAE: 0.2799
Average MSE: 0.1968
Average Training R²: 0.9997
Average Validation R²: 0.9972

Sorted Feature Importance:
                                                                                                              Feature  Importance  Rank
                                                                   Mortality_rate_adult_female_per_1000_female_adults       202.8     1
                                                                                     Death_rate_crude_per_1000_people       196.4     2
                                                                       Mortality_rate_adult_male_per_1000_male_adults       153.6     3
                                                                  Mortality_rate_under_to_5_male_per_1000_live_births        34.0     4
                                                         Population_ages_15_to_19_female_percent_of_female_population        31.6     5
                                                          

In [16]:
# High-VIF features found in vif_autoremover.ipynb, to be dropped.
# The entries are raw (uncleaned) column names; they are passed through
# clean_col_name right after this list, merged with cols_to_exclude and
# handed to lgbm_features for a second run.
highVIFS = [
    'Population ages 0-14, total',
    'Self-employed, female (% of female employment) (modeled ILO estimate)',
    'Total greenhouse gas emissions excluding LULUCF (Mt CO2e)',
    'Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e)',
    'Self-employed, male (% of male employment) (modeled ILO estimate)',
    'Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)',
    'Mortality rate, under-5 (per 1,000 live births)',
    'Employment to population ratio, ages 15-24, total (%) (modeled ILO estimate)',
    'Current health expenditure per capita (current US$)',
    'Mortality rate, infant (per 1,000 live births)',
    'Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)',
    'GDP, PPP (current international $)',
    'Employment to population ratio, 15+, total (%) (modeled ILO estimate)',
    'Wage and salaried workers, total (% of total employment) (modeled ILO estimate)',
    'Population ages 0-14, male',
    'Population ages 0-14 (% of total population)',
    'Total fisheries production (metric tons)',
    'Prevalence of anemia among women of reproductive age (% of women ages 15-49)',
    'Current health expenditure per capita, PPP (current international $)',
    'GNI (current US$)',
    'Share of youth not in education, employment or training, total (% of youth population) (modeled ILO estimate)',
    'Region_Europe & Central Asia',
    'Mortality rate, under-5, male (per 1,000 live births)',
    'Number of infant deaths',
    'Total natural resources rents (% of GDP)',
    'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
    'GDP (constant 2015 US$)',
    'Domestic credit to private sector by banks (% of GDP)',
    'Vulnerable employment, total (% of total employment) (modeled ILO estimate)',
    'Employment to population ratio, 15+, male (%) (modeled ILO estimate)',
    'GDP (current US$)',
    'Labor force, total',
    'Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)',
    'Labor force participation rate for ages 15-24, total (%) (modeled ILO estimate)',
    'Population ages 0-14, female (% of female population)',
    'Self-employed, total (% of total employment) (modeled ILO estimate)',
    'Labor force participation rate, female (% of female population ages 15-64) (modeled ILO estimate)',
    'Population ages 0-14, female',
    'Nitrous oxide (N2O) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)',
    'GDP, PPP (constant 2021 international $)',
    'GNI, Atlas method (current US$)',
    'Number of deaths ages 10-14 years',
    'Wage and salaried workers, female (% of female employment) (modeled ILO estimate)',
    'School enrollment, primary (% gross)',
    'Mortality rate, infant, female (per 1,000 live births)',
    'Carbon dioxide (CO2) emissions from Industrial Processes (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Waste (Mt CO2e)',
    'Surface area (sq. km)',
    'GDP per capita, PPP (current international $)',
    'Employment to population ratio, ages 15-24, female (%) (modeled ILO estimate)',
    'Population ages 10-14, male (% of male population)',
    'Number of under-five deaths',
    'Carbon dioxide (CO2) emissions from Transport (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Power Industry (Energy) (Mt CO2e)',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)',
    'Number of deaths ages 15-19 years',
    'Urban population',
    'Final consumption expenditure (constant 2015 US$)',
    'Access to clean fuels and technologies for cooking (% of population)',
    'Total greenhouse gas emissions including LULUCF (Mt CO2e)',
    'Labor force participation rate, total (% of total population ages 15-64) (modeled ILO estimate)',
    'GDP per capita, PPP (constant 2021 international $)',
    'Population ages 0-14, male (% of male population)',
    'Domestic private health expenditure per capita, PPP (current international $)',
    'Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e)',
    'GNI, PPP (current international $)',
    'Rural population',
    'Employment to population ratio, ages 15-24, male (%) (modeled ILO estimate)',
    'Carbon dioxide (CO2) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Mortality rate, infant, male (per 1,000 live births)',
    'Ratio of female to male labor force participation rate (%) (modeled ILO estimate)',
    'GNI per capita, Atlas method (current US$)',
    'Cereal production (metric tons)',
    'Domestic general government health expenditure per capita (current US$)',
    'Total alcohol consumption per capita, female (liters of pure alcohol, projected estimates, female 15+ years of age)',
    'Labor force participation rate, male (% of male population ages 15+) (modeled ILO estimate)',
    'Methane (CH4) emissions from Agriculture (Mt CO2e)',
    'Population ages 15-19, female (% of female population)',
    'GDP growth (annual %)',
    'Vulnerable employment, male (% of male employment) (modeled ILO estimate)',
    'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
    'Transport services (% of commercial service imports)',
    'Nitrous oxide (N2O) emissions from Building (Energy) (Mt CO2e)',
    'GDP per capita (current US$)',
    'Number of deaths ages 20-24 years',
    'Nitrous oxide (N2O) emissions from Transport (Energy) (Mt CO2e)',
    'Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e)',
    'Income Group_High income',
    'People using at least basic sanitation services (% of population)',
    'Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)',
    'Out-of-pocket expenditure per capita (current US$)',
    'Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e)',
    'GDP per capita (constant 2015 US$)',
    'Forest area (sq. km)',
    'Land under cereal production (hectares)',
    'Adjusted net national income (current US$)',
    'Number of deaths ages 5-9 years',
    'Computer, communications and other services (% of commercial service exports)',
    'Mortality rate, under-5, female (per 1,000 live births)',
    'Methane (CH4) emissions from Waste (Mt CO2e)',
    'Domestic general government health expenditure (% of GDP)',
    'Carbon dioxide (CO2) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Employment to population ratio, 15+, female (%) (modeled ILO estimate)',
    'Prevalence of anemia among pregnant women (%)',
    'Access to electricity (% of population)',
    'Nitrous oxide (N2O) emissions from Power Industry (Energy) (Mt CO2e)',
    'Vulnerable employment, female (% of female employment) (modeled ILO estimate)',
    'Mortality rate, adult, female (per 1,000 female adults)',
    'Birth rate, crude (per 1,000 people)',
    'Methane (CH4) emissions from Building (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Population ages 10-14, female (% of female population)',
    'Probability of dying among children ages 5-9 years (per 1,000)',
    'GNI per capita, PPP (current international $)',
    'Monetary Sector credit to private sector (% GDP)',
    'Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)',
    'Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e)',
    'Probability of dying among youth ages 20-24 years (per 1,000)',
    'GDP per person employed (constant 2021 PPP $)',
    'Domestic private health expenditure (% of current health expenditure)',
    'Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Prevalence of anemia among children (% of children ages 6-59 months)',
    'Unemployment, female (% of female labor force) (modeled ILO estimate)',
    'Computer, communications and other services (% of commercial service imports)',
    'People using at least basic drinking water services (% of population)',
    'Exports of goods, services and primary income (BoP, current US$)',
    'Domestic general government health expenditure per capita, PPP (current international $)',
    'People using at least basic sanitation services, urban (% of urban population)',
    'Labor force participation rate for ages 15-24, female (%) (modeled ILO estimate)',
    'Access to clean fuels and technologies for cooking, rural (% of rural population)',
    'Region_Sub-Saharan Africa',
    'Land area (sq. km)',
    'Mortality rate, neonatal (per 1,000 live births)',
    'Travel services (% of commercial service exports)',
    'General government final consumption expenditure (constant 2015 US$)',
    'People using at least basic drinking water services, urban (% of urban population)',
    'School enrollment, primary, male (% gross)',
    'Population ages 15-64 (% of total population)',
    'Labor force participation rate, male (% of male population ages 15-64) (modeled ILO estimate)',
    'Food production index (2014-2016 = 100)',
    'Immunization, DPT (% of children ages 12-23 months)',
    'Pillar 3 - Data Products - Score',
    'Labor force, female (% of total labor force)',
    'Access to electricity, urban (% of urban population)',
    'Population ages 15-19, male (% of male population)',
    'Import unit value index (2015 = 100)',
    'Immunization, measles (% of children ages 12-23 months)',
    'GOAL 15: Life on Land (5 year moving average)',
    'School enrollment, primary, female (% gross)',
    'Women Business and the Law Index Score (scale 1-100)',
    'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)',
    'Domestic general government health expenditure (% of current health expenditure)',
    'People using at least basic sanitation services, rural (% of rural population)',
    'Tuberculosis case detection rate (%, all forms)',
    'Access to clean fuels and technologies for cooking, urban (% of urban population)',
    'Death rate, crude (per 1,000 people)',
    'Livestock production index (2014-2016 = 100)',
    'Urban population (% of total population)',
    'Tuberculosis treatment success rate (% of new cases)',
    'Export unit value index (2015 = 100)',
    'GOAL 17: Partnerships to achieve the Goal (5 year moving average)',
    'Labor force participation rate for ages 15-24, male (%) (modeled ILO estimate)',
    'Mortality rate, adult, male (per 1,000 male adults)',
    'Access to electricity, rural (% of rural population)',
    'Crop production index (2014-2016 = 100)',
    'Compulsory education, duration (years)',
    'Carbon intensity of GDP (kg CO2e per 2021 PPP $)',
    'Share of youth not in education, employment or training, female (% of female youth population) (modeled ILO estimate)',
    'Prevalence of anemia among non-pregnant women (% of women ages 15-49)',
    'GOAL 1: No Poverty (5 year moving average)',
    'Current health expenditure (% of GDP)',
    'GOAL 10: Reduced Inequality (5 year moving average)',
    'Probability of dying among adolescents ages 15-19 years (per 1,000)',
    'Household Survey on income, etc (Availability score over 10 years)',
    'Adolescent fertility rate (births per 1,000 women ages 15-19)',
    'Renewable energy consumption (% of total final energy consumption)',
    'Agricultural land (% of land area)',
    'GOAL 16: Peace and Justice Strong Institutions (5 year moving average)',
    'Out-of-pocket expenditure per capita, PPP (current international $)',
    'Inflation, GDP deflator: linked series (annual %)',
    'Pillar 1 - Data Use - Score',
    'Unemployment, male (% of male labor force) (modeled ILO estimate)',
    'Adjusted savings: education expenditure (% of GNI)',
    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
    'Carbon intensity of GDP (kg CO2e per constant 2021 US$ of GDP)',
    'Communications, computer, etc. (% of service imports, BoP)',
    'Adjusted net national income per capita (current US$)',
    'Prevalence of stunting, height for age (modeled estimate, % of children under 5)',
    'Agriculture, forestry, and fishing, value added (% of GDP)',
    'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
    'Broad money (% of GDP)',
    'Nitrous oxide (N2O) emissions from Industrial Processes (Mt CO2e)',
    'GOAL 11: Sustainable Cities and Communities (5 year moving average)',
    'Preprimary education, duration (years)',
    'Share of youth not in education, employment or training, male (% of male youth population) (modeled ILO estimate)',
    'Carbon dioxide (CO2) net fluxes from LULUCF - Forest Land (Mt CO2e)',
    'Exports of goods and services (% of GDP)',
    'Income Group_Lower middle income',
    'Probability of dying among adolescents ages 10-14 years (per 1,000)',
    'Aquaculture production (metric tons)',
    'Domestic credit to private sector (% of GDP)',
    'Out-of-pocket expenditure (% of current health expenditure)',
    'Number of neonatal deaths',
    'Health/Demographic survey (Availability score over 10 years)',
    'Total alcohol consumption per capita, male (liters of pure alcohol, projected estimates, male 15+ years of age)',
    'Oil rents (% of GDP)',
    'Methane (CH4) emissions from Transport (Energy) (Mt CO2e)',
    'Prevalence of overweight (modeled estimate, % of children under 5)',
    'International tourism, receipts for travel items (current US$)',
    'Urban population growth (annual %)',
    'Total greenhouse gas emissions excluding LULUCF (% change from 1990)',
    'Proportion of seats held by women in national parliaments (%)',
    'Total greenhouse gas emissions per capita excluding LULUCF (t CO2e/capita)',
    'Forest area (% of land area)',
    'GOAL 12: Responsible Consumption and Production (5 year moving average)',
    'Labor Force Survey (Availability score over 10 years)',
    'Lifetime risk of maternal death (%)',
    'Automated teller machines (ATMs) (per 100,000 adults)',
    'Nitrous oxide (N2O) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Fuel imports (% of merchandise imports)',
    'Communications, computer, etc. (% of service exports, BoP)'
]
# Clean the high-VIF names the same way the frame's headers were cleaned,
# merge them with the base exclusions (duplicates collapse in the set; the
# resulting order is arbitrary) and re-run the model.
highVIFS = list(map(clean_col_name, highVIFS))
new_cols_to_remove = list({*cols_to_exclude, *highVIFS})
lgbm_features(new_cols_to_remove)

Average MAE: 0.6447
Average MSE: 1.2235
Average Training R²: 0.9978
Average Validation R²: 0.9824

Sorted Feature Importance:
                                                                                            Feature  Importance  Rank
                                        Lifetime_risk_of_maternal_death_1_in_rate_varies_by_country       213.2     1
                                                        Incidence_of_tuberculosis_per_100000_people       114.8     2
                                   Domestic_private_health_expenditure_per_capita_current_USdollars        99.2     3
                                                                    Arable_land_hectares_per_person        83.8     4
                       Carbon_dioxide_CO2_emissions_total_excluding_LULUCF_percent_change_from_1990        83.6     5
                                                            Permanent_cropland_percent_of_land_area        79.0     6
Water_productivity_total_constant_2015_USdollars