In [1]:
#this automates removal of the highest vif feature from the dataset until a vif threshold is achieved
#makes a list of the features that were dropped for having too high vif
import pandas as pd
import os

# Lift every display limit so frames are shown in full:
# no row/column truncation, no cell-width clipping, no terminal-width detection.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = None
# Keep wide frames on a single line instead of wrapping them across several blocks.
pd.options.display.expand_frame_repr = False
# Floats are rendered with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)


In [None]:
import time
from sklearn.impute import KNNImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Switch float display to scientific notation with two decimals.
# This replaces the '{:,.2f}' format configured in the first cell.
pd.set_option('display.float_format', '{:,.2e}'.format)

In [3]:
# Directory that holds the input data for this notebook.
folder_loc = 'F:/class/BANA 698/week 8'

# Names of the regular files sitting directly in folder_loc (sub-directories are skipped).
file_list = list(
    filter(
        lambda name: os.path.isfile(os.path.join(folder_loc, name)),
        os.listdir(folder_loc),
    )
)
#for file in file_list: print(file)

In [4]:
# Read the input CSV from folder_loc into df.
file = 'Group1DatasetRaw.onehotted.csv'
df = pd.read_csv(filepath_or_buffer=os.path.join(folder_loc, file))

In [5]:
# Columns that are removed from the frame before the VIF screening runs.
cols_to_exclude = [
    # the target and the columns directly related to it
    'Life expectancy at birth, total (years)',
    'Life expectancy at birth, female (years)',
    'Life expectancy at birth, male (years)',
    'CountryShortName',
    'Year',
    # perfect correlate, as determined by correlation_to_all_lister.ipynb
    'Rural population (% of total population)',
]


In [6]:
def vif_loop(df, vif_threshold = 10):
    """Repeatedly drop the highest-VIF feature until every remaining VIF is below a threshold.

    On each pass the variance inflation factor of every remaining column is
    computed with ``variance_inflation_factor``; the column with the largest
    VIF is removed and the pass is repeated. Progress, the list of removed
    features and the VIFs of the surviving features are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. Presumes the data has no NaNs and no non-numeric
        columns (nothing in this function validates that). It is not modified.
    vif_threshold : float, default 10
        The loop stops as soon as the largest VIF is strictly below this value.

    Returns
    -------
    list
        Labels of the removed columns, in the order they were removed. Empty
        when ``df`` has no columns or nothing reaches the threshold.
    """
    print("Count: Time Elapsed: VIF Score: Feature Removed")
    excluded_features = []
    count = 1
    while True:
        start_time = time.time()
        X = df.drop(columns = excluded_features)

        if X.shape[1] == 0:
            # Nothing left to score: either df had no columns to begin with or
            # every feature has been eliminated. Without this guard the
            # .iloc[0] lookup below raises IndexError on the empty frame.
            vif_data = pd.DataFrame({'Feature': [], 'VIF': []})
            break

        # the design matrix is the same for every column of this pass, so build it once
        design = X.values
        vif_data = pd.DataFrame()
        vif_data['Feature'] = X.columns
        vif_data['VIF'] = [
            variance_inflation_factor(design, i)
            for i in range(design.shape[1])
        ]
        vif_data.sort_values('VIF', ascending=False, inplace=True)

        # after the descending sort the first row is the worst offender
        highest_vif_row = vif_data.iloc[0]
        max_vif = float(highest_vif_row['VIF'])
        if max_vif < vif_threshold:
            break

        feature_to_remove = highest_vif_row['Feature']
        elapsed = time.time() - start_time
        print(f"#{count}: {elapsed:.2f}s: {max_vif:,.2f}: {feature_to_remove}")
        count += 1

        excluded_features.append(feature_to_remove)

    # printed as a Python list literal so it can be pasted into another cell
    print("\nList of Features with High VIFs:")
    print("highVIFS = [")
    for feature in excluded_features:
        print(f"    '{feature}',")
    print("]")

    print("\nRemaining Features and VIFs:")
    print(vif_data.to_string(index=False, float_format="{:,.2f}".format))

    return excluded_features

In [7]:
#set aside the non-numeric country label before imputing
df_imputed = df.drop(columns=['CountryShortName'])

#fill in the missing values of everything that is left (KNNImputer with its default settings)
imputer = KNNImputer()
df_imputed = pd.DataFrame(
    data=imputer.fit_transform(df_imputed),
    index=df_imputed.index,
    columns=df_imputed.columns,
)

#remove the target, its related columns and the known perfect correlate
#errors='ignore' is needed because CountryShortName was already dropped above
#NOTE(review): the target columns are still present while imputing - confirm that is intended
df_imputed = df_imputed.drop(columns=cols_to_exclude, errors='ignore')

#screen the remaining features with a VIF threshold of 5
addl_cols_to_exclude = vif_loop(df_imputed, vif_threshold=5)

Count: Time Elapsed: VIF Score: Feature Removed
#1: 31.09s: 4,503,599,627,370,496.00: Population ages 0-14, total
#2: 30.33s: 468,119,649.86: Self-employed, female (% of female employment) (modeled ILO estimate)
#3: 25.52s: 188,022,245.02: Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
#4: 27.11s: 77,845,512.52: Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e)
#5: 28.08s: 101,495,943.58: Self-employed, male (% of male employment) (modeled ILO estimate)
#6: 24.17s: 33,957,036.65: Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e)
#7: 24.65s: 10,078,483.80: Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)
#8: 27.49s: 871,551.05: Mortality rate, under-5 (per 1,000 live births)
#9: 28.87s: 610,805.28: Employment to population ratio, ages 15-24, total (%) (modeled ILO estimate)
#10: 26.52s: 478,382.47: Current health expenditure per capita (current US$)
#11: 23.64s: 381,508.41: Mortality rate, infant (per 1,000 live births)
#12: 23.29s: 22