In [1]:
#Computes VIF two ways: (1) within cross-validation, where imputation is performed per fold, and (2) in a single pass, after imputing the entire dataset at once.
#It then compares the two sets of results to see whether cross-validation and the one-pass approach differ substantially (they do).
import pandas as pd
import os

# Remove pandas' display limits so printed frames are shown in full:
# no cap on rows, columns, or cell width, and no wrapping of wide frames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# width=None lets pandas work out the display width itself
pd.set_option('display.width', None)
# keep wide frames on one line instead of splitting them across several blocks
pd.set_option('display.expand_frame_repr', False)

In [2]:
import time
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
# Folder that holds the input files (machine-specific absolute path).
folder_loc = 'F:/class/BANA 698/week 8'

# Names of the regular files directly inside folder_loc; sub-directories are skipped.
file_list = [
    entry
    for entry in os.listdir(folder_loc)
    if os.path.isfile(os.path.join(folder_loc, entry))
]
# Uncomment to list what was found:
#for file in file_list: print(file)

In [4]:
# Name of the input CSV inside folder_loc.
file = 'Group1DatasetRaw.onehotted.csv'
# Load it into `df`, the frame the VIF functions below take as their default input.
df = pd.read_csv(os.path.join(folder_loc, file))

In [None]:
# Columns removed from df before any imputation or VIF computation.
cols_to_exclude = [
    # Target and the columns grouped with it as directly related.
    'Life expectancy at birth, total (years)',
    'Life expectancy at birth, female (years)',
    'Life expectancy at birth, male (years)',
    'CountryShortName',
    'Year',
    # Perfect correlate, as determined by correlation_to_all_lister.ipynb.
    'Rural population (% of total population)' # presumably paired with 'Urban population (% of total population)', which stays in - confirm
]


### VIF with Cross-Validation

In [6]:
def vif_cv(cols_to_exclude = cols_to_exclude, df = df, n_splits=5, n_neighbors=5, random_state=1):
    """Average each feature's VIF over the training portions of a shuffled K-fold split.

    For every fold, only the training rows are used: they are KNN-imputed on
    their own, a VIF is computed for every column, and the per-fold VIFs are
    then averaged per feature. The held-out rows of each fold are ignored.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns dropped from ``df`` before anything else happens.
    df : pandas.DataFrame
        Source data. Every column left after the drop is passed to ``KNNImputer``.
    n_splits : int
        Number of folds given to ``KFold``.
    n_neighbors : int
        Neighbour count given to ``KNNImputer``.
    random_state : int
        Seed for the shuffled ``KFold`` split.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``VIF_mean`` and ``#`` (1-based rank), sorted by
        ``VIF_mean`` in descending order.

    Notes
    -----
    * The ``cols_to_exclude`` and ``df`` defaults are bound when this cell is
      run; re-run the cell after changing either global.
    * A feature with no observed value in a fold's training rows cannot be
      imputed. It gets NaN for that fold and that fold is skipped when the
      feature's mean is taken (pandas ignores NaN when averaging).
    * The imputed matrix is handed to ``variance_inflation_factor`` as-is; no
      constant column is added here.
    """
    X = df.drop(columns=cols_to_exclude)
    n_features = X.shape[1]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    vif_scores_per_fold = []

    for fold_no, (train_index, _) in enumerate(kf.split(X), start=1):
        # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes
        start_time = time.perf_counter()

        X_train = X.iloc[train_index]

        # KNNImputer drops columns that have no observed value in the data it is
        # fitted on, which would make its output narrower than X.columns and break
        # the relabelling below. Exclude such columns explicitly instead.
        has_observed = X_train.notna().any(axis=0).to_numpy()
        kept_positions = [pos for pos, observed in enumerate(has_observed) if observed]
        n_missing = n_features - len(kept_positions)
        if n_missing:
            print(f"Fold #{fold_no}: {n_missing} feature(s) have no observed values in the training rows; their VIF is NaN for this fold")
        X_fit = X_train.iloc[:, kept_positions]

        imputer = KNNImputer(n_neighbors=n_neighbors)
        X_imputed = pd.DataFrame(imputer.fit_transform(X_fit), columns=X_fit.columns, index=X_fit.index)

        # One slot per column of X (positional), NaN where the column was excluded.
        fold_vif = pd.Series(float('nan'), index=range(n_features), name=f'Fold_{fold_no}')
        fold_vif.iloc[kept_positions] = [
            variance_inflation_factor(X_imputed.values, i)
            for i in range(X_imputed.shape[1])
        ]
        vif_scores_per_fold.append(fold_vif)

        elapsed = time.perf_counter() - start_time
        print(f"Fold #{fold_no} completed in {elapsed:.2f} seconds")

    # Rows are positions in X.columns, columns are Fold_1..Fold_n.
    all_vifs = pd.concat(vif_scores_per_fold, axis=1)
    all_vifs['VIF_mean'] = all_vifs.mean(axis=1)
    all_vifs.insert(0, 'Feature', X.columns)
    all_vifs = all_vifs.sort_values('VIF_mean', ascending=False).reset_index(drop=True)
    all_vifs['#'] = all_vifs.index + 1

    return all_vifs[['Feature', 'VIF_mean', '#']]

In [7]:
# Run the cross-validated VIF with its default arguments and print the whole
# result as plain text, with VIFs formatted to two decimals and thousands separators.
vif_result_cv = vif_cv()
print(vif_result_cv.to_string(index=False, float_format="{:,.2f}".format))

Fold #1 completed in 24.22 seconds
Fold #2 completed in 30.48 seconds
Fold #3 completed in 28.09 seconds
Fold #4 completed in 22.75 seconds
Fold #5 completed in 25.77 seconds
                                                                                                              Feature                 VIF_mean   #
                                                                                          Population ages 0-14, total 5,404,319,552,844,595.00   1
                                                                                           Population ages 0-14, male 1,561,247,870,821,771.75   2
                                                                                         Population ages 0-14, female 1,329,634,175,699,860.75   3
                                                Self-employed, female (% of female employment) (modeled ILO estimate)           554,406,628.57   4
                                    Wage and salaried workers, female (% of female employm

### VIF with One-Pass Only

In [8]:
def vif_one_pass(cols_to_exclude = cols_to_exclude, df = df, n_neighbors=5):
    """Compute each feature's VIF once, after KNN-imputing the whole dataset.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns dropped from ``df`` before anything else happens.
    df : pandas.DataFrame
        Source data. Every column left after the drop is passed to ``KNNImputer``.
    n_neighbors : int
        Neighbour count given to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``VIF`` and ``#`` (1-based rank), sorted by ``VIF``
        in descending order.

    Notes
    -----
    * The ``cols_to_exclude`` and ``df`` defaults are bound when this cell is
      run; re-run the cell after changing either global.
    * A feature with no observed value at all cannot be imputed; it is reported
      with a NaN VIF and therefore sorts to the bottom.
    * The imputed matrix is handed to ``variance_inflation_factor`` as-is; no
      constant column is added here.
    """
    X = df.drop(columns=cols_to_exclude)
    n_features = X.shape[1]

    # KNNImputer drops columns that have no observed value in the data it is
    # fitted on, which would make its output narrower than X.columns and break
    # the relabelling below. Exclude such columns explicitly instead.
    has_observed = X.notna().any(axis=0).to_numpy()
    kept_positions = [pos for pos, observed in enumerate(has_observed) if observed]
    n_missing = n_features - len(kept_positions)
    if n_missing:
        print(f"{n_missing} feature(s) have no observed values; their VIF is NaN")
    X_fit = X.iloc[:, kept_positions]

    imputer = KNNImputer(n_neighbors=n_neighbors)
    X_imputed = pd.DataFrame(imputer.fit_transform(X_fit), columns=X_fit.columns, index=X_fit.index)

    # One slot per column of X (positional), NaN where the column was excluded.
    vif_values = pd.Series(float('nan'), index=range(n_features))
    vif_values.iloc[kept_positions] = [
        variance_inflation_factor(X_imputed.values, i)
        for i in range(X_imputed.shape[1])
    ]

    vif_data = pd.DataFrame({'Feature': list(X.columns), 'VIF': vif_values.tolist()})
    vif_data = vif_data.sort_values('VIF', ascending=False).reset_index(drop=True)
    vif_data['#'] = vif_data.index + 1

    return vif_data[['Feature', 'VIF', '#']]

In [9]:
# Run the one-pass VIF with its default arguments and print the whole result
# as plain text, using the same number format as the cross-validated table.
vif_result_1pass = vif_one_pass()
print(vif_result_1pass.to_string(index=False, float_format="{:,.2f}".format))

                                                                                                              Feature                      VIF   #
                                                                                          Population ages 0-14, total 4,503,599,627,370,496.00   1
                                                                                           Population ages 0-14, male 1,501,199,875,790,165.25   2
                                                                                         Population ages 0-14, female 1,286,742,750,677,284.50   3
                                                Self-employed, female (% of female employment) (modeled ILO estimate)           393,243,042.74   4
                                    Wage and salaried workers, female (% of female employment) (modeled ILO estimate)           316,340,971.38   5
                                                            Total greenhouse gas emissions excluding LULUCF (Mt CO2e) 

### Compare Results

In [10]:
def compare_vif_dfs(vif_cv_df, vif_1p_df):
    """Pair cross-validated and one-pass VIFs per feature and rank them by their gap.

    Parameters
    ----------
    vif_cv_df : pandas.DataFrame
        Must contain the columns ``Feature`` and ``VIF_mean``.
    vif_1p_df : pandas.DataFrame
        Must contain the columns ``Feature`` and ``VIF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``vif_cv``, ``vif_1p`` and ``vif_diff`` (the absolute
        difference of the two VIFs), sorted by ``vif_diff`` in descending order
        with a fresh 0-based index. Features missing from either input are
        dropped by the inner join.
    """
    cv_scores = vif_cv_df[['Feature', 'VIF_mean']]
    one_pass_scores = vif_1p_df[['Feature', 'VIF']]

    # Inner join: only features present in both inputs survive.
    paired = pd.merge(cv_scores, one_pass_scores, how='inner', on='Feature')
    paired = paired.rename(columns={'VIF_mean': 'vif_cv', 'VIF': 'vif_1p'})

    # Magnitude of the gap only; the sign (which method is larger) is discarded.
    gap = paired['vif_cv'] - paired['vif_1p']
    paired['vif_diff'] = gap.abs()

    ranked = paired.sort_values('vif_diff', ascending=False)
    return ranked.reset_index(drop=True)


In [11]:
# Pair the two VIF tables per feature and print them ranked by absolute difference.
compare_dfs = compare_vif_dfs(vif_result_cv, vif_result_1pass)
print(compare_dfs.to_string(index=False, float_format="{:,.2f}".format))

                                                                                                              Feature                   vif_cv                   vif_1p               vif_diff
                                                                                          Population ages 0-14, total 5,404,319,552,844,595.00 4,503,599,627,370,496.00 900,719,925,474,099.00
                                                                                           Population ages 0-14, male 1,561,247,870,821,771.75 1,501,199,875,790,165.25  60,047,995,031,606.50
                                                                                         Population ages 0-14, female 1,329,634,175,699,860.75 1,286,742,750,677,284.50  42,891,425,022,576.25
                                                Self-employed, female (% of female employment) (modeled ILO estimate)           554,406,628.57           393,243,042.74         161,163,585.83
                                    Wage and 

In [12]:
# Count the features whose absolute difference between the cross-validated and
# one-pass VIF exceeds the threshold below (in raw VIF units, not relative).
threshold = 1

count_high_diff = (compare_dfs['vif_diff'] > threshold).sum()

print(f"Number of features with vif_diff > {threshold}: {count_high_diff}")

Number of features with vif_diff > 1: 197
