<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Using-numpy-scipy" data-toc-modified-id="Using-numpy-scipy-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Using numpy scipy</a></span></li><li><span><a href="#Using-pandas" data-toc-modified-id="Using-pandas-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Using pandas</a></span></li><li><span><a href="#Using-statsmodels-(we-must-add-ones)" data-toc-modified-id="Using-statsmodels-(we-must-add-ones)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Using statsmodels (we must add ones)</a></span></li></ul></div>

# Using numpy scipy

In [1]:
import numpy as np
import scipy as sp

# Four small example variables; each list is one predictor (one column).
a = [1, 1, 2, 3, 4]
b = [2, 2, 3, 2, 1]
c = [4, 6, 7, 8, 9]
d = [4, 3, 4, 5, 4]

# Stack the predictors as columns: rows are observations.
ck = np.column_stack([a, b, c, d])

# Correlation matrix between columns (rowvar=False -> variables are columns).
# np.corrcoef is used directly: the top-level `sp.corrcoef` alias is deprecated
# and not available in recent SciPy releases.
cc = np.corrcoef(ck, rowvar=False)

# The diagonal of the inverse correlation matrix holds the VIF of each variable.
VIF = np.linalg.inv(cc)
VIF.diagonal()

array([22.95,  3.  , 12.95,  3.  ])

# Using pandas

In [18]:
import pandas as pd
import numpy as np

# Same four example predictors as in the NumPy section.
a = [1, 1, 2, 3, 4]
b = [2, 2, 3, 2, 1]
c = [4, 6, 7, 8, 9]
d = [4, 3, 4, 5, 4]

df = pd.DataFrame({'a': a, 'b': b, 'c': c, 'd': d})

# Pairwise correlation matrix, computed once and reused below.
df_cor = df.corr()

# Inverse of the correlation matrix, labelled like the original; diags are vifs.
df_vif = pd.DataFrame(np.linalg.inv(df_cor.values),
                      index=df_cor.index,
                      columns=df_cor.columns)

# Pull the diagonal out of the already-computed inverse instead of inverting again.
ser_vif = pd.Series(np.diag(df_vif.values), index=df_cor.index)
ser_vif

a    22.95
b     3.00
c    12.95
d     3.00
dtype: float64

In [19]:
# Show the full inverse correlation matrix; its diagonal entries are the VIFs.
df_vif

Unnamed: 0,a,b,c,d
a,22.95,6.453681,-16.301917,-6.453681
b,6.453681,3.0,-4.080441,-2.0
c,-16.301917,-4.080441,12.95,4.080441
d,-6.453681,-2.0,4.080441,3.0


In [22]:
# Extract just the diagonal (the VIF of each column) as a plain NumPy array.
np.diag(df_vif.values)

array([22.95,  3.  , 12.95,  3.  ])

# Using statsmodels (we must add ones)

In [5]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF of every column of df, computed on the raw values with no column of ones
# added. The results differ from the correlation-matrix approach above; the
# next cell adds the constant to make them agree.
n_columns = df.shape[1]
vif = [variance_inflation_factor(df.values, col_idx) for col_idx in range(n_columns)]
vif

[47.136986301369774, 28.93150684931508, 80.31506849315096, 40.43835616438355]

In [7]:
# statsmodels does not add an intercept on its own, so append a column of ones
# before asking for the VIF of each column.
df1 = df.assign(const=1)
design = df1.values
vif_data = [variance_inflation_factor(design, col_idx)
            for col_idx in range(design.shape[1])]

# Label each VIF with the column it belongs to (including 'const').
vif_ser = pd.Series(vif_data, index=df1.columns)
vif_ser

a         22.950
b          3.000
c         12.950
d          3.000
const    136.875
dtype: float64

In [8]:
from statsmodels.regression.linear_model import OLS

# Design matrix with a constant column added by statsmodels.
exog_df = sm.add_constant(df)

# For every column, regress it on all the other columns and turn the
# resulting R-squared into a VIF: 1 / (1 - R^2).
vif_data = []
for col in exog_df:
    other_cols = exog_df.columns != col
    r_squared = OLS(exog_df[col].values,
                    exog_df.loc[:, other_cols].values).fit().rsquared
    vif_data.append(1 / (1. - r_squared))

vif_ser = pd.Series(vif_data, index=exog_df.columns, name='VIF')
vif_ser

  return ptp(axis=axis, out=out, **kwargs)


const    136.875
a         22.950
b          3.000
c         12.950
d          3.000
Name: VIF, dtype: float64

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

def get_vif(exogs, data):
    '''Return VIF (variance inflation factor) DataFrame

    Args:
    exogs (list): list of exogenous/independent variables
    data (DataFrame): the df storing all variables

    Returns:
    VIF and Tolerance DataFrame for each exogenous variable

    Notes:
    Assume we have a list of exogenous variable [X1, X2, X3, X4].
    To calculate the VIF and Tolerance for each variable, we regress
    each of them against other exogenous variables. For instance, the
    regression model for X3 is defined as:
                        X3 ~ X1 + X2 + X4
    And then we extract the R-squared from the model to calculate:
                    VIF = 1 / (1 - R-squared)
                    Tolerance = 1 - R-squared
    The cutoff to detect multicollinearity:
                    VIF > 10 or Tolerance < 0.1

    Edge cases:
    A variable with no other variables to be regressed on is reported
    with R-squared = 0, i.e. VIF = 1 and Tolerance = 1.
    When R-squared reaches (or numerically exceeds) 1, Tolerance is
    reported as 0 and VIF as infinity.
    '''

    # initialize dictionaries
    vif_dict, tolerance_dict = {}, {}

    for exog in exogs:
        not_exog = [i for i in exogs if i != exog]

        if not_exog:
            # regress this variable on all the others and keep the R-squared
            formula = f"{exog} ~ {' + '.join(not_exog)}"
            r_squared = smf.ols(formula, data=data).fit().rsquared
        else:
            # nothing to regress on: avoid building a formula with an empty
            # right-hand side and treat the variable as fully independent
            r_squared = 0.0

        # clamp so rounding error cannot produce a negative tolerance / VIF
        tolerance = max(1 - r_squared, 0.0)
        tolerance_dict[exog] = tolerance

        # perfect collinearity: report infinity instead of dividing by zero
        vif_dict[exog] = float('inf') if tolerance == 0 else 1 / tolerance

    # return VIF DataFrame
    df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

    return df_vif

In [14]:
import seaborn as sns

# Columns whose multicollinearity is examined in the cells below.
exogs = ['alcohol', 'speeding', 'no_previous', 'not_distracted']

# Load seaborn's 'car_crashes' example dataset. Note this rebinds `df`,
# replacing the small four-column frame used in the earlier sections.
df = sns.load_dataset('car_crashes')

print(df.shape)
df.head()

(51, 8)


Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA


In [15]:
%%timeit -n 100
# Time the statsmodels formula-based implementation on the selected columns.
get_vif(exogs=exogs, data=df)

36.8 ms ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# Show the VIF / Tolerance table produced by the statsmodels implementation.
get_vif(exogs=exogs, data=df)

Unnamed: 0,VIF,Tolerance
alcohol,3.436072,0.29103
no_previous,3.113984,0.321132
not_distracted,2.668456,0.374749
speeding,1.88434,0.53069


In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression

def sklearn_vif(exogs, data):
    '''Return VIF (variance inflation factor) DataFrame using scikit-learn

    Args:
    exogs (list): list of exogenous/independent column names
    data (DataFrame): the df storing all variables

    Returns:
    VIF and Tolerance DataFrame for each exogenous variable

    Notes:
    Each variable is regressed on the remaining ones with
    LinearRegression, and the R-squared of that fit gives:
                    VIF = 1 / (1 - R-squared)
                    Tolerance = 1 - R-squared

    Edge cases:
    A variable with no other variables to be regressed on is reported
    with R-squared = 0, i.e. VIF = 1 and Tolerance = 1.
    When R-squared reaches (or numerically exceeds) 1, Tolerance is
    reported as 0 and VIF as infinity.
    '''

    # initialize dictionaries
    vif_dict, tolerance_dict = {}, {}

    for exog in exogs:
        not_exog = [i for i in exogs if i != exog]

        if not_exog:
            # form input data and extract r-squared from the fit
            X, y = data[not_exog], data[exog]
            r_squared = LinearRegression().fit(X, y).score(X, y)
        else:
            # nothing to regress on: a zero-column X cannot be fitted, so
            # treat the variable as fully independent
            r_squared = 0.0

        # clamp so rounding error cannot produce a negative tolerance / VIF
        tolerance = max(1 - r_squared, 0.0)
        tolerance_dict[exog] = tolerance

        # perfect collinearity: report infinity instead of dividing by zero
        vif_dict[exog] = float('inf') if tolerance == 0 else 1 / tolerance

    # return VIF DataFrame
    df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

    return df_vif

In [12]:
%%timeit -n 100
# Time the scikit-learn implementation on the same columns for comparison.
sklearn_vif(exogs=exogs, data=df)

8.52 ms ± 472 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# Show the VIF / Tolerance table produced by the scikit-learn implementation.
sklearn_vif(exogs=exogs, data=df)

Unnamed: 0,VIF,Tolerance
alcohol,3.436072,0.29103
no_previous,3.113984,0.321132
not_distracted,2.668456,0.374749
speeding,1.88434,0.53069


In [24]:
# Small demo of the string-building pattern used for the regression formula:
# a label, then the list items joined with a separator.
a = 'fruits'
lst = ['apple', 'banana', 'orange']
joined_items = ', '.join(lst)
ans = f"{a} = {joined_items}"
ans

'fruits = apple, banana, orange'