In [24]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

def check_multicollinearity(df, features, threshold=None):
    """
    Flag potential multicollinearity among features using the Variance Inflation Factor (VIF).

    Parameters:
    df (pd.DataFrame): The dataframe containing the data.
    features (list): List of column names in df to check for multicollinearity.
    threshold (float): The VIF value above which multicollinearity is flagged.
        Required: omitting it raises ValueError.

    Returns:
    pd.DataFrame: One row per feature with columns "feature" and "VIF".
    str: Message indicating if multicollinearity is detected or not.

    Raises:
    ValueError: If threshold is not supplied.

    Notes:
    The VIF table is also printed to stdout.
    The selected columns are passed to statsmodels exactly as given; no
    intercept column is added here. NOTE(review): statsmodels appears to expect
    a constant column in the design matrix for conventional (centered) VIFs --
    confirm and include one in `features` if that is what is wanted.
    """
    # The previous default was the bare name `_`, which only exists in an
    # interactive session (last output value) and is a NameError elsewhere.
    # A None sentinel makes "must specify" an explicit, catchable error.
    if threshold is None:
        raise ValueError("threshold must be specified (a float VIF cutoff).")

    # The independent variables set
    X = df[features]

    # Materialise the ndarray once instead of once per column.
    design = X.values

    # Calculating VIF for each feature, in column order
    vif_data = pd.DataFrame({
        "feature": list(X.columns),
        "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })

    print(vif_data)

    if (vif_data["VIF"] > threshold).any():
        return vif_data, 'Possible multicollinearity detected.'
    return vif_data, 'No multicollinearity detected.'

# Example usage: check columns A, B and C against a VIF cutoff of 9.5.
# Column D is present in the frame but is not among the checked features.
df = pd.DataFrame(
    {
        'A': [1456, 2546, 2453],
        'B': [8344, 1885, 9656],
        'C': [1787, 4568, 2349],
        'D': [1330, 7111, 3312],
    }
)
features = list('ABC')
vif_data, message = check_multicollinearity(df, features, threshold=9.5)
print(message)


  feature        VIF
0       A  88.244358
1       B  11.763105
2       C  50.069218
Possible multicollinearity detected.
