In [1]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
from numpy.linalg import svd

In [2]:
file_path = "data/Panel_data_calculation_results817.csv"
df = pd.read_csv(file_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name_of_Regions_x  1452 non-null   object 
 1   Year               1452 non-null   int64  
 2   Citycode           1452 non-null   float64
 3   UI                 1452 non-null   float64
 4   Name_of_Regions_y  1452 non-null   object 
 5   UER                1452 non-null   float64
 6   C                  1452 non-null   float64
 7   T                  1452 non-null   float64
 8   D                  1452 non-null   float64
 9   UI_normalized      1452 non-null   float64
 10  UER_normalized     1452 non-null   float64
 11  C2                 1452 non-null   float64
 12  T2                 1452 non-null   float64
 13  D2                 1452 non-null   float64
 14  Delta_UER          1160 non-null   float64
 15  Delta_UI           1160 non-null   float64
 16  DI                 1160 

In [4]:
# Calculate VIF for each year
years = [2006, 2010, 2014, 2018, 2022]
df_filtered = df[df['Year'].isin(years)]

def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(len(df.columns))]
    return vif_data

for year in years:
    print(f"VIF for the year {year}:")
    df_year = df_filtered[df_filtered['Year'] == year][['UI', 'UER']]
    vif = calculate_vif(df_year)
    print(vif)
    print("\n")

VIF for the year 2006:
  feature       VIF
0      UI  1.865744
1     UER  1.865744


VIF for the year 2010:
  feature       VIF
0      UI  1.908362
1     UER  1.908362


VIF for the year 2014:
  feature       VIF
0      UI  1.913401
1     UER  1.913401


VIF for the year 2018:
  feature       VIF
0      UI  1.859703
1     UER  1.859703


VIF for the year 2022:
  feature      VIF
0      UI  1.76316
1     UER  1.76316




In [6]:
# Calculate VIF for each year
years = [2006, 2010, 2014, 2018, 2022]
df_filtered = df[df['Year'].isin(years)]

def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(len(df.columns))]
    return vif_data

for year in years:
    print(f"VIF for the year {year}:")
    df_year = df_filtered[df_filtered['Year'] == year][['UI_normalized', 'UER_normalized']]
    vif = calculate_vif(df_year)
    print(vif)
    print("\n")

VIF for the year 2006:
          feature       VIF
0   UI_normalized  1.535797
1  UER_normalized  1.535797


VIF for the year 2010:
          feature      VIF
0   UI_normalized  1.60186
1  UER_normalized  1.60186


VIF for the year 2014:
          feature       VIF
0   UI_normalized  1.658606
1  UER_normalized  1.658606


VIF for the year 2018:
          feature       VIF
0   UI_normalized  1.647261
1  UER_normalized  1.647261


VIF for the year 2022:
          feature       VIF
0   UI_normalized  1.591913
1  UER_normalized  1.591913




In [7]:
# Function to calculate the Condition Index (CI)
def calculate_condition_index(X):
    _, s, _ = svd(X)
    condition_index = np.max(s) / s
    return condition_index

years = [2006, 2010, 2014, 2018, 2022]
for year in years:
    # Filter the data for the specific year
    df_year = df[df['Year'] == year]
    
    # Select the columns for UI_normalized and UER_normalized
    X = df_year[['UI', 'UER']].values
    
    # Calculate the Condition Index
    condition_index = calculate_condition_index(X)
    
    print(f"Condition Index for the year {year}:")
    print(condition_index)

    # Analyze the maximum condition index
    max_ci = np.max(condition_index)
    if max_ci > 15:
        print(f"Warning: Potential multicollinearity detected in the year {year} with a Condition Index of {max_ci:.2f}")
    else:
        print(f"No significant multicollinearity detected for the year {year}.\n")

Condition Index for the year 2006:
[1.         2.35197039]
No significant multicollinearity detected for the year 2006.

Condition Index for the year 2010:
[1.         2.40269449]
No significant multicollinearity detected for the year 2010.

Condition Index for the year 2014:
[1.        2.3431132]
No significant multicollinearity detected for the year 2014.

Condition Index for the year 2018:
[1.         2.29845135]
No significant multicollinearity detected for the year 2018.

Condition Index for the year 2022:
[1.         2.24312589]
No significant multicollinearity detected for the year 2022.



In [8]:
# Function to calculate the Condition Index (CI)
def calculate_condition_index(X):
    _, s, _ = svd(X)
    condition_index = np.max(s) / s
    return condition_index

years = [2006, 2010, 2014, 2018, 2022]
for year in years:
    # Filter the data for the specific year
    df_year = df[df['Year'] == year]
    
    # Select the columns for UI_normalized and UER_normalized
    X = df_year[['UI_normalized', 'UER_normalized']].values
    
    # Calculate the Condition Index
    condition_index = calculate_condition_index(X)
    
    print(f"Condition Index for the year {year}:")
    print(condition_index)

    # Analyze the maximum condition index
    max_ci = np.max(condition_index)
    if max_ci > 15:
        print(f"Warning: Potential multicollinearity detected in the year {year} with a Condition Index of {max_ci:.2f}")
    else:
        print(f"No significant multicollinearity detected for the year {year}.\n")

Condition Index for the year 2006:
[ 1.         12.04275199]
No significant multicollinearity detected for the year 2006.

Condition Index for the year 2010:
[ 1.        12.4590478]
No significant multicollinearity detected for the year 2010.

Condition Index for the year 2014:
[ 1.         10.75161655]
No significant multicollinearity detected for the year 2014.

Condition Index for the year 2018:
[1.         9.45937318]
No significant multicollinearity detected for the year 2018.

Condition Index for the year 2022:
[1.         8.42770386]
No significant multicollinearity detected for the year 2022.

