In [1]:
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress every warning of category ConvergenceWarning from here on.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

In [2]:

# Cluster 1, phase files.
cluster1_phase2, cluster1_phase3 = (
    pd.read_csv('data_stationnaires/Cluster1_Phase_II.csv'),
    pd.read_csv('data_stationnaires/Cluster1_Phase_III.csv'),
)
# Cluster 0, phase files.
cluster0_phase2, cluster0_phase3 = (
    pd.read_csv('data_stationnaires/Cluster0_Phase_II.csv'),
    pd.read_csv('data_stationnaires/Cluster0_Phase_III.csv'),
)
# Cluster 1, period files.
cluster1_periode1, cluster1_periode2 = (
    pd.read_csv('data_stationnaires/Cluster1_periode1.csv'),
    pd.read_csv('data_stationnaires/Cluster1_periode2.csv'),
)
# Cluster 0, period files.
cluster0_periode1, cluster0_periode2 = (
    pd.read_csv('data_stationnaires/Cluster0_periode1.csv'),
    pd.read_csv('data_stationnaires/Cluster0_periode2.csv'),
)

In [3]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm


def calculate_vifs(dataframe):
    """Build a two-column table of variance inflation factors.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are scored one by one; its `.values` array is
        what gets passed to `variance_inflation_factor`.

    Returns
    -------
    pandas.DataFrame
        One row per input column, in column order, with the column name in
        "Variable" and its score in "VIF".
    """
    design = dataframe.values
    scores = [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ]
    return pd.DataFrame({"Variable": dataframe.columns, "VIF": scores})

def clean_data(X, threshold=10):
    """Iteratively drop the highest-VIF column until all VIFs are below `threshold`.

    On every pass the VIFs of the current columns are recomputed with
    `calculate_vifs`. The column with the largest VIF is dropped and the
    loop repeats. The 'const' column is never a removal candidate, even when
    its VIF ties with (or exceeds) the largest candidate VIF.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix. It is modified IN PLACE: every removed column is
        dropped from the caller's frame.
    threshold : float, default 10
        Screening stops as soon as every candidate VIF is strictly below
        this value.

    Returns
    -------
    list
        Names of the removed columns, in removal order.

    Notes
    -----
    Prints the list of removed columns and the final VIF table.
    """
    removed_vars = []

    # Loop until every candidate variable has a VIF below the threshold.
    while True:
        VIFs = calculate_vifs(X)
        # Candidates are the non-constant variables with a defined VIF.
        # Working on the numeric column only avoids taking a max over names.
        candidates = VIFs.loc[VIFs["Variable"] != 'const', "VIF"].dropna()
        if candidates.empty or candidates.max() < threshold:
            break  # nothing left to screen, or all VIFs are below the threshold
        # idxmax is evaluated on the candidates only, so a tie with 'const'
        # (e.g. both infinite) can no longer select the constant for removal.
        variable_to_remove = VIFs.loc[candidates.idxmax(), "Variable"]
        X.drop(columns=[variable_to_remove], inplace=True)
        removed_vars.append(variable_to_remove)

    print("Variables removed:", removed_vars)
    print(VIFs)
    return removed_vars



In [4]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster1_phase3.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster1_phase3.drop(columns=removed_vars, inplace=True)
cluster1_phase3.shape


Variables removed: []
              Variable           VIF
0                const  9.900850e+10
1         Unnamed: 0.1  1.611344e+00
2             InVolReg  5.133495e+00
3            OutVolReg  4.692022e+00
4              TVolFin  5.430595e+00
5            WalletReg  4.544030e+00
6            WalletFin  2.994578e+00
7        Coal_price_EU  2.222238e+00
8        Ngas_price_EU  2.506534e+00
9         Oil_price_EU  2.314259e+00
10                 HDD  7.136893e+00
11                 CDD  6.792028e+00
12                 ESI  3.286293e+00
13                 IPI  2.607870e+00
14       InVolReg_lag2  7.416512e+00
15      OutVolReg_lag1  2.637731e+00
16      OutVolReg_lag2  5.845247e+00
17       InVolFin_lag1  4.579894e+00
18       InVolFin_lag2  6.474278e+00
19      OutVolFin_lag2  4.455308e+00
20        TVolFin_lag2  7.777200e+00
21      WalletReg_lag1  3.667074e+00
22      WalletReg_lag2  2.481071e+00
23      WalletFin_lag1  4.538675e+00
24      WalletFin_lag2  3.810731e+00
25          Pric

(94, 41)

In [5]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster1_phase2.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster1_phase2.drop(columns=removed_vars, inplace=True)
cluster1_phase2.shape

Variables removed: ['WalletReg_lag1']
              Variable           VIF
0                const  25803.627557
1         Unnamed: 0.1      2.089658
2             InVolReg      4.767022
3            OutVolReg      7.644123
4             InVolFin      5.949276
5            OutVolFin      3.763212
6            WalletFin      3.426100
7        Coal_price_EU      4.442996
8        Ngas_price_EU      5.835166
9         Oil_price_EU      5.687977
10                 HDD      6.312386
11                 IPI      3.026763
12       InVolReg_lag2      7.784877
13      OutVolReg_lag1      5.065151
14      OutVolReg_lag2      6.178890
15       InVolFin_lag1      3.639064
16       InVolFin_lag2      4.832326
17      OutVolFin_lag1      3.627764
18      OutVolFin_lag2      3.607961
19      WalletFin_lag1      5.450007
20      WalletFin_lag2      7.669523
21          Price_lag1      2.415097
22          Price_lag2      2.457253
23  Coal_price_EU_lag1      5.120313
24  Coal_price_EU_lag2      3.555560


(59, 36)

In [6]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster0_phase3.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster0_phase3.drop(columns=removed_vars, inplace=True)
cluster0_phase3.shape

Variables removed: []
              Variable           VIF
0                const  5.875482e+08
1         Unnamed: 0.1  1.504614e+00
2             InVolReg  4.191238e+00
3            OutVolReg  5.203900e+00
4              TVolFin  2.709047e+00
5            WalletReg  3.723966e+00
6            WalletFin  2.250881e+00
7        Coal_price_EU  1.851158e+00
8        Ngas_price_EU  2.376694e+00
9         Oil_price_EU  2.291131e+00
10                 HDD  8.270715e+00
11                 CDD  7.668709e+00
12                 ESI  2.957012e+00
13                 IPI  2.566581e+00
14       InVolReg_lag1  7.548327e+00
15       InVolReg_lag2  9.526668e+00
16      OutVolReg_lag1  7.329916e+00
17      OutVolReg_lag2  8.854334e+00
18      OutVolFin_lag2  3.396140e+00
19        TVolFin_lag1  3.059906e+00
20      WalletReg_lag1  4.434855e+00
21      WalletReg_lag2  3.913244e+00
22          Price_lag1  1.645574e+00
23          Price_lag2  1.725262e+00
24  Coal_price_EU_lag1  1.914656e+00
25  Coal_price_E

(94, 39)

In [7]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster0_phase2.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster0_phase2.drop(columns=removed_vars, inplace=True)
cluster0_phase2.shape

Variables removed: []
              Variable           VIF
0                const  29136.318408
1         Unnamed: 0.1      6.527687
2             InVolReg      5.370160
3             InVolFin      3.880395
4            OutVolFin      3.839961
5            WalletReg      6.221025
6            WalletFin      6.382337
7        Coal_price_EU      4.796738
8        Ngas_price_EU      3.779502
9         Oil_price_EU      5.338519
10                 HDD      6.307708
11                 IPI      4.266049
12       InVolReg_lag1      8.615176
13       InVolReg_lag2      5.373570
14      OutVolReg_lag1      6.660313
15       InVolFin_lag1      6.793630
16       InVolFin_lag2      4.582504
17      OutVolFin_lag1      3.554270
18      OutVolFin_lag2      4.938507
19      WalletReg_lag2      4.452159
20      WalletFin_lag2      6.255523
21          Price_lag1      2.865827
22          Price_lag2      2.368564
23  Coal_price_EU_lag1      5.284723
24  Coal_price_EU_lag2      4.307547
25  Ngas_price_E

(58, 36)

In [8]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster0_periode1.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster0_periode1.drop(columns=removed_vars, inplace=True)
cluster0_periode1.shape

Variables removed: ['Unnamed: 0.1']
              Variable           VIF
0                const  24550.472545
1              TVolReg      5.223045
2            WalletReg      7.891291
3        Coal_price_EU      5.197935
4        Ngas_price_EU      3.592454
5         Oil_price_EU      4.272498
6                  HDD      5.847825
7                  IPI      3.480217
8        InVolReg_lag1      7.908876
9        InVolReg_lag2      7.423894
10       InVolFin_lag2      7.315615
11      OutVolFin_lag1      7.860903
12      OutVolFin_lag2      7.539461
13        TVolFin_lag1      6.977590
14        TVolFin_lag2      9.828512
15      WalletReg_lag1      7.591830
16      WalletReg_lag2      7.513023
17      WalletFin_lag2      3.155566
18          Price_lag1      3.158702
19          Price_lag2      3.187444
20  Coal_price_EU_lag1      5.936047
21  Coal_price_EU_lag2      4.474689
22  Ngas_price_EU_lag1      4.885786
23  Ngas_price_EU_lag2      5.526023
24   Oil_price_EU_lag1      7.481553
25

(43, 33)

In [9]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster0_periode2.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster0_periode2.drop(columns=removed_vars, inplace=True)
cluster0_periode2.shape

Variables removed: []
              Variable           VIF
0                const  14911.123289
1         Unnamed: 0.1      1.720764
2             InVolReg      3.750571
3            OutVolReg      3.816283
4             InVolFin      2.177731
5            OutVolFin      3.884938
6            WalletFin      6.561803
7        Coal_price_EU      1.729779
8        Ngas_price_EU      2.110240
9         Oil_price_EU      1.923210
10                 HDD      5.963236
11                 CDD      5.777186
12                 IPI      1.782324
13       InVolReg_lag1      6.829337
14       InVolReg_lag2      7.017952
15      OutVolReg_lag1      4.453847
16      OutVolReg_lag2      4.916220
17       InVolFin_lag1      2.286828
18       InVolFin_lag2      1.627651
19      OutVolFin_lag1      4.014247
20      OutVolFin_lag2      4.283064
21      WalletFin_lag2      6.049612
22          Price_lag1      1.508094
23          Price_lag2      1.547763
24  Coal_price_EU_lag1      1.873017
25  Coal_price_E

(105, 37)

In [10]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster1_periode1.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster1_periode1.drop(columns=removed_vars, inplace=True)
cluster1_periode1.shape

Variables removed: []
              Variable           VIF
0                const  72200.047180
1         Unnamed: 0.1      6.223127
2              TVolFin      5.756368
3              TVolReg      2.342383
4            WalletReg      7.671663
5            WalletFin      3.615533
6        Coal_price_EU      2.993239
7        Ngas_price_EU      4.353826
8                  HDD      5.291645
9                  IPI      3.762002
10      OutVolReg_lag1      4.081088
11      OutVolReg_lag2      3.273595
12       InVolFin_lag1      3.562511
13       InVolFin_lag2      3.892654
14      OutVolFin_lag1      3.468488
15      OutVolFin_lag2      5.168971
16      WalletFin_lag1      3.755582
17          Price_lag1      2.303000
18          Price_lag2      2.948489
19  Coal_price_EU_lag1      2.910734
20  Coal_price_EU_lag2      3.224193
21  Ngas_price_EU_lag1      3.595576
22  Ngas_price_EU_lag2      3.729580
23   Oil_price_EU_lag2      3.614132
24            CDD_lag1      5.780149
25            CD

(44, 31)

In [11]:
# Screening matrix: drop 'Price', 'Date' and 'Unnamed: 0', add a constant
# column, then discard rows containing missing values.
# NOTE(review): any other 'Unnamed: ...' column (e.g. 'Unnamed: 0.1') stays in
# X and is screened like a regressor - confirm that this is intended.
X = sm.add_constant(
    cluster1_periode2.drop(columns=['Price', 'Date', 'Unnamed: 0'])
).dropna()
removed_vars = clean_data(X)
# Drop the columns reported by clean_data from the dataset itself.
cluster1_periode2.drop(columns=removed_vars, inplace=True)
cluster1_periode2.shape

Variables removed: []
              Variable       VIF
0                const  9.826841
1         Unnamed: 0.1  1.618376
2             InVolReg  5.269466
3            OutVolReg  4.463993
4             InVolFin  3.116314
5            OutVolFin  4.222565
6        Coal_price_EU  1.838204
7        Ngas_price_EU  2.137446
8         Oil_price_EU  1.891737
9                  HDD  5.981370
10                 CDD  5.938100
11                 IPI  1.954795
12       InVolReg_lag1  8.606305
13       InVolReg_lag2  6.921420
14      OutVolReg_lag1  4.773585
15      OutVolReg_lag2  5.451458
16       InVolFin_lag1  3.627907
17       InVolFin_lag2  3.835339
18      OutVolFin_lag1  5.493400
19      OutVolFin_lag2  3.445031
20      WalletReg_lag1  8.833976
21      WalletReg_lag2  8.175055
22          Price_lag1  1.590164
23          Price_lag2  1.490380
24  Coal_price_EU_lag1  1.971622
25  Coal_price_EU_lag2  1.669482
26  Ngas_price_EU_lag1  2.060460
27  Ngas_price_EU_lag2  1.849918
28   Oil_price_EU_lag

(105, 37)

In [12]:
# Write the screened datasets back to the files they were loaded from.
# index=False: the frames only carry the default row index created by
# pd.read_csv, and writing it would add one more unnamed leading column to
# each file. On the next read that column comes back as an extra
# 'Unnamed: ...' column, so every run of the notebook would grow the files
# by one index artefact.
# NOTE(review): this overwrites the input files, so a second run of the
# notebook starts from already-screened data.
cluster1_phase2.to_csv('data_stationnaires/Cluster1_Phase_II.csv', index=False)
cluster1_phase3.to_csv('data_stationnaires/Cluster1_Phase_III.csv', index=False)
cluster0_phase2.to_csv('data_stationnaires/Cluster0_Phase_II.csv', index=False)
cluster0_phase3.to_csv('data_stationnaires/Cluster0_Phase_III.csv', index=False)
cluster1_periode1.to_csv('data_stationnaires/Cluster1_periode1.csv', index=False)
cluster1_periode2.to_csv('data_stationnaires/Cluster1_periode2.csv', index=False)
cluster0_periode1.to_csv('data_stationnaires/Cluster0_periode1.csv', index=False)
cluster0_periode2.to_csv('data_stationnaires/Cluster0_periode2.csv', index=False)