Setting up to test each column in the DataFrame for a unit root using the Augmented Dickey Fuller test

In [16]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

In [17]:
file_path = '/Users/asger/Documents/GitHub/Deep_Learning_Techniques/Master/Data/select_technical_all_nonstationary.csv'

df = pd.read_csv(file_path)

df.head()

Unnamed: 0,usd_eur_exchange,SMA_10,SMA_20,MACD_12,RSI,brent,eu_cpi,eu_mro_rate,eu_unemployment_rate,eu_yield_3m,...,eur_stoxx_vix,eur_stoxx,us_cpi,us_federal_fund_rate,us_sp500,us_sp500_vix,us_treasury_yield_3m,us_treasury_yield_10y,us_treasury_yield_30y,us_unemployment
0,-0.0003,1.35163,1.34143,0.009364,40.107019,40.75,2.3,2.0,8.9,2.055906,...,14.5103,2971.12,0.052383,2.25,118.83,13.98,2.33,4.29,4.91,5.4
1,-0.0105,1.35065,1.34163,0.007291,39.923894,41.0,2.3,2.0,8.9,2.064858,...,14.4159,2947.19,0.052383,2.25,118.01,14.09,2.33,4.29,4.88,5.4
2,-0.0125,1.34761,1.341195,0.004746,33.903826,43.25,2.3,2.0,8.9,2.060515,...,13.8016,2966.24,0.052383,2.25,118.61,13.58,2.31,4.29,4.89,5.4
3,0.0047,1.34287,1.340385,0.0017,28.266007,43.28,2.3,2.0,8.9,2.048016,...,13.5709,2979.82,0.052383,2.24,118.44,13.49,2.32,4.29,4.88,5.4
4,0.0052,1.33771,1.339385,-0.00033,32.925766,44.71,2.3,2.0,8.9,2.034898,...,12.9192,2977.21,0.052383,2.26,119.0,13.23,2.36,4.29,4.86,5.4


In [18]:
# Lets set up a for loop that rolls through each column except the date column, testing for a unit root using the statsmodel adfuller test

def check_stationarity(df):
    for column in df.columns:
        result = adfuller(df[column], autolag = 'AIC')
        print(f'ADF Statistic: {result[0]}')
        print(f'p-value: {result[1]}')
        print(f'Critical Values:')
        for key, value in result[4].items():
            print(f'\t{key}: {value}')
        print(f'Number of lags used: {result[2]}')
        print(f'Is {column} stationary? {"Yes" if result[1] < 0.05 else "No"}')
        print('\n')

check_stationarity(df)

ADF Statistic: -67.93618243102965
p-value: 0.0
Critical Values:
	1%: -3.431725411436783
	5%: -2.8621477762930767
	10%: -2.5670935212314934
Number of lags used: 0
Is usd_eur_exchange stationary? Yes


ADF Statistic: -1.9947782871253386
p-value: 0.28886732134328374
Critical Values:
	1%: -3.431734731460428
	5%: -2.8621518937308172
	10%: -2.5670957131211334
Number of lags used: 32
Is SMA_10 stationary? No


ADF Statistic: -1.7820117926330934
p-value: 0.38942819732725265
Critical Values:
	1%: -3.4317344382982022
	5%: -2.8621517642166197
	10%: -2.567095644175108
Number of lags used: 31
Is SMA_20 stationary? No


ADF Statistic: -11.152376374910144
p-value: 2.9219852888432915e-20
Critical Values:
	1%: -3.4317257007880992
	5%: -2.862147904124065
	10%: -2.567093589281407
Number of lags used: 1
Is MACD_12 stationary? Yes


ADF Statistic: -14.979056368225612
p-value: 1.1694609181940813e-27
Critical Values:
	1%: -3.431725411436783
	5%: -2.8621477762930767
	10%: -2.5670935212314934
Number of lags us

In [19]:
non_stationary = df[['SMA_10', 'SMA_20', 'eu_cpi', 'eu_mro_rate', 'eu_unemployment_rate', 'eu_yield_3m','eu_yield_10y', 'eu_yield_30y', 'eur_stoxx', 'us_federal_fund_rate','us_sp500', 'us_treasury_yield_3m', 'us_treasury_yield_10y', 'us_treasury_yield_30y', 'us_unemployment']]

# Differencing the non stationary DataFrame

non_stationary_diff = non_stationary.diff().dropna()

check_stationarity(non_stationary_diff)

ADF Statistic: -10.014888199729953
p-value: 1.7393897806437205e-17
Critical Values:
	1%: -3.431734731460428
	5%: -2.8621518937308172
	10%: -2.5670957131211334
Number of lags used: 31
Is SMA_10 stationary? Yes


ADF Statistic: -10.19183710435024
p-value: 6.29248851996659e-18
Critical Values:
	1%: -3.4317344382982022
	5%: -2.8621517642166197
	10%: -2.567095644175108
Number of lags used: 30
Is SMA_20 stationary? Yes


ADF Statistic: -10.442247394999049
p-value: 1.5114209402190425e-18
Critical Values:
	1%: -3.431732681928294
	5%: -2.862150988281327
	10%: -2.5670952311111415
Number of lags used: 24
Is eu_cpi stationary? Yes


ADF Statistic: -8.232377310573472
p-value: 6.01464055323788e-13
Critical Values:
	1%: -3.4317350247468372
	5%: -2.8621520232998643
	10%: -2.5670957820963594
Number of lags used: 32
Is eu_mro_rate stationary? Yes


ADF Statistic: -10.600738460773202
p-value: 6.180711890184115e-19
Critical Values:
	1%: -3.4317320974615604
	5%: -2.8621507300734508
	10%: -2.567095093655921

In [20]:
# Dropping the non stationary columns from the original dataframe df
df = df.drop(columns = ['SMA_10', 'SMA_20', 'eu_cpi', 'eu_mro_rate', 'eu_unemployment_rate', 'eu_yield_3m','eu_yield_10y', 'eu_yield_30y', 'eur_stoxx', 'us_federal_fund_rate','us_sp500', 'us_treasury_yield_3m', 'us_treasury_yield_10y', 'us_treasury_yield_30y', 'us_unemployment'])

# combining the stationary df with the differenced non stationary df
df = pd.concat([df, non_stationary_diff], axis = 1)

# Removing the first observation
df = df.iloc[2:]

df.to_csv('/Users/asger/Documents/GitHub/Deep_Learning_Techniques/Master/Data/final_dataset.csv', index = False)

In [22]:
# Creating data for second iteration

df_2 = df

In [23]:
# Removing the following columns; brent, eu_yield_3m, eu_yield_10y, eu_yield_30y, us_treasury_yield_3m, us_treasury_yield_10y, us_treasury_yield_30y, us_sp500, eur_stoxx

df_2 = df_2.drop(columns = ['brent', 'eu_yield_3m', 'eu_yield_10y', 'eu_yield_30y', 'us_treasury_yield_3m', 'us_treasury_yield_10y', 'us_treasury_yield_30y', 'us_sp500', 'eur_stoxx'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4756 entries, 2 to 4757
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usd_eur_exchange      4756 non-null   float64
 1   MACD_12               4756 non-null   float64
 2   RSI                   4756 non-null   float64
 3   eur_stoxx_vix         4756 non-null   float64
 4   us_cpi                4756 non-null   float64
 5   us_sp500_vix          4756 non-null   float64
 6   SMA_10                4756 non-null   float64
 7   SMA_20                4756 non-null   float64
 8   eu_cpi                4756 non-null   float64
 9   eu_mro_rate           4756 non-null   float64
 10  eu_unemployment_rate  4756 non-null   float64
 11  us_federal_fund_rate  4756 non-null   float64
 12  us_unemployment       4756 non-null   float64
dtypes: float64(13)
memory usage: 483.2 KB


In [25]:
df_2.to_csv('/Users/asger/Documents/GitHub/Deep_Learning_Techniques/Master/Data/final_dataset_iteration2.csv', index = False)