# Regime Forecasting II

This notebook prepares the dataset for recession forecasting: it cleans the data and applies feature selection to reduce the number of variables.

## 1. Set Up Environment and Read Data <a id="1"></a>

In [2]:
#load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from statsmodels.tsa.stattools import adfuller #to check unit root in time series 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

import seaborn as sns #for correlation heatmap

import warnings
# NOTE(review): with no category or module argument this silences every warning
# raised for the rest of the session, including library deprecation and
# performance warnings; consider narrowing the filter once the noisy sources are known.
warnings.filterwarnings('ignore')

In [3]:
# Regime labels live in their own file; load them first so the macro panel
# can be assembled in one go below.
Recession_periods = pd.read_csv('Recession_Periods.csv')

# Macro panel, with the source date column renamed to 'Date'.
bigmacro = pd.read_csv("Macroeconomic_Variables.csv").rename(columns={'sasdate': 'Date'})

# The labels are attached by position (raw values, no index/date alignment),
# so both files are assumed to list the same periods in the same order
# -- TODO confirm against the data source.
bigmacro.insert(loc=1, column="Regime", value=Recession_periods['Regime'].values)

bigmacro.head()

Unnamed: 0,Date,Regime,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,...,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,MZMSL,DTCOLNVHFNM,DTCTHFNM,INVEST,VXOCLSx
0,1/1/59,Normal,2437.296,2288.8,17.302,292258.8329,18235.77392,22.6248,23.4555,22.1893,...,11.358,2.13,2.45,2.04,,274.9,6476.0,12298.0,84.2043,
1,2/1/59,Normal,2446.902,2297.0,17.482,294429.5453,18369.56308,23.0679,23.772,22.3816,...,11.375,2.14,2.46,2.05,,276.0,6476.0,12298.0,83.528,
2,3/1/59,Normal,2462.689,2314.0,17.647,293425.3813,18523.05762,23.4002,23.9159,22.4914,...,11.395,2.15,2.45,2.07,,277.4,6508.0,12349.0,81.6405,
3,4/1/59,Normal,2478.744,2330.3,17.584,299331.6505,18534.466,23.8987,24.2613,22.821,...,11.436,2.16,2.47,2.08,,278.1,6620.0,12484.0,81.8099,
4,5/1/59,Normal,2493.228,2345.8,17.796,301372.9597,18679.66354,24.2587,24.4628,23.0407,...,11.454,2.17,2.48,2.08,95.3,280.1,6753.0,12646.0,80.7315,


## 2. Data Cleaning <a id="2"></a>

We will follow the steps below to clean the data and make it ready for the feature selection process.

1. Remove the variables with more than 10 missing observations, then drop any remaining rows that contain missing values
2. Add lags of the variables as additional features
3. Test the stationarity of each time series and difference the non-stationary ones
4. Standardize the dataset

In [4]:
# Drop sparsely populated columns, then any rows that still contain gaps.
# A column is discarded when more than 10 of its values are missing; every
# discarded column is reported as "<name>:<number of missing values>".
na_per_column = bigmacro.drop(['Date', 'Regime'], axis=1).isna().sum()
too_sparse = na_per_column[na_per_column > 10]
for name, n_missing in too_sparse.items():
    print(name + ':' + str(n_missing))
missing_colnames = too_sparse.index.tolist()

# Columns go first so the row-wise dropna only reacts to gaps in the
# columns that are actually kept.
bigmacro = bigmacro.drop(columns=missing_colnames).dropna(axis=0)

bigmacro.shape

PERMIT:13
PERMITNE:13
PERMITMW:13
PERMITS:13
PERMITW:13
ACOGNO:398
ANDENOx:110
TWEXMMTH:168
UMCSENTx:155
VXOCLSx:42


(718, 120)

In [5]:
# Add lags
# Every feature (all columns except Date and Regime) gets lagged copies named
# "<column> <n>M lag". All lagged series are built first and attached with a
# single concat: inserting hundreds of columns one at a time fragments the
# DataFrame's internal blocks and is needlessly slow.
# shift() moves values by rows; the "M" in the name assumes consecutive rows
# are consecutive months -- TODO confirm no months were removed by the
# earlier row-wise dropna.
# No forward-fill is applied: the frame has no NaN at this point and a
# forward-fill cannot fill the leading gap that shift() creates.
lag_steps = [3, 6, 9, 12, 18]
base_features = bigmacro.drop(['Date', 'Regime'], axis=1)
lagged_columns = [
    base_features[col].shift(n).rename('{} {}M lag'.format(col, n))
    for col in base_features.columns
    for n in lag_steps
]
bigmacro = pd.concat([bigmacro] + lagged_columns, axis=1)

# 1 month ahead prediction: each row's features are paired with the regime
# of the following row.
bigmacro["Regime"] = bigmacro["Regime"].shift(-1)

# Removes the leading rows left incomplete by the longest lag and the final
# row, whose shifted target is undefined.
bigmacro = bigmacro.dropna(axis=0)

In [6]:
# Shape check after adding lags. In the recorded run the 718 rows lose the
# first 18 (longest lag) and the last one (target shifted one step ahead),
# and 118 features x 5 lags are added to the existing 120 columns.
bigmacro.shape

(699, 710)

The Augmented Dickey-Fuller (ADF) test can be used to test for stationarity in macroeconomic time series variables. We will use the `adfuller` function from the `statsmodels` module in Python. Any series whose p-value exceeds the 1% significance level is replaced by its first difference, and the test is then repeated. More information about the function can be found __[here](https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html)__.

In [7]:
#check stationarity
# First differencing pass. adfuller (imported in the notebook's import cell)
# returns the p-value at index 1; the ADF null hypothesis is a unit root, so
# a p-value above the threshold means non-stationarity cannot be rejected and
# the column is replaced by its first difference.
# Each column is tested on its own values before anything is modified, so the
# tests are run first and all flagged columns are differenced together.
# NOTE(review): a variable and its lagged copies are tested independently and
# can therefore end up with different transformations -- confirm this is intended.
threshold = 0.01  #significance level
feature_columns = bigmacro.drop(['Date', 'Regime'], axis=1).columns
needs_diff = [
    column for column in feature_columns
    if adfuller(bigmacro[column])[1] > threshold
]
if needs_diff:
    bigmacro[needs_diff] = bigmacro[needs_diff].diff()

# diff() leaves NaN in the first row of every differenced column.
bigmacro = bigmacro.dropna(axis=0)

In [8]:
# Second differencing pass: re-run the ADF test on every feature and take one
# more first difference of any column whose p-value is still above the threshold.
threshold = 0.01  #significance level
candidates = bigmacro.drop(['Date', 'Regime'], axis=1).columns
still_nonstationary = []
for name in candidates:
    p_value = adfuller(bigmacro[name])[1]
    if p_value > threshold:
        still_nonstationary.append(name)

# Columns are independent of each other, so the flagged ones can be
# differenced in a single assignment after all tests have run.
if still_nonstationary:
    bigmacro[still_nonstationary] = bigmacro[still_nonstationary].diff()

# Drop the row(s) that differencing left incomplete.
bigmacro = bigmacro.dropna(axis=0)

In [9]:
# Final check only: nothing is transformed here. The name of every feature
# whose ADF p-value is still above the threshold is printed, one per line.
threshold = 0.01  #significance level
p_values = {
    name: adfuller(bigmacro[name])[1]
    for name in bigmacro.drop(['Date', 'Regime'], axis=1).columns
}
for name, p_value in p_values.items():
    if p_value > threshold:
        print(name)

bigmacro = bigmacro.dropna(axis=0)

CES0600000008 6M lag


In [10]:
# Standardize
# Every feature is scaled with StandardScaler (imported in the notebook's
# import cell); Date and Regime are set aside and re-attached afterwards.
# NOTE(review): the scaler is fitted on the full sample. If the data is later
# split into training and test periods, the scaling statistics will include
# information from the test period -- confirm this is acceptable.
features = bigmacro.drop(['Date', 'Regime'], axis=1)
col_names = features.columns

scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# The scaled values come back without labels, so the frame is rebuilt with the
# original column names and a fresh default index; Date and Regime are
# re-attached by position from the unscaled frame.
df = pd.DataFrame(data=standardized_features, columns=col_names)
df.insert(loc=0, column="Date", value=bigmacro['Date'].values)
df.insert(loc=1, column='Regime', value=bigmacro['Regime'].values)

df.shape

(697, 710)

In [11]:
# Persist the standardized dataset; index=False keeps the DataFrame index
# out of the CSV so only Date, Regime and the feature columns are written.
df.to_csv("Dataset_Cleaned.csv", index=False)