In [218]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLarsIC

In [219]:
df = pd.read_csv('Data/macro1.csv')
df.head()

Unnamed: 0,sasdate,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,MZMSL,DTCOLNVHFNM,DTCTHFNM,INVEST,VXOCLSx
0,Transform:,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,6.0,6.0,6.0,6.0,2.0,6.0,6.0,6.0,6.0,1.0
1,01/01/1959,2289.8,2151.9,18.191,253747.578885,18234.44037,21.9289,21.5499,20.9407,28.483,...,12.133,2.13,2.45,2.04,,274.9,6476.0,12298.0,84.2,
2,02/01/1959,2299.6,2160.2,18.38,255653.461901,18368.21974,22.3584,21.8408,21.1221,28.6919,...,12.149,2.13,2.46,2.05,,276.0,6476.0,12298.0,83.5,
3,03/01/1959,2314.4,2176.1,18.555,254743.765035,18521.70306,22.6805,21.973,21.2257,28.6919,...,12.169,2.15,2.45,2.07,,277.4,6508.0,12349.0,81.6,
4,04/01/1959,2328.5,2190.3,18.488,259878.438046,18533.1106,23.1636,22.2903,21.5367,29.1445,...,12.211,2.16,2.47,2.08,,278.1,6620.0,12484.0,81.8,


In [220]:
# We remove the first row
df = df.iloc[1:]

In [221]:
# We compute the GrowthRate using 'INDPRO', the index corresponding to the Total industrial production.
df['sasdate'] = pd.to_datetime(df['sasdate'], format='%m/%d/%Y')
df['GrowthRate_INDPRO'] = df['INDPRO'].pct_change() * 100

In [223]:
df.head()

Unnamed: 0,sasdate,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,MZMSL,DTCOLNVHFNM,DTCTHFNM,INVEST,VXOCLSx,GrowthRate_INDPRO
1,1959-01-01,2289.8,2151.9,18.191,253747.578885,18234.44037,21.9289,21.5499,20.9407,28.483,...,2.13,2.45,2.04,,274.9,6476.0,12298.0,84.2,,
2,1959-02-01,2299.6,2160.2,18.38,255653.461901,18368.21974,22.3584,21.8408,21.1221,28.6919,...,2.13,2.46,2.05,,276.0,6476.0,12298.0,83.5,,1.958603
3,1959-03-01,2314.4,2176.1,18.555,254743.765035,18521.70306,22.6805,21.973,21.2257,28.6919,...,2.15,2.45,2.07,,277.4,6508.0,12349.0,81.6,,1.440622
4,1959-04-01,2328.5,2190.3,18.488,259878.438046,18533.1106,23.1636,22.2903,21.5367,29.1445,...,2.16,2.47,2.08,,278.1,6620.0,12484.0,81.8,,2.130024
5,1959-05-01,2342.5,2205.6,18.71,261691.222172,18678.29753,23.5125,22.4754,21.7441,29.2838,...,2.17,2.48,2.08,95.3,280.1,6753.0,12646.0,80.7,,1.506243


In [224]:
# We remove the features in which there are a lot of Nan values
pd.DataFrame(df.isna().sum()).sort_values(0, ascending=False).iloc[:10]

Unnamed: 0,0
ACOGNO,398
TWEXMMTH,168
UMCSENTx,154
ANDENOx,109
VXOCLSx,42
PERMITW,12
PERMITS,12
PERMITMW,12
PERMITNE,12
PERMIT,12


In [225]:
columns_to_drop = ['ACOGNO', 'TWEXMMTH', 'UMCSENTx', 'ANDENOx', 'VXOCLSx']

In [226]:
df = df.drop(columns_to_drop,axis=1)

In [228]:
# We remove the remaining NaN values
df = df.dropna()

In [229]:
# We reorder the columns
columns = df.columns.tolist()
new_column_order = ['sasdate', 'GrowthRate_INDPRO'] + [column for column in columns if column not in ['sasdate', 'GrowthRate_INDPRO']]

In [230]:
df = df[new_column_order]

In [231]:
df.head()

Unnamed: 0,sasdate,GrowthRate_INDPRO,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,...,DDURRG3M086SBEA,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,MZMSL,DTCOLNVHFNM,DTCTHFNM,INVEST
13,1960-01-01,2.625412,2391.2,2252.3,18.91,263212.787497,18762.29853,24.1298,23.3215,22.6512,...,54.611,20.0,12.502,2.22,2.57,2.13,286.7,7362.0,14063.0,75.5
14,1960-02-01,-0.889771,2392.8,2253.8,18.993,267039.890455,18831.78084,23.9151,23.1893,22.5734,...,54.7,20.001,12.522,2.23,2.59,2.14,287.0,7396.0,14144.0,73.3
15,1960-03-01,-0.897759,2397.2,2255.4,19.262,259610.17981,18811.03986,23.7004,23.11,22.5475,...,54.54,20.037,12.529,2.24,2.69,2.14,287.8,7456.0,14239.0,71.5
16,1960-04-01,-0.792814,2405.6,2263.4,19.56,262179.545533,19304.67539,23.5125,23.1629,22.5734,...,54.606,20.163,12.555,2.24,2.61,2.14,288.3,7582.0,14413.0,71.5
17,1960-05-01,-0.113982,2411.7,2269.4,19.166,254056.886617,19016.37565,23.4857,23.2951,22.7289,...,54.6,20.154,12.59,2.24,2.64,2.14,289.1,7673.0,14559.0,71.7


In [232]:
# We remove the INDPRO column as we now have our Growth rate
df.pop('INDPRO')

13      24.1298
14      23.9151
15      23.7004
16      23.5125
17      23.4857
         ...   
663    103.1059
664    103.1885
665    103.6839
666    104.1050
667    104.4497
Name: INDPRO, Length: 655, dtype: float64

In [233]:
'INDPRO' in df.columns

False

In [234]:
num_predictors = len(df.columns) - 2
num_predictors

120

In [246]:
# We initialize the parameters for the Gibbs
T = len(df) # Number of observations
a, b, A, B = 1, 1, 1, 1 # Parameters for Beta laws
q = np.random.beta(a, b, 1)[0]
R_2 = np.random.beta(A, B, 1)[0]

In [247]:
q, R_2

(0.19187518363766315, 0.7603707855695392)

In [248]:
# We define the outcome y and the predictors X
X = df.drop(['sasdate', 'GrowthRate_INDPRO'], axis=1)
y = df['GrowthRate_INDPRO']

In [249]:
# We initialize our z using Lasso Regression.
# First, we need to find the best regularization term using Bayesian Criterion (BIC)
lasso_ic_model = LassoLarsIC(criterion='bic')
lasso_ic_model.fit(X, y)
best_alpha_bic = lasso_ic_model.alpha_

In [250]:
best_alpha_bic

2.236924163052226

In [251]:
# Now, we run the Lasso Regression using our best regularization term.
lasso_model = Lasso(alpha=best_alpha_bic)
lasso_model.fit(X, y)
beta_estimates = lasso_model.coef_

  model = cd_fast.enet_coordinate_descent(


In [188]:
z_initial = (np.abs(beta_estimates) > 0).astype(int)

In [191]:
z_initial

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0])