# Data Generating Process

In [28]:
import os
import numpy as np
from scipy.stats import norm, t, uniform
path = './Simu'
name1 = '/SimuData_p50'
name2 = '/SimuData_p100'
N = 200
m = 100
T = 180
stdv = 0.05
theta_w = 0.02
stde = 0.05
q = 0.95
M = 1

# Create directories
os.makedirs(path, exist_ok=True)
os.makedirs(path + name1, exist_ok=True)
os.makedirs(path + name2, exist_ok=True)

# Case Pc=100
rho = uniform.rvs(0.9, 0.1, size=(m, 1))
c = np.zeros((N * T, m))

for i in range(m):
    x = np.zeros((N, T))
    x[:, 0] = norm.rvs(0, 1, size=(N,))
    for t in range(1, T):
        x[:, t] = rho[i] * x[:, t - 1] + norm.rvs(0, 1, size=(N,)) * np.sqrt(1 - rho[i]**2)
    r = np.argsort(x, axis=0)
    szx = x.shape
    x1 = np.zeros(szx)
    ridx = np.arange(1, szx[0] + 1)
    for k in range(szx[1]):
        x1[r[:, k], k] = ridx * 2 / (N + 1) - 1
    c[:, i] = x1.flatten()

per = np.repeat(np.arange(1, N + 1), T)
time = np.tile(np.arange(1, T + 1), N)
vt = np.random.normal(0, 1, size=(3, T)) * stdv
beta = c[:, :3]
betav = np.zeros(N * T)

for t in range(T):
    ind = (time == t + 1)
    betav[ind] = np.dot(beta[ind, :], vt[:, t])

y = np.zeros(T)
y[0] = np.random.normal(0, 1)
for t in range(1, T):
    y[t] = q * y[t - 1] + np.random.normal(0, 1) * np.sqrt(1 - q**2)

cy = c.copy()
for t in range(T):
    ind = (time == t + 1)
    cy[ind, :] = c[ind, :] * y[t]

ep = np.random.standard_t(df=5, size=N * T) * stde

# Model 1
theta = np.concatenate(([1, 1], np.repeat(0, m - 2), [0, 0, 1], np.repeat(0, m - 3))) * theta_w
r1 = np.hstack((c, cy)).dot(theta) + betav + ep
rt = np.hstack((c, cy)).dot(theta)

# Saving arrays to CSV files
pathc = f"{path}{name2}/c{M}.csv"
np.savetxt(pathc, np.hstack((c, cy)), delimiter=',')

pathr = f"{path}{name2}/r1_{M}.csv"
np.savetxt(pathr, r1, delimiter=',')

In [29]:
# Create directories
os.makedirs(os.path.join(path, name2, '/r2'), exist_ok=True)

# Model 2
theta = np.concatenate(([1, 1], np.repeat(0, m - 2), [0, 0, 1], np.repeat(0, m - 3))) * theta_w
z = np.hstack((c, cy))
z[:, 0] = c[:, 0]**2 * 2
z[:, 1] = c[:, 0] * c[:, 1] * 1.5
z[:, m + 2] = np.sign(cy[:, 2]) * 0.6

r1 = z.dot(theta) + betav + ep
rt = z.dot(theta)

# Saving arrays to CSV files
pathr = f"{path}{name2}/r2_{M}.csv"
np.savetxt(pathr, r1, delimiter=',')

-----------

In [30]:
m = 50

# Model 1
theta = np.concatenate(([1, 1], np.repeat(0, m - 2), [0, 0, 1], np.repeat(0, m - 3))) * theta_w
r1 = np.hstack((c[:, :m], cy[:, :m])).dot(theta) + betav + ep
rt = np.hstack((c[:, :m], cy[:, :m])).dot(theta)

# Saving arrays to CSV files
pathc = f"{path}{name1}/c{M}.csv"
np.savetxt(pathc, np.hstack((c, cy)), delimiter=',')

pathr = f"{path}{name1}/r1_{M}.csv"
np.savetxt(pathr, r1, delimiter=',')


os.makedirs(os.path.join(path, name1, '/r2'), exist_ok=True)

# Model 2
theta = np.concatenate(([1, 1], np.repeat(0, m - 2), [0, 0, 1], np.repeat(0, m - 3))) * theta_w
z_model2 = np.hstack((c[:, :m], cy[:, :m]))
z_model2[:, 0] = c[:, 0]**2 * 2
z_model2[:, 1] = c[:, 0] * c[:, 1] * 1.5
z_model2[:, m + 2] = np.sign(cy[:, 2]) * 0.6

r1 = z_model2.dot(theta) + betav + ep
rt = z_model2.dot(theta)

# Saving arrays to CSV files
pathr = f"{path}{name1}/r2_{M}.csv"
np.savetxt(pathr, r1, delimiter=',')

# Linear Models

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from tqdm import tqdm

First define few functions

In [5]:
def fw1(x):
    # Find the maximum location of a vector
    maximum = np.max(x)
    p = np.where(x == maximum)[0]
    if len(p) > 1:
        p = p[0]
    return p

def pls(X, y, A):
    """
    Partial Least Squares (PLS) regression

    Parameters:
    X : array-like, shape (n_samples, n_features)
        Training data.
    y : array-like, shape (n_samples,)
        Target values.
    A : int
        Number of components.

    Returns:
    B : array, shape (n_features, A)
        Coefficients for each component.
    """
    s = X.T.dot(y)
    R = np.zeros((X.shape[1], A))
    TT = np.zeros((X.shape[0], A))
    P = np.zeros((X.shape[1], A))
    U = np.zeros((X.shape[0], A))
    V = np.zeros((X.shape[1], A))
    B = np.zeros((X.shape[1], A))
    Q = np.zeros((1, A))

    for i in range(A):
        q = s.T.dot(s)
        r = s.dot(q)
        t = X.dot(r)
        t = t - np.mean(t)
        normt = np.sqrt(t.T.dot(t))
        t = t / normt
        r = r / normt
        p = X.T.dot(t)
        q = y.T.dot(t)
        u = y * q
        v = p
        if i > 0:
            v = v - V[:, :i+1].dot(V[:, :i+1].T.dot(p))
            u = u - TT[:, :i+1].dot(TT[:, :i+1].T.dot(u))
        v = v / np.sqrt(v.T.dot(v))
        s = s - v.dot(v.T.dot(s))

        R[:, i] = r
        TT[:, i] = t
        P[:, i] = p
        U[:, i] = u
        V[:, i] = v
        Q[:, i] = q

    for i in range(A - 1):
        C = R[:, :i+1].dot(Q[:, :i+1].T)
        B[:, i+1] = C[:, 0]

    return B

def soft_threshodl(groups, nc, w, mu):
    """
    Soft thresholding operator

    Parameters:
    groups : int
        Not used in the function, placeholder for the MATLAB code.
    nc : int
        Not used in the function, placeholder for the MATLAB code.
    w : array-like
        Input array.
    mu : float
        Threshold parameter.

    Returns:
    val : array-like
        Soft thresholded array.
    """
    val = np.sign(w) * np.maximum(np.abs(w) - mu, 0)
    return val

def lossh(y, yhat, mu):
    """
    Loss function for proximalH

    Parameters:
    y : array-like
        True target values.
    yhat : array-like
        Predicted values.
    mu : float
        Threshold parameter.

    Returns:
    m : float
        Loss value.
    """
    r = np.abs(yhat - y)
    l = np.zeros(len(r))
    ind = (r > mu)
    l[ind] = 2 * mu * r[ind] - mu * mu
    ind = (r <= mu)
    l[ind] = r[ind] * r[ind]
    m = np.mean(l)
    return m

def f_gradh(w, X, y, mu):
    """
    Gradient of the loss function for proximalH

    Parameters:
    w : array-like
        Coefficients.
    X : array-like
        Training data.
    y : array-like
        Target values.
    mu : float
        Threshold parameter.

    Returns:
    grad : array-like
        Gradient.
    """
    r = np.dot(X, w) - y
    ind0 = np.where(np.abs(r) <= mu)[0]
    ind1 = np.where(r > mu)[0]
    indf1 = np.where(r < -mu)[0]
    grad = np.dot(X[ind0, :].T, np.dot(X[ind0, :], w) - y[ind0]) + mu * np.dot(X[ind1, :].T, np.ones(len(ind1))) - mu * np.dot(X[indf1, :].T, np.ones(len(indf1)))
    return grad

def proximalH(groups, nc, xtest, mtrain, ytest, w, X, y, mu, tol, L, l2, func):
    """
    Proximal operator using accelerated proximal gradient descent

    Parameters:
    groups : int
        Not used in the function, placeholder for the MATLAB code.
    nc : int
        Not used in the function, placeholder for the MATLAB code.
    xtest : array-like
        Test data.
    mtrain : float
        Mean of the training target values.
    ytest : array-like
        Test target values.
    w : array-like
        Initial guess of the coefficients.
    X : array-like
        Training data.
    y : array-like
        Target values.
    mu : float
        Threshold parameter.
    tol : float
        Tolerance parameter for convergence.
    L : float
        Lipschitz constant.
    l2 : float
        Regularization parameter.
    func : function
        Soft thresholding function.

    Returns:
    a : array-like
        Final coefficients after proximal gradient descent.
    """
    dim = X.shape[1]
    max_iter = 3000
    gamma = 1 / L
    l1 = l2
    v = w.copy()
    yhatbig1 = np.dot(xtest, w) + mtrain
    r20 = lossh(yhatbig1, ytest, mu)

    for t in range(max_iter):
        vold = v.copy()
        w_perv = w.copy()
        w = v - gamma * f_gradh(v, X, y, mu)
        mu1 = l1 * gamma
        w = func(groups, nc, w, mu1)
        v = w + t / (t + 3) * (w - w_perv)

        if np.sum((v - vold) ** 2) < (np.sum(vold ** 2) * tol) or np.sum(np.abs(v - vold)) == 0:
            break

    return v

def proximal(groups, nc, XX, XY, tol, L, l2, func):
    dim = XX.shape[0]
    max_iter = 30000
    gamma = 1 / L
    l1 = l2
    w = np.zeros(dim)
    v = w.copy()

    for t in range(max_iter):
        vold = v.copy()
        w_prev = w.copy()
        w = v - gamma * f_grad(XX, XY, v)
        w = func(groups, nc, w, l1 * gamma)
        v = w + t / (t + 3) * (w - w_prev)
        if (np.sum(np.power(v - vold, 2)) < (np.sum(np.power(vold, 2)) * tol)) or (np.sum(np.abs(v - vold)) == 0):
            break

    return v

def f_grad(XX, XY, w):
    """
    Gradient of the objective function.

    Parameters:
    XX (array): Design matrix.
    XY (array): Target values.
    w (array): Coefficients.

    Returns:
    grad (array): Gradient.
    """
    grad = np.dot(XX, w) - XY
    return grad

def soft_threshodr(groups, nc, w, mu):
    """
    Soft thresholding function for ridge regularization.

    Parameters:
    groups (array): Groups.
    nc (int): Number of components.
    w (array): Coefficients.
    mu (float): Threshold parameter.

    Returns:
    val (array): Updated coefficients after soft thresholding.
    """
    val = w / (1 + mu)
    return val

def cut_knots_degree2(x, n, th):
    """
    cut_knots_degree2
    """
    a, b = x.shape
    resultfinal = np.zeros((a, b * (n + 1)))

    for i in range(b):
        xcut = x[:, i]
        xcutnona = np.copy(xcut)
        xcutnona[np.isnan(xcutnona)] = 0
        index = ((1 - 1 * np.isnan(xcut)) == 1)

        t = th[:, i]

        x1 = xcutnona
        resultfinal[:, (n + 1) * i - n] = x1 - np.mean(x1)
        x1 = np.power(xcutnona - t[0], 2)
        resultfinal[:, (n + 1) * i - n + 1] = x1 - np.mean(x1)

        for j in range(n - 1):
            x1 = np.power(xcutnona - t[j + 1], 2) * (xcutnona >= t[j + 1])
            resultfinal[:, (n + 1) * i - n + 1 + j] = x1 - np.mean(x1)

    return resultfinal

def soft_threshode(groups, nc, w, mu):
    # soft_threshode
    val = np.sign(w) * np.maximum(np.abs(w) - 0.5 * mu, 0) / (1 + 0.5 * mu)
    return val

def soft_threshodg(groups, nc, w, mu):
    w1 = np.copy(w)
    for i in range(1, nc + 1):
        ind = (groups == i)
        wg = w1[ind]
        nn = len(wg)
        n2 = np.sqrt(np.sum(wg ** 2))
        if n2 <= mu:
            w1[ind] = np.zeros(nn)
        else:
            w1[ind] = wg - mu * wg / n2
    return w1

Get to the models

In [6]:
MC = 1  # setup MC number
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = f"{path}/SimuData_p{datanum}/"

hh = [1]
# hh = [1, 3, 6, 12]  # correspond to monthly, quarterly, half-year, and annually returns
title = f"{path}/Simu_p{datanum}/Reg{hh[0]}"
if not os.path.exists(title) and MC == 1:
    os.makedirs(title)
titleB = f"{title}/B"
if not os.path.exists(titleB) and MC == 1:
    os.makedirs(titleB)
if datanum == 50:
    nump = 50
elif datanum == 100:
    nump = 100
mu = 0.2 * np.sqrt(hh[0])
tol = 10**(-10)

for M in range(1, 2):  # Assuming range for M
    for mo in range(1, 2):  # Assuming range for mo

        print('### MCMC : {}, Model : {} ###'.format(M, mo))

        N = 200   # Number of CS tickers
        m = nump * 2   # Number of Characteristics
        T = 180   # Number of Time Periods

        per = np.tile(np.arange(1, N+1), T)
        time = np.repeat(np.arange(1, T+1), N)
        stdv = 0.05
        theta_w = 0.005

        # Read Files
        path1 = dirstock + 'c' + str(M) + '.csv'
        path2 = dirstock + 'r' + str(mo) + '_' + str(M) + '.csv'
        c = np.genfromtxt(path1, delimiter=',')
        r1 = np.genfromtxt(path2, delimiter=',')

        # Add Some Elements
        daylen = np.repeat(N, T//3)
        daylen_test = daylen.copy()
        ind = np.arange(0, N*T//3)
        xtrain = c[ind]
        ytrain = r1[ind]
        trainper = per[ind]
        start_idx = math.floor(N * T / 3) + 1
        end_idx = math.floor(N * (T * 2 / 3 - hh[0] + 1))
        ind = list(range(start_idx, end_idx))
        xtest = c[ind]
        ytest = r1[ind]
        testper = per[ind]

        l1 = c.shape[0]
        l2 = len(r1)
        l3 = l2 - np.isnan(r1).sum()

        ind = np.arange(N*(2*T//3), min([l1, l2, l3]))
        xoos = c[ind]
        yoos = r1[ind]

        # Monthly Demean
        ytrain_demean = ytrain - np.mean(ytrain)
        ytest_demean = ytest - np.mean(ytest)
        mtrain = np.mean(ytrain)
        mtest = np.mean(ytest)

        # Calculate Sufficient Stats
        sd = np.zeros(xtrain.shape[1])
        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] /= s
                xtest[:, i] /= s
                xoos[:, i] /= s
                sd[i] = s

        XX = np.dot(xtrain.T, xtrain)
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        Y = ytrain_demean
        XY = np.dot(xtrain.T, Y)


        #OLS

        # Initialize arrays
        r2_oos = np.zeros(13)
        r2_is = np.zeros(13)
        modeln = 0
        groups = 0
        nc = 0

        # OLS
        modeln += 1
        clf = LinearRegression(fit_intercept=False)
        clf.fit(xtrain, ytrain_demean)
        yhatbig1 = clf.predict(xoos) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = clf.predict(xtrain) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        b = clf.coef_
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS R2: {r2_oos[modeln - 1]:.3f}")

        # OLS+H
        modeln += 1
        func = soft_threshodl
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, 0, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln - 1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS+H R2: {r2_oos[modeln - 1]:.3f}")

        # PCR
        modeln += 1
        ne = 30
        X = np.dot(xtrain.T, xtrain)
        pca_vec = V.T
        p1 = pca_vec[:, :ne]
        Z = np.dot(xtrain, p1)
        r = np.zeros((3, ne))
        B = np.zeros((xtrain.shape[1], ne))
        Y = ytrain_demean

        for j in range(ne - 1):
            xx = Z[:, :j + 1]
            b = np.dot(np.linalg.inv(np.dot(xx.T, xx)), np.dot(xx.T, Y))
            b = np.dot(p1[:, :j + 1], b)

            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1 - ytest, 2)) / np.sum(np.power(ytest - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
            B[:, j] = b

        b = np.zeros(xtest.shape[1])
        j = ne - 1
        yhatbig1 = np.dot(xtest, b) + mtrain
        r[0, j] = 1 - np.sum(np.power(yhatbig1 - ytest, 2)) / np.sum(np.power(ytest - mtrain, 2))
        yhatbig1 = np.dot(xoos, b) + mtrain
        r[1, j] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r[2, j] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        B[:, j] = b

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        b = B[:, fw1(r[0, :].tolist())]
        pathb = f"{title}/B/b{mo}_{M}_{modeln - 1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"PCR R2: {r2_oos[modeln - 1]:.3f}")

        # PLS
        modeln += 1
        B = pls(xtrain, ytrain_demean, 30)
        ne = 30
        r = np.zeros((3, ne))
        Y = ytrain_demean

        for j in range(ne):
            b = B[:, j]
            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1 - ytest, 2)) / np.sum(np.power(ytest - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        b = B[:, fw1(r[0, :].tolist())]
        pathb = f"{title}/B/b{mo}_{M}_{modeln - 1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"PLS R2: {r2_oos[modeln - 1]:.3f}")


        # Elastic Net
        modeln += 1
        lamv = np.arange(-2, 4.1, 0.1)
        alpha = 0.5
        r = np.zeros((3, len(lamv)))

        for j in range(len(lamv)):
            l2 = 10 ** lamv[j]
            func = soft_threshode
            b = proximal(groups, nc, XX, XY, tol, L, l2, func)
            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1 - ytest, 2)) / np.sum(np.power(ytest - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        l2 = 10 ** lamv[int(fw1(r[0]))]
        func = soft_threshode
        b = proximal(groups, nc, XX, XY, tol, L, l2, func)
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        np.savetxt(pathb, b, delimiter=',')
        print('Enet R2 :', np.round(r2_oos[modeln-1], 3))

        modeln += 1
        func = soft_threshode
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, l2, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln-1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln-1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln - 1}.csv"
        np.savetxt(pathb, b, delimiter=',')
        print('Enet+H R2 :', np.round(r2_oos[modeln-1], 3))

        # Group Lasso
        kn = 4
        th = np.zeros((kn, xtrain.shape[1]))
        th[1, :] = 0

        for i in range(xtrain.shape[1]):
            th[:, i] = np.quantile(xtrain[:, i], np.arange(kn) / kn)

        xtrain = cut_knots_degree2(xtrain, kn, th)
        xtest = cut_knots_degree2(xtest, kn, th)
        xoos = cut_knots_degree2(xoos, kn, th)

        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] = xtrain[:, i] / s
                xtest[:, i] = xtest[:, i] / s
                xoos[:, i] = xoos[:, i] / s

        Y = ytrain_demean
        XX = xtrain.T @ xtrain
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        XY = xtrain.T @ Y

        modeln += 1
        lamv = np.arange(0.5, 3.1, 0.1)
        nc = XX.shape[1] // (kn + 1)
        groups = np.repeat(np.arange(1, nc + 1), kn + 1)
        r = np.zeros((3, len(lamv)))

        for j, lam in enumerate(lamv):
            l2 = 10 ** lam
            func = soft_threshodg
            b = proximal(groups, nc, XX, XY, tol, L, l2, func)
            yhatbig1 = xtest @ b + mtrain
            r[0, j] = 1 - np.sum((yhatbig1 - ytest) ** 2) / np.sum((ytest - mtrain) ** 2)
            yhatbig1 = xoos @ b + mtrain
            r[1, j] = 1 - np.sum((yhatbig1 - yoos) ** 2) / np.sum((yoos - mtrain) ** 2)
            yhatbig1 = xtrain @ b + mtrain
            r[2, j] = 1 - np.sum((yhatbig1 - ytrain) ** 2) / np.sum((ytrain - mtrain) ** 2)

        r2_oos[modeln - 1] = r[1, np.int16(fw1(r[0, :]))]
        r2_is[modeln - 1] = r[2, np.int16(fw1(r[0, :]))]
        l2 = 10 ** lamv[np.int16(fw1(r[0, :]))]

        func = soft_threshodg
        b = proximal(groups, nc, XX, XY, tol, L, l2, func)
        pathb = f"{title}/B/b"
        pathb = f"{pathb}_{mo}_{M}_{modeln - 1}.csv"
        np.savetxt(pathb, b, delimiter=",")

        print("Group Lasso R2:", np.round(r2_oos[modeln - 1], 3))

        modeln += 1
        func = soft_threshodg
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, l2, func)
        yhatbig1 = xoos @ b + mtrain
        r2_oos[modeln -1] = 1 - np.sum((yhatbig1 - yoos) ** 2) / np.sum((yoos - mtrain) ** 2)
        yhatbig1 = xtrain @ b + mtrain
        r2_is[modeln - 1] = 1 - np.sum((yhatbig1 - ytrain) ** 2) / np.sum((ytrain - mtrain) ** 2)
        pathb = f"{title}/B/b"
        pathb = f"{pathb}_{mo}_{M}_{modeln - 1}.csv"
        np.savetxt(pathb, b, delimiter=",")

        print("Group Lasso+H R2:", np.round(r2_oos[modeln - 1], 3))

        pathr = f"{title}/roos"
        pathr = f"{pathr}_{mo}_{M}.csv"
        np.savetxt(pathr, r2_oos, delimiter=",")

        pathr = f"{title}/ris"
        pathr = f"{pathr}_{mo}_{M}.csv"
        np.savetxt(pathr, r2_is, delimiter=",")

### MCMC : 1, Model : 1 ###
Simple OLS R2: 0.019
Simple OLS+H R2: 0.020
PCR R2: 0.008
PLS R2: 0.030
Enet R2 : 0.04
Enet+H R2 : 0.04
Group Lasso R2: 0.04
Group Lasso+H R2: 0.04


In [7]:
r2_oos

array([0.01945444, 0.02013166, 0.00830992, 0.02968415, 0.04022508,
       0.04003227, 0.03963787, 0.03969461, 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [8]:
output = pd.DataFrame(columns = ["OLS + H", "OLS-3 + H", "PCR", "PLS", "Enet + H", "GLM + H", "RF", "GBRT + H"], data=np.zeros((3,8)))
output.index = ["All", "Top 1000", "Bottom 1000"]
output.iloc[0,0] =r2_oos[1]
output.iloc[0,2] = r2_oos[2]
output.iloc[0,3] = r2_oos[3]
output.iloc[0,4] = r2_oos[5]
output.iloc[0,5] = r2_oos[7]

In [9]:
output

Unnamed: 0,OLS + H,OLS-3 + H,PCR,PLS,Enet + H,GLM + H,RF,GBRT + H
All,0.020132,0.0,0.00831,0.029684,0.040032,0.039695,0.0,0.0
Top 1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bottom 1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Ensemble Models

In [10]:
MC = 1  # setup MC number
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = os.path.join(path, f'SimuData_p{datanum}/')
hh = [1]
mo = 1
# for hh in [1, 3, 6, 12]:  # correspond to monthly quarterly halfyear and annually returns
title = os.path.join(path, f'Simu_p{datanum}/Tree{hh}')

if not os.path.isdir(title) and MC == 1:
    os.makedirs(title)

titleB = os.path.join(title, 'B')
if not os.path.isdir(titleB) and MC == 1:
    os.makedirs(titleB)

if datanum == 50:
    nump = 50
if datanum == 100:
    nump = 100
N = 200  # Number of CS tickers
m = nump * 2  # Number of Characteristics
T = 180  # Number of Time Periods

per = np.tile(np.arange(1, N + 1), T)
time = np.repeat(np.arange(1, T + 1), N)
stdv = 0.05
theta_w = 0.005

# Read Files
path1 = f"{dirstock}c{M}.csv"
path2 = f"{dirstock}r{mo}_{M}.csv"
c = np.genfromtxt(path1, delimiter=',')
r1 = np.genfromtxt(path2, delimiter=',')

# Add Some Elements
daylen = np.tile(N, T // 3)
daylen_test = daylen
ind = np.arange(0, int(N * T / 3))
xtrain = c[ind, :]
ytrain = r1[ind]
trainper = per[ind]
ind = np.arange(int(N * T / 3), int(N * (T * 2 / 3 + 1)))
xtest = c[ind, :]
ytest = r1[ind]
testper = per[ind]

l1 = c.shape[0]
l2 = len(r1)
l3 = l2 - np.sum(np.isnan(r1))

ind = np.arange(int(N * T * 2 / 3), min(l1, l2, l3))
xoos = c[ind, :]
yoos = r1[ind]

# Monthly Demean
ytrain_demean = ytrain - np.mean(ytrain)
ytest_demean = ytest - np.mean(ytest)
mtrain = np.mean(ytrain)
mtest = np.mean(ytest)

# Start to train
r2_oos = np.zeros(3)  # OOS R2
r2_is = np.zeros(3)  # IS R2

# Random Forest
if nump == 50:
    lamv = np.arange(10, 101, 10)
elif nump == 100:
    lamv = np.arange(10, 201, 20)
ne = 100
lamc = [2, 4, 8, 16, 32]
r = np.zeros((len(lamv), len(lamc), 3))

for n1 in tqdm(range(len(lamv))):
    nf = lamv[n1]
    for n2 in range(len(lamc)):
        nn = lamc[n2]
        clf = RandomForestRegressor(
            n_estimators=ne,
            max_features=nf,
            max_depth=nn
        )
        clf.fit(xtrain, ytrain)
        yhatbig1 = clf.predict(xtest)
        r[n1, n2, 0] = 1 - np.sum(np.power(yhatbig1 - ytest, 2)) / np.sum(
        np.power(ytest - mtrain, 2))
        yhatbig1 = clf.predict(xoos)
        r[n1, n2, 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(
        np.power(yoos - mtrain, 2))
        yhatbig1 = clf.predict(xtrain)
        r[n1, n2, 2] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(
        np.power(ytrain - mtrain, 2))

fw_2 = np.unravel_index(np.argmax(r[:, :, 0]), r[:, :, 0].shape)
r2_oos[0] = r[fw_2[0], fw_2[1], 1]
r2_is[0] = r[fw_2[0], fw_2[1], 2]
print(f"RF R2 : {r2_oos[0]:.3f}")

# Save r2_oos and r2_is to files
pathr = f"{title}/roos_{mo}_{M}.csv"
np.savetxt(pathr, r2_oos.reshape(1, -1), delimiter=",")
pathr = f"{title}/ris_{mo}_{M}.csv"
np.savetxt(pathr, r2_is.reshape(1, -1), delimiter=",")

100%|██████████| 10/10 [30:58<00:00, 185.88s/it]

RF R2 : 0.039





In [17]:
output.iloc[0,6] = r2_oos[0]

In [12]:
#Gradient Boosting
mo = 1

lamv = np.arange(-1, 0.1, 0.2)
r = np.zeros((len(lamv), 50, 3))

for n1 in range(len(lamv)):
    lr = 10 ** lamv[n1]
    alpha = 2
    ne = 50
    clf = GradientBoostingRegressor(
        n_estimators=ne,
        learning_rate=lr,
        loss='huber',
        max_depth=2
    )

    clf.fit(xtrain, ytrain)
    e = clf.staged_predict(xtest)
    for i, pred in enumerate(e):
        r[n1, i, 0] = np.mean((pred - ytest) ** 2)

    e = clf.staged_predict(xoos)
    for i, pred in enumerate(e):
        r[n1, i, 1] = np.mean((pred - yoos) ** 2)

    e = clf.staged_predict(xtrain)
    for i, pred in enumerate(e):
        r[n1, i, 2] = np.mean((pred - ytrain) ** 2)

fw_2 = np.unravel_index(np.argmin(r[:, :, 0]), r[:, :, 0].shape)
err1 = np.mean((ytrain - mtrain) ** 2)
err2 = np.mean((yoos - mtrain) ** 2)
r2_oos[1] = 1 - r[fw_2[0], fw_2[1], 1] / err2
r2_is[1] = 1 - r[fw_2[0], fw_2[1], 2] / err1
print(f"GBRT R2 : {r2_oos[1]:.3f}")

# Save r2_oos and r2_is to files
pathr = f"{title}/roos_{mo}_{M}.csv"
np.savetxt(pathr, r2_oos.reshape(1, -1), delimiter=",")
pathr = f"{title}/ris_{mo}_{M}.csv"
np.savetxt(pathr, r2_is.reshape(1, -1), delimiter=",")

GBRT R2 : 0.043


In [18]:
output.iloc[0,7] = r2_oos[1]

# Additional Manipulations to get top-1000 and bottom-1000 of simulated stocks

In [31]:
x_c = pd.read_csv("./Simu/SimuData_p50/c1.csv")
x_b = pd.read_csv("./Simu/SimuData_p50/r1_1.csv")

In [46]:
updated = pd.merge(x_c, x_b, left_index=True, right_index=True, how='inner')
#assume that the 4th column is the Market Value of the stock. Proceeding to arrange it in accordance to the column values
updated = updated.sort_values(by=updated.columns[3], ascending=False)
rtop_1 = updated.iloc[:,:1]
ctop1 = updated.iloc[:,1:]


In [47]:
rtop_1.to_csv('./Simu/SimuData_p50/r1_1.csv', index=False, header=False)
ctop1.to_csv('./Simu/SimuData_p50/c1.csv', index=False, header=False)

Same simulations for top 1000

In [48]:
MC = 1  # setup MC number
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = f"{path}/SimuData_p{datanum}/"

hh = [1]
# hh = [1, 3, 6, 12]  # correspond to monthly, quarterly, half-year, and annually returns
title = f"{path}/Simu_p{datanum}/Reg{hh[0]}"
if not os.path.exists(title) and MC == 1:
    os.makedirs(title)
titleB = f"{title}/B"
if not os.path.exists(titleB) and MC == 1:
    os.makedirs(titleB)
if datanum == 50:
    nump = 50
elif datanum == 100:
    nump = 100
mu = 0.2 * np.sqrt(hh[0])
tol = 10**(-10)

for M in range(1, 2):  # Assuming range for M
    for mo in range(1, 2):  # Assuming range for mo

        print('### MCMC : {}, Model : {} ###'.format(M, mo))

        N = 200   # Number of CS tickers
        m = nump * 2   # Number of Characteristics
        T = 4  # Number of Time Periods

        per = np.tile(np.arange(1, N+1), T)
        time = np.repeat(np.arange(1, T+1), N)
        stdv = 0.05
        theta_w = 0.005

        # Read Files
        path1 = dirstock + 'c' + str(M) + '.csv'
        path2 = dirstock + 'r' + str(mo) + '_' + str(M) + '.csv'
        c = np.genfromtxt(path1, delimiter=',')
        r1 = np.genfromtxt(path2, delimiter=',')

        # Add Some Elements
        daylen = np.repeat(N, T//3)
        daylen_test = daylen.copy()
        ind = np.arange(0, N*T//3)
        xtrain = c[ind]
        ytrain = r1[ind]
        trainper = per[ind]
        start_idx = math.floor(N * T / 3) + 1
        end_idx = math.floor(N * (T * 2 / 3 - hh[0] + 1))
        ind = list(range(start_idx, end_idx))
        xtest = c[ind]
        ytest = r1[ind]
        testper = per[ind]

        l1 = c.shape[0]
        l2 = len(r1)
        l3 = l2 - np.isnan(r1).sum()

        ind = np.arange(N*(2*T//3), min([l1, l2, l3]))
        xoos = c[ind]
        yoos = r1[ind]

        # Monthly Demean
        ytrain_demean = ytrain - np.mean(ytrain)
        ytest_demean = ytest - np.mean(ytest)
        mtrain = np.mean(ytrain)
        mtest = np.mean(ytest)

        # Calculate Sufficient Stats
        sd = np.zeros(xtrain.shape[1])
        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] /= s
                xtest[:, i] /= s
                xoos[:, i] /= s
                sd[i] = s

        XX = np.dot(xtrain.T, xtrain)
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        Y = ytrain_demean
        XY = np.dot(xtrain.T, Y)


        #OLS

        # Initialize arrays
        r2_oos = np.zeros(13)
        r2_is = np.zeros(13)
        modeln = 0
        groups = 0
        nc = 0

        # OLS
        modeln += 1
        clf = LinearRegression(fit_intercept=False)
        clf.fit(xtrain, ytrain_demean)
        yhatbig1 = clf.predict(xoos) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = clf.predict(xtrain) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        b = clf.coef_
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS R2: {r2_oos[modeln - 1]:.3f}")

        # OLS+H
        modeln += 1
        func = soft_threshodl
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, 0, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS+H R2: {r2_oos[modeln - 1]:.3f}")

        # PCR
        modeln += 1
        ne = 30
        X = np.dot(xtrain.T, xtrain)
        pca_vec = V.T
        p1 = pca_vec[:, :ne]
        Z = np.dot(xtrain, p1)
        r = np.zeros((3, ne))
        B = np.zeros((xtrain.shape[1], ne))
        Y = ytrain_demean

        for j in range(ne - 1):
            xx = Z[:, :j + 1]
            b = np.dot(np.linalg.inv(np.dot(xx.T, xx)), np.dot(xx.T, Y))
            b = np.dot(p1[:, :j + 1], b)

            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytest[:1000], 2)) / np.sum(np.power(ytest[:1000] - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))
            B[:, j] = b

        b = np.zeros(xtest.shape[1])
        j = ne - 1
        yhatbig1 = np.dot(xtest, b) + mtrain
        r[0, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytest[:1000], 2)) / np.sum(np.power(ytest[:1000] - mtrain, 2))
        yhatbig1 = np.dot(xoos, b) + mtrain
        r[1, j] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r[2, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))
        B[:, j] = b

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        b = B[:, fw1(r[0, :].tolist())]
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"PCR R2: {r2_oos[modeln - 1]:.3f}")

        # PLS
        modeln += 1
        B = pls(xtrain, ytrain_demean, 30)
        ne = 30
        r = np.zeros((3, ne))
        Y = ytrain_demean

        for j in range(ne):
            b = B[:, j]
            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytest[:1000], 2)) / np.sum(np.power(ytest[:1000] - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        b = B[:, fw1(r[0, :].tolist())]
        pathb = f"{title}/B/b{mo}_{M}_{modeln - 1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"PLS R2: {r2_oos[modeln - 1]:.3f}")

        # Elastic Net
        modeln += 1
        lamv = np.arange(-2, 4.1, 0.1)
        alpha = 0.5
        r = np.zeros((3, len(lamv)))

        for j in range(len(lamv)):
            l2 = 10 ** lamv[j]
            func = soft_threshode
            b = proximal(groups, nc, XX, XY, tol, L, l2, func)
            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytest[:1000], 2)) / np.sum(np.power(ytest[:1000] - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        l2 = 10 ** lamv[int(fw1(r[0]))]
        func = soft_threshode
        b = proximal(groups, nc, XX, XY, tol, L, l2, func)
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        np.savetxt(pathb, b, delimiter=',')
        print('Enet R2 :', np.round(r2_oos[modeln-1], 3))

        modeln += 1
        func = soft_threshode
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, l2, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln-1] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln-1] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        np.savetxt(pathb, b, delimiter=',')
        print('Enet+H R2 :', np.round(r2_oos[modeln-1], 3))

        # Group Lasso
        kn = 4
        th = np.zeros((kn, xtrain.shape[1]))
        th[1, :] = 0

        for i in range(xtrain.shape[1]):
            th[:, i] = np.quantile(xtrain[:, i], np.arange(kn) / kn)

        xtrain = cut_knots_degree2(xtrain, kn, th)
        xtest = cut_knots_degree2(xtest, kn, th)
        xoos = cut_knots_degree2(xoos, kn, th)

        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] = xtrain[:, i] / s
                xtest[:, i] = xtest[:, i] / s
                xoos[:, i] = xoos[:, i] / s

        Y = ytrain_demean
        XX = xtrain.T @ xtrain
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        XY = xtrain.T @ Y

        modeln += 1
        lamv = np.arange(0.5, 3.1, 0.1)
        nc = XX.shape[1] // (kn + 1)
        groups = np.repeat(np.arange(1, nc + 1), kn + 1)
        r = np.zeros((3, len(lamv)))

        for j, lam in enumerate(lamv):
            l2 = 10 ** lam
            func = soft_threshodg
            b = proximal(groups, nc, XX, XY, tol, L, l2, func)
            yhatbig1 = xtest @ b + mtrain
            r[0, j] = 1 - np.sum((yhatbig1 - ytest) ** 2) / np.sum((ytest - mtrain) ** 2)
            yhatbig1 = xoos @ b + mtrain
            r[1, j] = 1 - np.sum((yhatbig1 - yoos) ** 2) / np.sum((yoos - mtrain) ** 2)
            yhatbig1 = xtrain @ b + mtrain
            r[2, j] = 1 - np.sum((yhatbig1 - ytrain) ** 2) / np.sum((ytrain - mtrain) ** 2)

        r2_oos[modeln-1] = r[1, np.int16(fw1(r[0, :]))]
        r2_is[modeln-1] = r[2, np.int16(fw1(r[0, :]))]
        l2 = 10 ** lamv[np.int16(fw1(r[0, :]))]

        func = soft_threshodg
        b = proximal(groups, nc, XX, XY, tol, L, l2, func)
        pathb = f"{title}/B/b"
        pathb = f"{pathb}_{mo}_{M}_{modeln-1}.csv"
        np.savetxt(pathb, b, delimiter=",")

        print("Group Lasso R2:", np.round(r2_oos[modeln-1], 3))

        modeln += 1
        func = soft_threshodg
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, l2, func)
        yhatbig1 = xoos @ b + mtrain
        r2_oos[modeln-1] = 1 - np.sum((yhatbig1[:1000] - yoos[:1000]) ** 2) / np.sum((yoos[:1000] - mtrain) ** 2)
        yhatbig1 = xtrain @ b + mtrain
        r2_is[modeln-1] = 1 - np.sum((yhatbig1[:1000] - ytrain[:1000]) ** 2) / np.sum((ytrain[:1000] - mtrain) ** 2)
        pathb = f"{title}/B/b"
        pathb = f"{pathb}_{mo}_{M}_{modeln-1}.csv"
        np.savetxt(pathb, b, delimiter=",")

        print("Group Lasso+H R2:", np.round(r2_oos[modeln-1], 3))

        pathr = f"{title}/roos"
        pathr = f"{pathr}_{mo}_{M}.csv"
        np.savetxt(pathr, r2_oos, delimiter=",")

        pathr = f"{title}/ris"
        pathr = f"{pathr}_{mo}_{M}.csv"
        np.savetxt(pathr, r2_is, delimiter=",")

### MCMC : 1, Model : 1 ###
Simple OLS R2: -6.674
Simple OLS+H R2: -3.348
PCR R2: 0.107
PLS R2: 0.000
Enet R2 : 0.161
Enet+H R2 : -0.013
Group Lasso R2: -0.273
Group Lasso+H R2: 0.099


In [49]:
output.iloc[1,0] =r2_oos[1]
output.iloc[1,2] = r2_oos[2]
output.iloc[1,3] = r2_oos[3]
output.iloc[1,4] = r2_oos[5]
output.iloc[1,5] = r2_oos[7]

And for the bottom 1000

In [50]:
MC = 1  # setup MC number
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = f"{path}/SimuData_p{datanum}/"

hh = [1]
# hh = [1, 3, 6, 12]  # correspond to monthly, quarterly, half-year, and annually returns
title = f"{path}/Simu_p{datanum}/Reg{hh[0]}"
if not os.path.exists(title) and MC == 1:
    os.makedirs(title)
titleB = f"{title}/B"
if not os.path.exists(titleB) and MC == 1:
    os.makedirs(titleB)
if datanum == 50:
    nump = 50
elif datanum == 100:
    nump = 100
mu = 0.2 * np.sqrt(hh[0])
tol = 10**(-10)

for M in range(1, 2):  # Assuming range for M
    for mo in range(1, 2):  # Assuming range for mo

        print('### MCMC : {}, Model : {} ###'.format(M, mo))

        N = 200   # Number of CS tickers
        m = nump * 2   # Number of Characteristics
        T = 4  # Number of Time Periods

        per = np.tile(np.arange(1, N+1), T)
        time = np.repeat(np.arange(1, T+1), N)
        stdv = 0.05
        theta_w = 0.005

        # Read Files
        path1 = dirstock + 'c' + str(M) + '.csv'
        path2 = dirstock + 'r' + str(mo) + '_' + str(M) + '.csv'
        c = np.genfromtxt(path1, delimiter=',')
        r1 = np.genfromtxt(path2, delimiter=',')

        # Add Some Elements
        daylen = np.repeat(N, T//3)
        daylen_test = daylen.copy()
        ind = np.arange(0, N*T//3)
        xtrain = c[ind]
        ytrain = r1[ind]
        trainper = per[ind]
        start_idx = math.floor(N * T / 3) + 1
        end_idx = math.floor(N * (T * 2 / 3 - hh[0] + 1))
        ind = list(range(start_idx, end_idx))
        xtest = c[ind]
        ytest = r1[ind]
        testper = per[ind]

        l1 = c.shape[0]
        l2 = len(r1)
        l3 = l2 - np.isnan(r1).sum()

        ind = np.arange(N*(2*T//3), min([l1, l2, l3]))
        xoos = c[ind]
        yoos = r1[ind]

        # Monthly Demean
        ytrain_demean = ytrain - np.mean(ytrain)
        ytest_demean = ytest - np.mean(ytest)
        mtrain = np.mean(ytrain)
        mtest = np.mean(ytest)

        # Calculate Sufficient Stats
        sd = np.zeros(xtrain.shape[1])
        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] /= s
                xtest[:, i] /= s
                xoos[:, i] /= s
                sd[i] = s

        XX = np.dot(xtrain.T, xtrain)
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        Y = ytrain_demean
        XY = np.dot(xtrain.T, Y)


        #OLS

        # Initialize arrays
        r2_oos = np.zeros(13)
        r2_is = np.zeros(13)
        modeln = 0
        groups = 0
        nc = 0

        # OLS
        modeln += 1
        clf = LinearRegression(fit_intercept=False)
        clf.fit(xtrain, ytrain_demean)
        yhatbig1 = clf.predict(xoos) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = clf.predict(xtrain) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        b = clf.coef_
        pathb = f"{title}/B/b{mo}_{M}_{modeln -1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS R2: {r2_oos[modeln - 1]:.3f}")

        # OLS+H
        modeln += 1
        func = soft_threshodl
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, 0, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:]- mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:]- mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS+H R2: {r2_oos[modeln - 1]:.3f}")

        # PCR
        modeln += 1
        ne = 30
        X = np.dot(xtrain.T, xtrain)
        pca_vec = V.T
        p1 = pca_vec[:, :ne]
        Z = np.dot(xtrain, p1)
        r = np.zeros((3, ne))
        B = np.zeros((xtrain.shape[1], ne))
        Y = ytrain_demean

        for j in range(ne - 1):
            xx = Z[:, :j + 1]
            b = np.dot(np.linalg.inv(np.dot(xx.T, xx)), np.dot(xx.T, Y))
            b = np.dot(p1[:, :j + 1], b)

            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytest[-1000:], 2)) / np.sum(np.power(ytest[-1000:] - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:] - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:] - mtrain, 2))
            B[:, j] = b

        b = np.zeros(xtest.shape[1])
        j = ne - 1
        yhatbig1 = np.dot(xtest, b) + mtrain
        r[0, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytest[-1000:], 2)) / np.sum(np.power(ytest[-1000:] - mtrain, 2))
        yhatbig1 = np.dot(xoos, b) + mtrain
        r[1, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r[2, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:] - mtrain, 2))
        B[:, j] = b

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        b = B[:, fw1(r[0, :].tolist())]
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"PCR R2: {r2_oos[modeln - 1]:.3f}")

        # PLS
        modeln += 1
        B = pls(xtrain, ytrain_demean, 30)
        ne = 30
        r = np.zeros((3, ne))
        Y = ytrain_demean

        for j in range(ne):
            b = B[:, j]
            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytest[-1000:], 2)) / np.sum(np.power(ytest[-1000:] - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:] - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:] - mtrain, 2))

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        b = B[:, fw1(r[0, :].tolist())]
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"PLS R2: {r2_oos[modeln - 1]:.3f}")

        # Elastic Net
        modeln += 1
        lamv = np.arange(-2, 4.1, 0.1)
        alpha = 0.5
        r = np.zeros((3, len(lamv)))

        for j in range(len(lamv)):
            l2 = 10 ** lamv[j]
            func = soft_threshode
            b = proximal(groups, nc, XX, XY, tol, L, l2, func)
            yhatbig1 = np.dot(xtest, b) + mtrain
            r[0, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytest[-1000:], 2)) / np.sum(np.power(ytest[-1000:] - mtrain, 2))
            yhatbig1 = np.dot(xoos, b) + mtrain
            r[1, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:] - mtrain, 2))
            yhatbig1 = np.dot(xtrain, b) + mtrain
            r[2, j] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:] - mtrain, 2))

        r2_oos[modeln - 1] = r[1, fw1(r[0, :].tolist())]
        r2_is[modeln - 1] = r[2, fw1(r[0, :].tolist())]
        l2 = 10 ** lamv[int(fw1(r[0]))]
        func = soft_threshode
        b = proximal(groups, nc, XX, XY, tol, L, l2, func)
        pathb = f"{title}/B/b{mo}_{M}_{modeln-1}.csv"
        np.savetxt(pathb, b, delimiter=',')
        print('Enet R2 :', np.round(r2_oos[modeln-1], 3))

        modeln += 1
        func = soft_threshode
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, l2, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln-1] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln-1] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:] - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        np.savetxt(pathb, b, delimiter=',')
        print('Enet+H R2 :', np.round(r2_oos[modeln-1], 3))

        # Group Lasso
        kn = 4
        th = np.zeros((kn, xtrain.shape[1]))
        th[1, :] = 0

        for i in range(xtrain.shape[1]):
            th[:, i] = np.quantile(xtrain[:, i], np.arange(kn) / kn)

        xtrain = cut_knots_degree2(xtrain, kn, th)
        xtest = cut_knots_degree2(xtest, kn, th)
        xoos = cut_knots_degree2(xoos, kn, th)

        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] = xtrain[:, i] / s
                xtest[:, i] = xtest[:, i] / s
                xoos[:, i] = xoos[:, i] / s

        Y = ytrain_demean
        XX = xtrain.T @ xtrain
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        XY = xtrain.T @ Y

        modeln += 1
        lamv = np.arange(0.5, 3.1, 0.1)
        nc = XX.shape[1] // (kn + 1)
        groups = np.repeat(np.arange(1, nc + 1), kn + 1)
        r = np.zeros((3, len(lamv)))

        for j, lam in enumerate(lamv):
            l2 = 10 ** lam
            func = soft_threshodg
            b = proximal(groups, nc, XX, XY, tol, L, l2, func)
            yhatbig1 = xtest @ b + mtrain
            r[0, j] = 1 - np.sum((yhatbig1 - ytest) ** 2) / np.sum((ytest - mtrain) ** 2)
            yhatbig1 = xoos @ b + mtrain
            r[1, j] = 1 - np.sum((yhatbig1 - yoos) ** 2) / np.sum((yoos - mtrain) ** 2)
            yhatbig1 = xtrain @ b + mtrain
            r[2, j] = 1 - np.sum((yhatbig1 - ytrain) ** 2) / np.sum((ytrain - mtrain) ** 2)

        r2_oos[modeln-1] = r[1, np.int16(fw1(r[0, :]))]
        r2_is[modeln-1] = r[2, np.int16(fw1(r[0, :]))]
        l2 = 10 ** lamv[np.int16(fw1(r[0, :]))]

        func = soft_threshodg
        b = proximal(groups, nc, XX, XY, tol, L, l2, func)
        pathb = f"{title}/B/b"
        pathb = f"{pathb}_{mo}_{M}_{modeln-1}.csv"
        np.savetxt(pathb, b, delimiter=",")

        print("Group Lasso R2:", np.round(r2_oos[modeln-1], 3))

        modeln += 1
        func = soft_threshodg
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, l2, func)
        yhatbig1 = xoos @ b + mtrain
        r2_oos[modeln-1] = 1 - np.sum((yhatbig1[-1000:] - yoos[-1000:]) ** 2) / np.sum((yoos[-1000:] - mtrain) ** 2)
        yhatbig1 = xtrain @ b + mtrain
        r2_is[modeln-1] = 1 - np.sum((yhatbig1[-1000:] - ytrain[-1000:]) ** 2) / np.sum((ytrain[-1000:] - mtrain) ** 2)
        pathb = f"{title}/B/b"
        pathb = f"{pathb}_{mo}_{M}_{modeln-1}.csv"
        np.savetxt(pathb, b, delimiter=",")

        print("Group Lasso+H R2:", np.round(r2_oos[modeln-1], 3))

        pathr = f"{title}/roos"
        pathr = f"{pathr}_{mo}_{M}.csv"
        np.savetxt(pathr, r2_oos, delimiter=",")

        pathr = f"{title}/ris"
        pathr = f"{pathr}_{mo}_{M}.csv"
        np.savetxt(pathr, r2_is, delimiter=",")

### MCMC : 1, Model : 1 ###
Simple OLS R2: -6.674
Simple OLS+H R2: -7.659
PCR R2: -0.204
PLS R2: 0.000
Enet R2 : -0.157
Enet+H R2 : 0.036
Group Lasso R2: -0.273
Group Lasso+H R2: -0.121


In [51]:
output.iloc[2,0] =r2_oos[1]
output.iloc[2,2] = r2_oos[2]
output.iloc[2,3] = r2_oos[3]
output.iloc[2,4] = r2_oos[5]
output.iloc[2,5] = r2_oos[7]

Now moving to boosing models

In [52]:
MC = 1  # setup MC number
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = os.path.join(path, f'SimuData_p{datanum}/')
hh = [1]
mo = 1
# for hh in [1, 3, 6, 12]:  # correspond to monthly quarterly halfyear and annually returns
title = os.path.join(path, f'Simu_p{datanum}/Tree{hh}')

if not os.path.isdir(title) and MC == 1:
    os.makedirs(title)

titleB = os.path.join(title, 'B')
if not os.path.isdir(titleB) and MC == 1:
    os.makedirs(titleB)

if datanum == 50:
    nump = 50
if datanum == 100:
    nump = 100
N = 200  # Number of CS tickers
m = nump * 2  # Number of Characteristics
T = 180  # Number of Time Periods

per = np.tile(np.arange(1, N + 1), T)
time = np.repeat(np.arange(1, T + 1), N)
stdv = 0.05
theta_w = 0.005

# Read Files
path1 = f"{dirstock}c{M}.csv"
path2 = f"{dirstock}r{mo}_{M}.csv"
c = np.genfromtxt(path1, delimiter=',')
r1 = np.genfromtxt(path2, delimiter=',')

# Add Some Elements
daylen = np.tile(N, T // 3)
daylen_test = daylen
ind = np.arange(0, int(N * T / 3))
xtrain = c[ind, :]
ytrain = r1[ind]
trainper = per[ind]
ind = np.arange(int(N * T / 3), int(N * (T * 2 / 3 + 1)))
xtest = c[ind, :]
ytest = r1[ind]
testper = per[ind]

l1 = c.shape[0]
l2 = len(r1)
l3 = l2 - np.sum(np.isnan(r1))

ind = np.arange(int(N * T * 2 / 3), min(l1, l2, l3))
xoos = c[ind, :]
yoos = r1[ind]

# Monthly Demean
ytrain_demean = ytrain - np.mean(ytrain)
ytest_demean = ytest - np.mean(ytest)
mtrain = np.mean(ytrain)
mtest = np.mean(ytest)

# Start to train
r2_oos = np.zeros(3)  # OOS R2
r2_is = np.zeros(3)
r3_oos = np.zeros(3) # IS R2

# Random Forest
if nump == 50:
    lamv = np.arange(10, 101, 10)
elif nump == 100:
    lamv = np.arange(10, 201, 20)
ne = 100
lamc = [2, 4, 8, 16, 32]
r = np.zeros((len(lamv), len(lamc), 3))
rrr = np.zeros((len(lamv), len(lamc), 3))

for n1 in tqdm(range(len(lamv))):
    nf = lamv[n1]
    for n2 in range(len(lamc)):
        nn = lamc[n2]
        clf = RandomForestRegressor(
            n_estimators=ne,
            max_features=nf,
            max_depth=nn
        )
        clf.fit(xtrain, ytrain)
        yhatbig1 = clf.predict(xtest)

        r[n1, n2, 0] = 1 - np.sum(np.power(yhatbig1[:1000] - ytest[:1000], 2)) / np.sum(
        np.power(ytest[:1000] - mtrain, 2))
        yhatbig1 = clf.predict(xoos)
        r[n1, n2, 1] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(
        np.power(yoos[:1000] - mtrain, 2))
        yhatbig1 = clf.predict(xtrain)
        r[n1, n2, 2] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(
        np.power(ytrain[:1000] - mtrain, 2))

        rrr[n1, n2, 0] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytest[-1000:], 2)) / np.sum(
        np.power(ytest[-1000:] - mtrain, 2))
        yhatbig1 = clf.predict(xoos)
        rrr[n1, n2, 1] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(
        np.power(yoos[-1000:] - mtrain, 2))
        yhatbig1 = clf.predict(xtrain)
        rrr[n1, n2, 2] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(
        np.power(ytrain[-1000:] - mtrain, 2))

fw_2 = np.unravel_index(np.argmax(r[:, :, 0]), r[:, :, 0].shape)
fw_3 = np.unravel_index(np.argmax(rrr[:, :, 0]), rrr[:, :, 0].shape)

r2_oos[0] = r[fw_2[0], fw_2[1], 1]
r3_oos[0] = r[fw_3[0], fw_3[1], 1]
print(f"RF R2 for first 1000 : {r2_oos[0]:.3f}, RF R2 for last 1000 : {r3_oos[0]:.3f}")

100%|██████████| 10/10 [29:30<00:00, 177.08s/it]

RF R2 for first 1000 : -0.577, RF R2 for last 1000 : 0.011





In [53]:
output.iloc[1,6] =r2_oos[0]
output.iloc[2,6] =r3_oos[0]

In [54]:
#Gradient Boosting
mo = 1

lamv = np.arange(-1, 0.1, 0.2)
r = np.zeros((len(lamv), 50, 3))
rrr = np.zeros((len(lamv), 50, 3))

for n1 in range(len(lamv)):
    lr = 10 ** lamv[n1]
    alpha = 2
    ne = 50
    clf = GradientBoostingRegressor(
        n_estimators=ne,
        learning_rate=lr,
        loss='huber',
        max_depth=2
    )

    clf.fit(xtrain, ytrain)
    e = clf.staged_predict(xtest)
    for i, pred in enumerate(e):
        r[n1, i, 0] = np.mean((pred - ytest) ** 2)

    e = clf.staged_predict(xoos)
    for i, pred in enumerate(e):
        r[n1, i, 1] = np.mean((pred - yoos) ** 2)

    e = clf.staged_predict(xtrain)
    for i, pred in enumerate(e):
        r[n1, i, 2] = np.mean((pred[:1000] - ytrain[:1000]) ** 2)
        rrr[n1, i, 2] = np.mean((pred[-1000:] - ytrain[-1000:]) ** 2)


fw_2 = np.unravel_index(np.argmin(r[:, :, 0]), r[:, :, 0].shape)
fw_3 = np.unravel_index(np.argmin(rrr[:, :, 0]), rrr[:, :, 0].shape)
err2 = np.mean((yoos[:1000] - mtrain) ** 2)
err3 = np.mean((yoos[-1000:] - mtrain) ** 2)
r2_oos[1] = 1 - r[fw_2[0], fw_2[1], 1] / err2
r3_oos[1] = 1 - r[fw_3[0], fw_3[1], 1] / err3
print(f"GBRT R2 for the first 1000 : {r2_oos[1]:.3f}, GBRT R2 for last 1000 : {r3_oos[1]:.3f}")


GBRT R2 for the first 1000 : 0.019, GBRT R2 for last 1000 : 0.013


In [55]:
output.iloc[1,7] =r2_oos[1]
output.iloc[2,7] =r3_oos[1]

Special Case: OLS-3

In [69]:
updated = merged_dataset = pd.merge(x_c, x_b, left_index=True, right_index=True, how='inner')
#assume that the 4th column is the Market Value of the stock. Proceeding to arrange it in accordance to the column values
updated = updated.sort_values(by=updated.columns[3], ascending=False)
rtop_1 = updated.iloc[:,:1]
ctop1 = updated.iloc[:,1:4]

rtop_1.to_csv('./Simu/SimuData_p50/r1_1.csv', index=False, header=False)
ctop1.to_csv('./Simu/SimuData_p50/c1.csv', index=False, header=False)

In [70]:
MC = 1
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = f"{path}/SimuData_p{datanum}/"

hh = [1]
title = f"{path}/Simu_p{datanum}/Reg{hh[0]}"
if not os.path.exists(title) and MC == 1:
    os.makedirs(title)
titleB = f"{title}/B"
if not os.path.exists(titleB) and MC == 1:
    os.makedirs(titleB)
if datanum == 50:
    nump = 50
elif datanum == 100:
    nump = 100
mu = 0.2 * np.sqrt(hh[0])
tol = 10**(-10)

for M in range(1, 2):  # Assuming range for M
    for mo in range(1, 2):  # Assuming range for mo

        print('### MCMC : {}, Model : {} ###'.format(M, mo))

        N = 200   # Number of CS tickers
        m = nump * 2   # Number of Characteristics
        T = 180   # Number of Time Periods

        per = np.tile(np.arange(1, N+1), T)
        time = np.repeat(np.arange(1, T+1), N)
        stdv = 0.05
        theta_w = 0.005

        # Read Files
        path1 = dirstock + 'c' + str(M) + '.csv'
        path2 = dirstock + 'r' + str(mo) + '_' + str(M) + '.csv'
        c = np.genfromtxt(path1, delimiter=',')
        r1 = np.genfromtxt(path2, delimiter=',')

        # Add Some Elements
        daylen = np.repeat(N, T//3)
        daylen_test = daylen.copy()
        ind = np.arange(0, N*T//3)
        xtrain = c[ind]
        ytrain = r1[ind]
        trainper = per[ind]
        start_idx = math.floor(N * T / 3) + 1
        end_idx = math.floor(N * (T * 2 / 3 - hh[0] + 1))
        ind = list(range(start_idx, end_idx))
        xtest = c[ind]
        ytest = r1[ind]
        testper = per[ind]

        l1 = c.shape[0]
        l2 = len(r1)
        l3 = l2 - np.isnan(r1).sum()

        ind = np.arange(N*(2*T//3), min([l1, l2, l3]))
        xoos = c[ind]
        yoos = r1[ind]

        # Monthly Demean
        ytrain_demean = ytrain - np.mean(ytrain)
        ytest_demean = ytest - np.mean(ytest)
        mtrain = np.mean(ytrain)
        mtest = np.mean(ytest)

        # Calculate Sufficient Stats
        sd = np.zeros(xtrain.shape[1])
        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] /= s
                xtest[:, i] /= s
                xoos[:, i] /= s
                sd[i] = s

        XX = np.dot(xtrain.T, xtrain)
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        Y = ytrain_demean
        XY = np.dot(xtrain.T, Y)


        #OLS

        # Initialize arrays
        r2_oos = np.zeros(13)
        r2_is = np.zeros(13)
        modeln = 0
        groups = 0
        nc = 0

        # OLS
        modeln += 1
        clf = LinearRegression(fit_intercept=False)
        clf.fit(xtrain, ytrain_demean)
        yhatbig1 = clf.predict(xoos) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = clf.predict(xtrain) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        b = clf.coef_
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS R2: {r2_oos[modeln - 1]:.3f}")

        # OLS+H
        modeln += 1
        func = soft_threshodl
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, 0, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS+H R2: {r2_oos[modeln - 1]:.3f}")

### MCMC : 1, Model : 1 ###
Simple OLS R2: -0.006
Simple OLS+H R2: -0.010


In [71]:
output.iloc[0,1] = r2_oos[modeln - 1]

In [72]:
MC = 1
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = f"{path}/SimuData_p{datanum}/"

hh = [1]
title = f"{path}/Simu_p{datanum}/Reg{hh[0]}"
if not os.path.exists(title) and MC == 1:
    os.makedirs(title)
titleB = f"{title}/B"
if not os.path.exists(titleB) and MC == 1:
    os.makedirs(titleB)
if datanum == 50:
    nump = 50
elif datanum == 100:
    nump = 100
mu = 0.2 * np.sqrt(hh[0])
tol = 10**(-10)

for M in range(1, 2):  # Assuming range for M
    for mo in range(1, 2):  # Assuming range for mo

        print('### MCMC : {}, Model : {} ###'.format(M, mo))

        N = 200   # Number of CS tickers
        m = nump * 2   # Number of Characteristics
        T = 180   # Number of Time Periods

        per = np.tile(np.arange(1, N+1), T)
        time = np.repeat(np.arange(1, T+1), N)
        stdv = 0.05
        theta_w = 0.005

        # Read Files
        path1 = dirstock + 'c' + str(M) + '.csv'
        path2 = dirstock + 'r' + str(mo) + '_' + str(M) + '.csv'
        c = np.genfromtxt(path1, delimiter=',')
        r1 = np.genfromtxt(path2, delimiter=',')

        # Add Some Elements
        daylen = np.repeat(N, T//3)
        daylen_test = daylen.copy()
        ind = np.arange(0, N*T//3)
        xtrain = c[ind]
        ytrain = r1[ind]
        trainper = per[ind]
        start_idx = math.floor(N * T / 3) + 1
        end_idx = math.floor(N * (T * 2 / 3 - hh[0] + 1))
        ind = list(range(start_idx, end_idx))
        xtest = c[ind]
        ytest = r1[ind]
        testper = per[ind]

        l1 = c.shape[0]
        l2 = len(r1)
        l3 = l2 - np.isnan(r1).sum()

        ind = np.arange(N*(2*T//3), min([l1, l2, l3]))
        xoos = c[ind]
        yoos = r1[ind]

        # Monthly Demean
        ytrain_demean = ytrain - np.mean(ytrain)
        ytest_demean = ytest - np.mean(ytest)
        mtrain = np.mean(ytrain)
        mtest = np.mean(ytest)

        # Calculate Sufficient Stats
        sd = np.zeros(xtrain.shape[1])
        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] /= s
                xtest[:, i] /= s
                xoos[:, i] /= s
                sd[i] = s

        XX = np.dot(xtrain.T, xtrain)
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        Y = ytrain_demean
        XY = np.dot(xtrain.T, Y)


        #OLS

        # Initialize arrays
        r2_oos = np.zeros(13)
        r2_is = np.zeros(13)
        modeln = 0
        groups = 0
        nc = 0

        # OLS
        modeln += 1
        clf = LinearRegression(fit_intercept=False)
        clf.fit(xtrain, ytrain_demean)
        yhatbig1 = clf.predict(xoos) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
        yhatbig1 = clf.predict(xtrain) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))
        b = clf.coef_
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS R2: {r2_oos[modeln - 1]:.3f}")

        # OLS+H
        modeln += 1
        func = soft_threshodl
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, 0, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1[:1000] - yoos[:1000], 2)) / np.sum(np.power(yoos[:1000] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1[:1000] - ytrain[:1000], 2)) / np.sum(np.power(ytrain[:1000] - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS+H R2: {r2_oos[modeln - 1]:.3f}")

### MCMC : 1, Model : 1 ###
Simple OLS R2: -0.001
Simple OLS+H R2: -0.003


In [73]:
output.iloc[1,1] = r2_oos[modeln - 1]

In [74]:
MC = 1
datanum = 50  # Or datanum = 100; separately run two cases
path = './Simu'  # set your own folder path
dirstock = f"{path}/SimuData_p{datanum}/"

hh = [1]
title = f"{path}/Simu_p{datanum}/Reg{hh[0]}"
if not os.path.exists(title) and MC == 1:
    os.makedirs(title)
titleB = f"{title}/B"
if not os.path.exists(titleB) and MC == 1:
    os.makedirs(titleB)
if datanum == 50:
    nump = 50
elif datanum == 100:
    nump = 100
mu = 0.2 * np.sqrt(hh[0])
tol = 10**(-10)

for M in range(1, 2):  # Assuming range for M
    for mo in range(1, 2):  # Assuming range for mo

        print('### MCMC : {}, Model : {} ###'.format(M, mo))

        N = 200   # Number of CS tickers
        m = nump * 2   # Number of Characteristics
        T = 180   # Number of Time Periods

        per = np.tile(np.arange(1, N+1), T)
        time = np.repeat(np.arange(1, T+1), N)
        stdv = 0.05
        theta_w = 0.005

        # Read Files
        path1 = dirstock + 'c' + str(M) + '.csv'
        path2 = dirstock + 'r' + str(mo) + '_' + str(M) + '.csv'
        c = np.genfromtxt(path1, delimiter=',')
        r1 = np.genfromtxt(path2, delimiter=',')

        # Add Some Elements
        daylen = np.repeat(N, T//3)
        daylen_test = daylen.copy()
        ind = np.arange(0, N*T//3)
        xtrain = c[ind]
        ytrain = r1[ind]
        trainper = per[ind]
        start_idx = math.floor(N * T / 3) + 1
        end_idx = math.floor(N * (T * 2 / 3 - hh[0] + 1))
        ind = list(range(start_idx, end_idx))
        xtest = c[ind]
        ytest = r1[ind]
        testper = per[ind]

        l1 = c.shape[0]
        l2 = len(r1)
        l3 = l2 - np.isnan(r1).sum()

        ind = np.arange(N*(2*T//3), min([l1, l2, l3]))
        xoos = c[ind]
        yoos = r1[ind]

        # Monthly Demean
        ytrain_demean = ytrain - np.mean(ytrain)
        ytest_demean = ytest - np.mean(ytest)
        mtrain = np.mean(ytrain)
        mtest = np.mean(ytest)

        # Calculate Sufficient Stats
        sd = np.zeros(xtrain.shape[1])
        for i in range(xtrain.shape[1]):
            s = np.std(xtrain[:, i])
            if s > 0:
                xtrain[:, i] /= s
                xtest[:, i] /= s
                xoos[:, i] /= s
                sd[i] = s

        XX = np.dot(xtrain.T, xtrain)
        U, S, V = np.linalg.svd(XX)
        L = S[0]
        Y = ytrain_demean
        XY = np.dot(xtrain.T, Y)


        #OLS

        # Initialize arrays
        r2_oos = np.zeros(13)
        r2_is = np.zeros(13)
        modeln = 0
        groups = 0
        nc = 0

        # OLS
        modeln += 1
        clf = LinearRegression(fit_intercept=False)
        clf.fit(xtrain, ytrain_demean)
        yhatbig1 = clf.predict(xoos) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - yoos, 2)) / np.sum(np.power(yoos - mtrain, 2))
        yhatbig1 = clf.predict(xtrain) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1 - ytrain, 2)) / np.sum(np.power(ytrain - mtrain, 2))
        b = clf.coef_
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS R2: {r2_oos[modeln - 1]:.3f}")

        # OLS+H
        modeln += 1
        func = soft_threshodl
        b = proximalH(groups, nc, xtest, mtrain, ytest, b, xtrain, ytrain_demean, mu, tol, L, 0, func)
        yhatbig1 = np.dot(xoos, b) + mtrain
        r2_oos[modeln - 1] = 1 - np.sum(np.power(yhatbig1[-1000:] - yoos[-1000:], 2)) / np.sum(np.power(yoos[-1000:] - mtrain, 2))
        yhatbig1 = np.dot(xtrain, b) + mtrain
        r2_is[modeln - 1] = 1 - np.sum(np.power(yhatbig1[-1000:] - ytrain[-1000:], 2)) / np.sum(np.power(ytrain[-1000:] - mtrain, 2))
        pathb = f"{title}/B/b{mo}_{M}_{modeln}.csv"
        pd.DataFrame(b).to_csv(pathb, index=False, header=False)
        print(f"Simple OLS+H R2: {r2_oos[modeln - 1]:.3f}")

### MCMC : 1, Model : 1 ###
Simple OLS R2: -0.006
Simple OLS+H R2: 0.010


In [75]:
output.iloc[2,1] = r2_oos[modeln - 1]

In [76]:
output.to_excel("resulting_oos.xlsx")