In [2]:
import pandas as pd
import numpy as np

from pickle import load
from tqdm.auto import tqdm

In [3]:
# Master table of indicator and stock columns, indexed by the parsed DATE column
df = pd.read_csv("data/main/master_df.csv", parse_dates=True, index_col="DATE")

# NOTE: unpickling executes arbitrary code embedded in the file, so only
# load pickles that were produced by this project.
with open("models/initial_models_inexp.pkl", "rb") as models_file:
    models = load(models_file)

with open("data/base/economic_indicators.pkl", "rb") as indicators_file:
    indicator_map = load(indicators_file)

# Indicator column names are the values of the stored mapping, plus "FFQ"
all_indicators = [*indicator_map.values(), "FFQ"]

In [4]:
# Print each loaded estimator under its dictionary key so its
# configuration (pipeline steps and hyper-parameters) can be inspected
for name, model in models.items():
    print(f"{name}:\n{model}\n")

Ridge:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('ridge', Ridge(alpha=0.5, random_state=42, solver='sag'))])

ElasticNet:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('elasticnet',
                 ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=42))])

KNeighborsRegressor:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsregressor',
                 KNeighborsRegressor(n_neighbors=10, weights='distance'))])

SVR:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(C=0.5, epsilon=0.2, kernel='linear'))])

AdaBoostRegressor:
AdaBoostRegressor(random_state=42)



In [5]:
# Summary statistics (count, mean, std, quantiles) for the columns of the master table
df.describe()

Unnamed: 0,STLFSI,PSAVERT,NFCI,UMCSENT,BAMLH0A0HYM2,CP,WM2NS,USSLIND,FFQ,UNRATE,...,POOL,QCOM,RMD,ROST,STZ,TJX,TSCO,TYL,UNH,WST
count,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,...,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0,1392.0
mean,-0.092659,6.433226,-0.36276,85.861135,5.389862,1471.804288,10075.325359,1.275869,0.379843,5.648671,...,0.004018,0.003081,0.003548,0.003297,0.003141,0.003222,0.003811,0.0038,0.003315,0.003202
std,0.953807,3.195317,0.494997,14.044047,2.569354,689.466657,5100.004549,0.749881,0.110274,1.880617,...,0.04674,0.062089,0.050857,0.047692,0.043572,0.043105,0.052114,0.060242,0.047097,0.038294
min,-1.604,2.1,-0.82616,50.0,2.4325,489.453,3804.4,-2.65,0.159006,3.4,...,-0.24686,-0.369779,-0.269657,-0.370376,-0.297578,-0.322072,-0.309086,-0.374693,-0.442079,-0.183273
25%,-0.858,4.7,-0.62952,74.7475,3.6955,787.223288,6045.775,1.073125,0.287961,4.32375,...,-0.019487,-0.023893,-0.018661,-0.019627,-0.017182,-0.017472,-0.024137,-0.021625,-0.017983,-0.016419
50%,-0.1235,6.2,-0.509425,88.3675,4.728,1520.713692,8580.15,1.531,0.396993,5.0,...,0.002607,0.002103,0.001437,0.002604,0.002331,0.002175,0.000976,0.000731,0.002894,0.001274
75%,0.61875,7.3,-0.269185,96.37625,6.353,1885.058865,13284.825,1.72,0.455957,6.205,...,0.027618,0.028704,0.025167,0.026789,0.024154,0.024889,0.031354,0.028876,0.024886,0.022275
max,5.249,33.8,2.72308,112.0,21.094,3043.114,22050.8,2.13,0.561445,14.7,...,0.270348,0.437191,0.305898,0.256085,0.334331,0.225015,0.287037,0.426742,0.34058,0.161422


In [6]:
# (rows, columns) of the master table
df.shape

(1392, 45)

In [7]:
# Total number of missing cells across the whole table (per-column counts, then summed)
df.isnull().sum().sum()

0

---

In [8]:
# Partition the table's columns into predictors (economic indicators)
# and targets (everything else, i.e. the stock columns)
cols = df.columns
is_indicator = cols.isin(all_indicators)

indicators = cols[is_indicator].to_list()
stocks = cols[~is_indicator].to_list()

len(stocks), len(indicators)

(30, 15)

$$
\text{Asymmetric Loss} = \frac{1}{N}\sum^N_{i=1}
\begin{cases}
2\times{(y_{pred_{i}} - y_{true_{i}})} \quad \text{ if} \quad y_{pred_{i}} > y_{true_{i}} \\
1\times{(y_{true_{i}} - y_{pred_{i}})} \quad \text{ otherwise}
\end{cases}$$

In [9]:
def overestimation_loss(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    *,
    overestimation_penalty: float = 2.0,
    underestimation_penalty: float = 1.0,
) -> float:
    """
    Custom asymmetric loss function that penalizes overestimations
    more than underestimations. This is useful for long-only portfolio
    optimization where overestimating returns could lead to suboptimal
    allocations.

    Each element contributes ``penalty * |y_pred - y_true|``, where the
    penalty is ``overestimation_penalty`` when the prediction is above the
    actual value and ``underestimation_penalty`` otherwise.

    Parameters:
    - y_true: Actual values (any array-like: ndarray, list, pandas Series).
    - y_pred: Predicted values (any array-like of a broadcastable shape).
    - overestimation_penalty: Weight applied where y_pred > y_true.
    - underestimation_penalty: Weight applied where y_pred <= y_true.

    Returns:
    - Average of computed loss as a Python float (NaN for empty input).
    """
    # Coerce to plain float arrays so the comparison is always element-wise
    # and positional: plain lists would otherwise be compared
    # lexicographically, and two pandas Series with different indexes would
    # refuse to compare at all.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    error = predicted - actual

    loss = np.where(error > 0,
                    overestimation_penalty * error,
                    underestimation_penalty * -error)

    # The mean of nothing is undefined; return NaN without numpy's warning
    if loss.size == 0:
        return float("nan")

    return float(np.mean(loss))

In [36]:
# One empty, DATE-indexed frame per model for the losses (`ol`) and for
# the predicted growth factors (`preds`); rows/columns are added later.
ol = {name: pd.DataFrame() for name in models}
preds = {name: pd.DataFrame() for name in models}

for data in [*ol.values(), *preds.values()]:
    data.index.name = "DATE"

# Per-model dictionary reserved for fitted coefficients
coefs = {name: {} for name in models}

In [38]:
# Walk-forward evaluation: every model is refit on one 52-row window and
# scored on the 52 rows that follow it, for every stock, stepping forward
# one test window at a time.

# Define the time window for training and testing, both set to one year (52 weeks)
window_size = 52
test_size = 52

# The feature matrix is identical for every model and stock, so build it once
X = df[indicators]

# Start offset of every training window; the last one still leaves room
# for a full test window after it
window_starts = range(0, len(df) - window_size - test_size + 1, test_size)

# Loop through each machine learning model specified in 'models'
for name, model in tqdm(models.items()):

    # Loop through each stock in the 'stocks' list
    for stock in stocks:

        # Target series for this stock (invariant across windows)
        y = df[stock]

        for i in window_starts:

            # Positional row ranges for the training and testing sets
            train_idx = slice(i, i + window_size)
            test_idx = slice(i + window_size, i + window_size + test_size)

            # Subset by POSITION with .iloc. Plain `y[...]` with integers on a
            # date-indexed Series relies on a positional fallback that pandas
            # has deprecated in favour of label lookup.
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Refit the model on this window only
            model.fit(X_train, y_train)

            # Generate predictions on the test set
            y_hat = model.predict(X_test)

            # Results are keyed by the last date of the test window
            period_end = X_test.index.max()

            # Overestimation loss of this model-stock pair over the test window
            ol[name].loc[period_end, stock] = overestimation_loss(y_test, y_hat)

            # Compounded growth factor prod(1 + r) of the predicted returns
            # over the test window (1.0 means flat, not a return of 100%)
            preds[name].loc[period_end, stock] = np.cumprod(1 + y_hat)[-1]

  0%|          | 0/5 [00:00<?, ?it/s]

In [39]:
# Build the per-date model leaderboard.
# For every model, collapse the per-stock losses of each date to their median
ol_med = pd.DataFrame(
    {name: losses.median(axis="columns") for name, losses in ol.items()}
)

# The winning model of a date is the one with the lowest median loss
model_results = ol_med.idxmin(axis="columns").to_frame(name="SELECTED_MODEL")

# Keep the winning loss value next to the winner's name
model_results["MED_OVEREST_LOSS"] = ol_med.min(axis="columns").to_numpy()

model_results.head()

Unnamed: 0_level_0,SELECTED_MODEL,MED_OVEREST_LOSS
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1999-01-01,KNeighborsRegressor,0.069576
1999-12-31,AdaBoostRegressor,0.073149
2000-12-29,ElasticNet,0.090349
2001-12-28,ElasticNet,0.07035
2002-12-27,KNeighborsRegressor,0.070447


In [40]:
# Share of the stock universe to pick at each end-of-regression date (20%)
stock_selection_size = 0.2

# int() truncates, so the count is rounded down to a whole number of stocks
n_stocks = int(stock_selection_size * len(stocks))

n_stocks

6

In [41]:
# Selecting stocks with highest predicted growth factor:
# for every evaluation date, take the predictions of that date's selected
# model and keep the `n_stocks` tickers with the largest values.
#
# NOTE(review): the model choice and the ranked predictions both come from
# the test window that ends on `date` — confirm that downstream code applies
# these picks to the *following* period to avoid look-ahead.

selections = []

# Parse through model predictions using the selected models
for date in model_results.index:

    # Extract best model for the timeframe
    best_model = model_results.loc[date, "SELECTED_MODEL"]
    model_preds = preds[best_model]

    # Rank only the row for this date (no need to rank every date each time)
    selected_stocks = model_preds.loc[date].nlargest(n_stocks).index.to_list()

    selections.append(selected_stocks)

# Each cell holds a Python list of tickers, so the Series must be object-typed;
# a float64 container cannot store lists without an incompatible-dtype upcast.
stock_selection = pd.Series(
    selections, index=model_results.index, dtype="object"
)

# Combine selected stocks with model results
model_results["SELECTED_STOCKS"] = stock_selection

model_results.head()

Unnamed: 0_level_0,SELECTED_MODEL,MED_OVEREST_LOSS,SELECTED_STOCKS
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-01-01,KNeighborsRegressor,0.069576,"[INTU, STZ, JBHT, FDS, CPRT, LOW]"
1999-12-31,AdaBoostRegressor,0.073149,"[FDS, CPRT, RMD, AZO, MNST, JBHT]"
2000-12-29,ElasticNet,0.090349,"[QCOM, CPRT, ADBE, AAPL, INTU, APH]"
2001-12-28,ElasticNet,0.07035,"[NVR, UNH, AJG, RMD, BRO, POOL]"
2002-12-27,KNeighborsRegressor,0.070447,"[TYL, JBHT, NVR, ANSS, LOW, ATVI]"
