In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import capstone.portfolio.optimize as opt
from capstone.portfolio.prune import prune_allocations
from capstone.portfolio.backtest import backtest_portfolios
from capstone.portfolio.metrics import Metrics

from capstone.utils import read_file, get_sectors
from tqdm.auto import tqdm

# Plot styling: apply the seaborn "whitegrid" theme first, then the matplotlib
# overrides, so the overrides are the values left in effect.
sns.set_style("whitegrid")
plt.rcParams.update({"lines.linewidth": 1, "axes.edgecolor": "k"})

In [11]:
# Sector labels from the project helper (presumably the list of sector names
# used throughout the project -- confirm against capstone.utils).
sectors = get_sectors()

# Every table is loaded through read_file with index_col=0 (presumably
# forwarded to pandas so the first column becomes the index -- confirm).
stocks_by_sector, snp_log_returns, df = (
    read_file(name, index_col=0)
    for name in ("stocks_by_sector", "snp_log_returns", "master_df")
)

# Per-model "mean_ouls" score tables: one for SARIMAX, one for the sklearn models.
sarimax_ouls, sklearn_ouls = (
    read_file(name, index_col=0)
    for name in ("sarimax_mean_ouls", "sklearn_mean_ouls")
)

# Per-model "best sector" tables, split the same way.
sarimax_best_sectors, sklearn_best_sectors = (
    read_file(name, index_col=0)
    for name in ("sarimax_best_sectors", "sklearn_best_sectors")
)

In [12]:
# Put the SARIMAX and sklearn sector picks side by side (one column per model)
# and discard any row where at least one model has no value.
best_sectors_by_model = pd.concat(
    [sarimax_best_sectors, sklearn_best_sectors],
    axis="columns",
).dropna()

best_sectors_by_model.head()

Unnamed: 0,SARIMAX,ElasticNet,SVR,RandomForest,GradientBoost,XGBoost
2007-11-20,ENERGY,ENERGY,INDUSTRIALS,UTILITIES,UTILITIES,UTILITIES
2008-05-22,UTILITIES,ENERGY,CONSUMER_DISCRETIONARY,ENERGY,UTILITIES,COMMUNICATION_SERVICES
2008-11-19,CONSUMER_DISCRETIONARY,CONSUMER_STAPLES,UTILITIES,REAL_ESTATE,REAL_ESTATE,REAL_ESTATE
2009-05-22,REAL_ESTATE,CONSUMER_DISCRETIONARY,UTILITIES,REAL_ESTATE,REAL_ESTATE,CONSUMER_DISCRETIONARY
2009-11-19,COMMUNICATION_SERVICES,REAL_ESTATE,INDUSTRIALS,INFORMATION_TECHNOLOGY,COMMUNICATION_SERVICES,CONSUMER_DISCRETIONARY


In [13]:
# Combine the SARIMAX and sklearn score tables column-wise and keep only the
# rows for which every model has a value.
mean_ouls = pd.concat(
    [sarimax_ouls, sklearn_ouls],
    axis="columns",
).dropna()

mean_ouls.head()

Unnamed: 0,SARIMAX,ElasticNet,SVR,RandomForest,GradientBoost,XGBoost
2007-11-20,0.08669,0.096248,0.104939,0.09423,0.099368,0.09944
2008-05-22,0.121866,0.116415,0.115444,0.117315,0.123011,0.12189
2008-11-19,0.120516,0.133326,0.131311,0.131475,0.13659,0.141665
2009-05-22,0.142409,0.143372,0.200228,0.142021,0.133019,0.13563
2009-11-19,0.112528,0.095592,0.115998,0.098525,0.11182,0.107911


In [14]:
# For each row, take the column label (model name) holding the smallest
# mean_ouls value.
best_models = mean_ouls.idxmin(axis="columns")
best_models

2007-11-20          SARIMAX
2008-05-22              SVR
2008-11-19          SARIMAX
2009-05-22    GradientBoost
2009-11-19       ElasticNet
2010-05-24       ElasticNet
2010-11-19       ElasticNet
2011-05-23          XGBoost
2011-11-18          SARIMAX
2012-05-22              SVR
2012-11-21       ElasticNet
2013-05-24              SVR
2013-11-21       ElasticNet
2014-05-27     RandomForest
2014-11-21       ElasticNet
2015-05-27          XGBoost
2015-11-23    GradientBoost
2016-05-25          SARIMAX
2016-11-22       ElasticNet
2017-05-25       ElasticNet
2017-11-22       ElasticNet
2018-05-25          XGBoost
2018-11-23     RandomForest
2019-05-29              SVR
2019-11-25     RandomForest
2020-05-28       ElasticNet
2020-11-24       ElasticNet
2021-05-27    GradientBoost
2021-11-24       ElasticNet
2022-05-26       ElasticNet
2022-11-25          SARIMAX
2023-05-30    GradientBoost
dtype: object

In [18]:
# NOTE(review): `forecast` is not referenced anywhere in this cell; it is kept
# because other cells may rely on it -- confirm before removing.
forecast = 126

# Build one record per date and construct the frame in a single call, rather
# than pre-allocating an empty frame and writing individual cells (including
# list-valued cells) through `.loc`.

# Columns of the returns table; the same for every date, so looked up once.
tradable_symbols = snp_log_returns.columns

# sector -> list of constituents present in `snp_log_returns`. The filter only
# depends on the sector, so it is computed once per distinct sector instead of
# once per date.
constituents_by_sector = {}

records = []
for date, best_model in best_models.items():
    # Look up (not predict) the sector that the best-scoring model chose
    # for this date.
    selected_sector = best_sectors_by_model.loc[date, best_model]

    if selected_sector not in constituents_by_sector:
        in_sector = stocks_by_sector["GICS Sector"] == selected_sector
        constituents_by_sector[selected_sector] = [
            symbol
            for symbol in stocks_by_sector.loc[in_sector, "Symbol"]
            if symbol in tradable_symbols
        ]

    # Copy so that rows sharing a sector do not alias the same list object.
    sector_constituents = list(constituents_by_sector[selected_sector])

    records.append(
        {
            "SELECTED_MODEL": best_model,
            "SELECTED_SECTOR": selected_sector,
            "AVAILABLE_STOCKS": sector_constituents,
        }
    )

# dtype=object keeps the columns as plain Python objects (strings and lists),
# matching the frame the cell-by-cell version produced.
results = pd.DataFrame(
    records,
    columns=["SELECTED_MODEL", "SELECTED_SECTOR", "AVAILABLE_STOCKS"],
    index=best_models.index,
    dtype=object,
)

In [20]:
# Preview the first five rows of the selection table.
results.head(n=5)

Unnamed: 0,SELECTED_MODEL,SELECTED_SECTOR,AVAILABLE_STOCKS
2007-11-20,SARIMAX,ENERGY,"[APA, BKR, CVX, COP, CTRA, DVN, EOG, EQT, XOM,..."
2008-05-22,SVR,CONSUMER_DISCRETIONARY,"[AMZN, AZO, BBWI, BBY, BKNG, BWA, KMX, CCL, CM..."
2008-11-19,SARIMAX,CONSUMER_DISCRETIONARY,"[AMZN, AZO, BBWI, BBY, BKNG, BWA, KMX, CCL, CM..."
2009-05-22,GradientBoost,REAL_ESTATE,"[ARE, AMT, AVB, BXP, CPT, CBRE, CCI, DLR, EQIX..."
2009-11-19,ElasticNet,REAL_ESTATE,"[ARE, AMT, AVB, BXP, CPT, CBRE, CCI, DLR, EQIX..."


In [21]:
# Persist the per-date model/sector selections as CSV.
# NOTE(review): AVAILABLE_STOCKS holds Python lists, which CSV can only store as
# text -- confirm that whatever reads this file parses them back into lists.
results.to_csv(path_or_buf="data/best_models_and_sectors.csv")