In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
from snp500 import SNP500
from datetime import datetime as dt
from pprint import pprint

In [2]:
# Instantiate the project's SNP500 helper and copy its `sector` table so the
# relabelling below does not modify the helper's own data.
snp = SNP500()
sectors = snp.sector.copy()

# Normalise sector labels: upper-case, with spaces replaced by underscores
# (e.g. HEALTH_CARE, INFORMATION_TECHNOLOGY).
upper_labels = sectors["GICS Sector"].str.upper()
sectors["GICS Sector"] = upper_labels.str.replace(" ", "_")

sectors.head()

Unnamed: 0,Symbol,GICS Sector
0,MMM,INDUSTRIALS
1,AOS,INDUSTRIALS
2,ABT,HEALTH_CARE
3,ABBV,HEALTH_CARE
4,ACN,INFORMATION_TECHNOLOGY


In [3]:
# Draw five symbols from every GICS sector; the fixed random_state keeps the
# draw repeatable between runs.
sector_groups = sectors.groupby("GICS Sector")
sampled_rows = sector_groups.sample(n=5, random_state=8)
tickers = sampled_rows["Symbol"].to_list()

# Preview only the first and last five tickers of the sample.
leading = ", ".join(tickers[:5])
trailing = ", ".join(tickers[-5:])
print(f"{leading}, ..., {trailing}")

EA, FOXA, CMCSA, MTCH, NFLX, ..., ETR, PNW, XEL, NEE, ATO


In [4]:
# Download window: a fixed start date through today's date (evaluated at run
# time, so the last rows depend on when the notebook is executed).
start = "2005-01-01"
end = dt.now().strftime("%Y-%m-%d")

# Request the separate "Adj Close" column explicitly (auto_adjust=False) rather
# than relying on yfinance's default: with auto-adjustment enabled that column
# is not returned and the lookup below fails with a KeyError.
# dropna(axis=1) then discards every ticker that has a missing price anywhere
# in the window, so only tickers with a complete history remain.
prices_ = (
    yf.download(tickers, start=start, end=end, auto_adjust=False)["Adj Close"]
    .dropna(axis=1)
)

prices_.head()

[*********************100%%**********************]  55 of 55 completed


Unnamed: 0_level_0,ACN,ATO,AZO,CBRE,CCI,CDNS,CMCSA,COF,CPB,CSX,...,PWR,RVTY,SCHW,SEE,SLB,SPG,STZ,TXN,XEL,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,18.995512,14.065578,90.75,10.983333,11.480591,13.75,8.04077,63.524788,17.212313,1.605051,...,7.928757,19.05431,8.899646,19.451067,22.085529,27.702084,20.799591,16.129154,8.957226,26.912636
2005-01-04,18.548901,14.102574,89.699997,10.92,11.298692,13.3,7.835287,62.349834,17.316589,1.570387,...,7.574617,18.821409,8.693563,18.813562,21.917099,27.398846,20.593653,15.74227,8.823017,26.729958
2005-01-05,18.476866,14.007429,90.290001,10.606667,11.578533,13.31,7.908674,61.72823,17.18334,1.56434,...,7.672988,18.260733,8.640132,18.335436,21.985849,26.185944,20.016142,15.268669,8.723604,26.590258
2005-01-06,18.311184,13.975715,89.550003,10.723333,11.648497,13.34,7.896441,62.501431,17.18334,1.583285,...,7.653314,18.441881,8.785159,18.375284,22.415524,26.419861,20.813021,15.208624,8.70869,26.928753
2005-01-07,19.168392,13.88057,89.730003,10.733333,11.928338,13.28,7.967382,62.12999,17.038504,1.565952,...,7.613965,18.433252,8.754629,18.469456,21.838032,26.53681,21.994923,15.288682,8.658986,26.75144


In [5]:
# Restrict the sector table to the tickers that survived the price download.
available_tickers = prices_.columns
has_prices = sectors["Symbol"].isin(available_tickers)
trimmed_sectors = sectors.loc[has_prices]

# Pick one ticker per sector (fixed random_state for a repeatable draw) and
# keep the symbols and their sector labels as parallel lists.
sample = trimmed_sectors.groupby("GICS Sector").sample(n=1, random_state=8)
sampled_stocks = sample["Symbol"].to_list()
sampled_sectors = sample["GICS Sector"].to_list()

print(sampled_stocks, "\n")
pprint(sampled_sectors)

['MTCH', 'AZO', 'PEP', 'XOM', 'COF', 'RVTY', 'GE', 'CDNS', 'FCX', 'EQR', 'XEL'] 

['COMMUNICATION_SERVICES',
 'CONSUMER_DISCRETIONARY',
 'CONSUMER_STAPLES',
 'ENERGY',
 'FINANCIALS',
 'HEALTH_CARE',
 'INDUSTRIALS',
 'INFORMATION_TECHNOLOGY',
 'MATERIALS',
 'REAL_ESTATE',
 'UTILITIES']


In [6]:
# Keep only the one-per-sector tickers, in sampled order, as an independent copy.
prices = prices_.loc[:, sampled_stocks].copy()
prices.head()

Unnamed: 0_level_0,MTCH,AZO,PEP,XOM,COF,RVTY,GE,CDNS,FCX,EQR,XEL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005-01-03,6.387365,90.75,31.03343,26.912636,63.524788,19.05431,133.530838,13.75,11.550533,15.048333,8.957226
2005-01-04,6.262746,89.699997,30.812344,26.729958,62.349834,18.821409,131.92514,13.3,11.094222,14.954946,8.823017
2005-01-05,6.213877,90.290001,30.848204,26.590258,61.72823,18.260733,131.122208,13.31,10.966949,14.475262,8.723604
2005-01-06,6.196771,89.550003,31.069269,26.928753,62.501431,18.441881,132.180527,13.34,10.951435,14.475262,8.70869
2005-01-07,6.257859,89.730003,31.338142,26.75144,62.12999,18.433252,131.377762,13.28,11.056971,14.411595,8.658986


In [7]:
# Daily log returns: first difference of log prices, with each column labelled
# "<ticker>_RET". The first row is NaN because it has no previous price.
return_labels = [f"{ticker}_RET" for ticker in prices.columns]
returns = np.log(prices).diff().set_axis(return_labels, axis=1)

# Prices and returns side by side; rows containing any NaN (the first date)
# are dropped.
features = pd.concat((prices, returns), axis="columns").dropna(how="any")
features.head()

Unnamed: 0_level_0,MTCH,AZO,PEP,XOM,COF,RVTY,GE,CDNS,FCX,EQR,...,AZO_RET,PEP_RET,XOM_RET,COF_RET,RVTY_RET,GE_RET,CDNS_RET,FCX_RET,EQR_RET,XEL_RET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-04,6.262746,89.699997,30.812344,26.729958,62.349834,18.821409,131.92514,13.3,11.094222,14.954946,...,-0.011638,-0.00715,-0.006811,-0.018669,-0.012298,-0.012098,-0.033275,-0.040307,-0.006225,-0.015097
2005-01-05,6.213877,90.290001,30.848204,26.590258,61.72823,18.260733,131.122208,13.31,10.966949,14.475262,...,0.006556,0.001163,-0.00524,-0.01002,-0.030242,-0.006105,0.000752,-0.011538,-0.032601,-0.011331
2005-01-06,6.196771,89.550003,31.069269,26.928753,62.501431,18.441881,132.180527,13.34,10.951435,14.475262,...,-0.00823,0.007141,0.01265,0.012448,0.009871,0.008039,0.002251,-0.001416,0.0,-0.001711
2005-01-07,6.257859,89.730003,31.338142,26.75144,62.12999,18.433252,131.377762,13.28,11.056971,14.411595,...,0.002008,0.008617,-0.006606,-0.005961,-0.000468,-0.006092,-0.004508,0.009591,-0.004408,-0.005724
2005-01-10,6.270078,90.739998,32.120846,26.853527,62.395287,18.459126,131.049286,13.37,10.997992,14.326694,...,0.011193,0.024669,0.003809,0.004261,0.001403,-0.002503,0.006754,-0.005348,-0.005909,0.013683


In [8]:
# Write each frame to its CSV file under workflow/data/.
for frame, destination in (
    (prices, "workflow/data/prices.csv"),
    (returns, "workflow/data/returns.csv"),
    (features, "workflow/data/features.csv"),
):
    frame.to_csv(destination)

In [9]:
# Sanity check: numpy's default standard deviation (ddof=0) of [0, 1] is 0.5.
np.std(np.array([0, 1]))

0.5