In [1]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Collapse

all_etfs = '''
SPY
QQQ
IWM
SCHD
ARKK
SCHX
SCHG
SPLG
RSP
IJH
IJR
VOO
SCHB
IVV
VTI
FNDX
DIA
SCHA
QQQM
CGGR
IVW
CGDV
SCHV
FMDE
GRNY
USMV
COWZ
VTV
IWD
SPSM
DYNF
SPMD
VO
SPLV
SPYV
VONV
RDVY
SPMO
SPYG
SDVY
FNDA
SCHK
DFAC
XLG
VXF
DGRO
ITOT
SPHQ
VTWO
IWR
IWS
IVOV
VONG
TCAF
SCHM
DUHP
VYM
YMAG
VOE
DFSV
MDY
DFLV
FELC
CGMM
CALF
FDVV
VFLO
CGUS
PBW
BRKC
IWF
VUG
DGRW
LRGF
IWN
JQUA
VIG
AVUV
QUAL
VB
GDXY
PVAL
SPYD
COWG
IWB
RECS
MOAT
CGCV
VLUE
JGRO
IWP
IUSV
AVLV
SPUS
FVD
DFAS
MTUM
SPHD
SNPE
ESGV
GSLC
DFUV
NOBL
VBR
IJT
FDL
DFAU
SPTM
DFAT
ACLC
SPHB
FBCG
MGK
DFUS
FELG
RWK
QVML
IVE
IUSG
DVY
ESGU
VFVA
PRF
HDV
IJS
SPYX
VBK
ILDR
BKDV
DIV
USMC
FTCS
CVLC
AFLG
PTLC
IWO
OEF
ARKX
VOOG
BSVO
FESM
BAFE
JAVA
OMFL
QDIV
THIR
ESML
BBUS
AVUS
FFLC
AFMC
TSPA
IJK
TOPT
MGV
IJJ
SLYV
SMIG
ONEQ
VOT
VV
TMFC
SLYG
SUSA
QGRW
DLN
EFIV
IWY
PEY
QDPL
FFTY
BKLC
SDY
QQQJ
DIVB
QGRO
SFLO
DFSU
BASV
FELV
ASLV
TMSL
RWL
ECML
RWJ
HLAL
JPEF
SECT
SPGP
DON
JHMM
IVOO
QLTY
MGC
AVSC
XSLV
LGDX
RPG
OAKM
FTGS
BASG
DES
FSMD
SPYT
FV
QTOP
FLQM
TCHP
MSLC
NUSC
FDLO
APUE
IPO
MDYV
DSTL
BBLU
QQWZ
CNEQ
QQQE
AUSF
THLV
PRFZ
PBUS
HCMT
VIOO
PSC
GSSC
EQWL
KOMP
PKW
DSI
USXF
SEIV
NULV
VOOV
FYX
USSG
SMLF
EQTY
XRLV
DCOR
FLCG
IWV
SEIM
FVAL
FNDB
SMMD
JXX
SYLD
ADPV
EAGL
BUZZ
PTNQ
TSME
WINN
JMOM
TGRT
FDV
FFSM
VONE
LRGG
LSGR
XSHD
OVL
SDOG
JVAL
FLQL
OUSM
MDYG
STRV
PWV
IWX
OSCV
JMEE
DGRS
RPV
TDVG
DUSA
GDE
FFLG
PTMC
GARP
BUL
MFSV
QQEW
CSMD
TVAL
REGL
MODL
FTA
SFTY
RILA
FNX
MFSG
XSHQ
CFA
OUSA
USVM
BGIG
HGRO
USSE
ALTL
EPS
FCTE
CAML
DIVZ
LCTU
QRFT
FMAG
ATFV
SEIQ
SEIS
XLSR
TBG
IWL
VIOV
GSUS
RUNN
KNGZ
NULG
JSMD
EUSA
RDIV
GSEW
CDC
FPX
LVHD
BUSA
FFOG
IUS
EZM
BLUX
SFY
LGH
NANC
DJD
PDP
VSLU
XVV
DTD
JOET
FFOX
EQAL
ELCV
VTHR
QIDX
XMAG
HFGO
ABFL
GFLW
RFG
IYY
RSMV
ESIX
PWB
CATH
QQXT
ISMD
IWC
FDRR
SAMT
XMLV
BIBL
SPD
SMOT
GRPM
OALC
QQH
EUSM
NBSM
VFMO
SIXA
SEMG
XJH
SFYF
FQAL
LRGC
FDMO
FMKT
ROE
FTC
JHML
BAMV
BAMD
IVOG
MSSM
QDF
EQL
ZECP
QVAL
SSUS
LGLV
MFUS
FLRG
CSB
DHS
BLUC
PFM
BVAL
QUS
FYC
BAMG
FCPI
USMF
GQGU
AESR
TEQI
YALL
RAFE
QMOM
VTWG
DUBS
BLGR
MBCC
VIOG
CAPE
ESN
DWAS
PGRO
VALQ
VOTE
CGGG
EES
PALC
JPSE
GRW
VTWV
AMID
MSSS
DXUV
AVMV
TPLC
BCHP
JHSC
TPHD
ASCE
TPSC
SVAL
MGMT
EQIN
SWP
TESL
SMDX
ROUS
GFGF
VUSE
CDL
SFYX
DSMC
ONEY
CWS
SMLL
SGLC
QUVU
ICAP
VFMV
RSHO
LSAF
NUMV
GOP
MDPL
FNY
SMMV
BSMC
SMDV
QQQG
CGVV
QQMG
NUMG
JSML
JGRW
DIVN
GDIV
FDIF
AVLC
TOLL
BRIF
QBIG
EDOW
JPME
GLRY
SUSL
EDGU
XDIV
ABEQ
SPXM
FEX
AAUS
LCAP
VSDA
FRTY
GGUS
GEND
QLC
GVIP
SPVU
BKMC
LOWV
OMFS
REVS
SOVF
AMOM
RFV
SMCO
LST
XJR
PTL
PY
SMIZ
AIEQ
QLV
CSM
FDIV
GSPY
STXG
ONEV
EGUS
BLCV
AVMC
GQQQ
CPAI
CHGX
SQEW
VFMF
TMFM
VMAX
QQQS
ETHO
GVLU
INVN
CLCV
FBCV
CFO
AIVL
VFQY
NXTI
SPXT
LSAT
PEXL
FDG
HCOW
MILN
MMTM
SIZE
FNK
SPDV
FFND
AFSM
FLQS
ABLD
BRNY
PRCS
PAMC
FDLS
JUST
FYT
MVPA
SIXL
TMFE
MBOX
QDEF
TSEL
IWLG
AVSU
DVOL
IQSU
INRO
TMFX
UPSD
VLU
GROZ
BFOR
RUSC
MDLV
FAD
FSCC
HDUS
LCG
EBI
GSC
PFUT
TEXN
PJFG
UDI
DIVY
WBIY
CANQ
RSMC
DEUS
PRXG
SELV
HAPI
DFVX
COWS
CCSO
ACVF
SMRI
HUSV
FLV
BKWO
FLCV
TGRW
FAB
VEGN
SCDV
FDM
GMOV
SIXS
FTIF
RJMG
FCTR
FEUS
LSVD
BDVG
LCOW
CVMC
STXV
KOOL
NBCR
NUDV
ABCS
XUDV
FSCS
SMLV
MMLG
SCAP
RNIN
LRGE
STOX
PSET
NIXT
STXM
JPSV
STXK
ABIG
KVLE
SPYC
PABU
MIDE
BCUS
SPUC
TPHE
DVAL
SPXN
RZV
FFLV
RSSL
VSMV
TPLE
HOMZ
STXD
TOPC
HSMV
INFO
TILT
SPDG
BBMC
KRMA
LOGO
MYLD
ZIG
RVER
SXQG
MID
ACTV
DUKQ
HIDV
DURA
TGLR
SHE
RZG
QQJG
TCV
LEAD
OASC
PRAE
ESG
LVDS
FOVL
CZA
FLCC
FSGS
NACP
ITAN
PLDR
DFVE
DVLU
WZRD
GURU
MAGA
QOWZ
ZZZ
PRXV
SAEF
JPUS
AGRW
OVS
BGDV
MMSC
QPX
QARP
QVMM
ALIL
FEAC
RSPE
BBSC
BKSE
FMTM
MPLY
PRVS
FTDS
PRAY
CLCG
QQQA
WLTG
IWFG
TOV
JMID
BKCG
KMID
FUNL
NSCR
FTCE
BOBP
SSPY
SPXV
USCA
OPTZ
ULVM
SQLV
TXS
SPXE
WOMN
DGLO
LGRO
QQLV
CAFG
DEMZ
QNXT
FSST
FCUS
RND
BMVP
FLOW
GGM
PVEX
DSPY
GK
RFFC
RFDA
NEWZ
SPXD
DARP
QSML
BOUT
LCDS
AFSC
NITE
BELT
CDEI
LFEQ
MOTE
DAK
LCF
NUGO
SNPG
SAMM
BEEZ
LIVR
MAVF
GVUS
AVUQ
NZUS
NULC
FDWM
BLCR
DWAW
AVIE
SNPD
SURE
TAX
BKIV
EMOT
XCOR
USRD
DVQQ
JCTR
SPYA
QMID
FCFY
CARK
OCFS
STNC
DVSP
ILCB
ILCG
ILCV
IMCB
IMCG
IMCV
ISCB
ISCG
ISCV
XMMO
XMVM
XSMO
XSVM
UPGD
XMHQ
CSD
POWA
WTV
SPVM
DDIV
DEEP
ROSC
ONEO
UDIV
USPX
INCE
ACSI
SHRY
EQRR
SPMV
XOVR
SZNE
TMFS
TMDV
DWUS
VNSE
ANEW
LOPP
HKND
XVOL
SHUS
QVMS
ACGR
KONG
NDVG
NWLG
SSPX
ESMV
THRO
CVAR
FLDZ
PWRD
LRND
HAPY
GUSA
FMCX
USNZ
DVND
LCLG
JHDV
SRHQ
XNAV
IQSM
RVRB
SNPV
PJFV
WCEO
SNAV
CVSE
EVUS
SUPP
AWEG
HAPS
USCL
PUTD
DIVL
JHAC
LQAI
BWTG
DIVG
HQGO
LGCF
SMCF
PJFM
TXSS
GRPZ
MGRO
MVAL
BGRO
EBIT
LYLD
BDIV
SAWG
SAWS
SCDS
MCDS
VLLU
CNAV
BUYO
SMAP
FMCE
BEEX
NBGX
FLCE
LQPE
PEVC
ABLS
JUSA
HWSM
FLAG
EGLE
LITL
EPMV
EPSB
EPSV
EPMB
CSTK
CCFE
AFOS
ALRG
XOEF
JDVL'''

# ALL U.S EQUITY ETFs as defined by Fidelity

etf_list = all_etfs.split()

start_date = '2015-01-01'
end_date = '2024-12-31'

etf_downloaded = yf.download(
    tickers=etf_list,start=start_date,end=end_date
    ,group_by='ticker'
    )

etf_downloaded

  etf_downloaded = yf.download(
[*********************100%***********************]  916 of 916 completed

87 Failed downloads:
['DGLO', 'HWSM', 'CHGX', 'RUSC', 'AGRW', 'PRXV', 'BASV', 'TSEL', 'LCOW', 'EPMV', 'CLCG', 'AVUQ', 'BKCG', 'EPSB', 'BLGR', 'TCV', 'CCFE', 'TOV', 'JXX', 'RSMV', 'LOGO', 'DAK', 'SEMG', 'SFTY', 'FMTM', 'SPYA', 'BVAL', 'XUDV', 'XOEF', 'DSPY', 'ALRG', 'LVDS', 'EGLE', 'TEXN', 'DIVN', 'ALIL', 'HGRO', 'BLUC', 'CGGG', 'FLAG', 'CSTK', 'AFSC', 'BOBP', 'ASCE', 'BRKC', 'LST', 'EBI', 'XDIV', 'QIDX', 'RILA', 'JUSA', 'CGMM', 'STOX', 'SPXM', 'RNIN', 'SPXD', 'EPSV', 'LGDX', 'LITL', 'EPMB', 'QQWZ', 'TOPC', 'PVEX', 'LSVD', 'MPLY', 'FMKT', 'INVN', 'FFOX', 'BASG', 'PRXG', 'AFOS', 'LCAP', 'WZRD', 'BLUX', 'AAUS', 'LQPE', 'SMDX', 'ASLV', 'CLCV', 'MAVF', 'JDVL', 'GEND', 'GQGU', 'ABIG', 'ABLS', 'PEVC', 'CGVV']: YFPricesMissingError('possibly delisted; no price data found  (1d 2015-01-01 -> 2024-12-31) (Yahoo error = "Data doesn\'t exist for startDate = 1420088400, endDate = 1735621200")')


Ticker,PRAY,PRAY,PRAY,PRAY,PRAY,QQQA,QQQA,QQQA,QQQA,QQQA,...,CGDV,CGDV,CGDV,CGDV,CGDV,DWAS,DWAS,DWAS,DWAS,DWAS
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,,,,,,,,,,,...,,,,,,37.481188,37.481188,36.384413,36.784973,302800
2015-01-05,,,,,,,,,,,...,,,,,,36.680068,36.899423,36.279507,36.403488,114700
2015-01-06,,,,,,,,,,,...,,,,,,36.403489,36.432103,35.363938,35.697739,158100
2015-01-07,,,,,,,,,,,...,,,,,,36.145992,36.556092,35.783578,36.355808,31800
2015-01-08,,,,,,,,,,,...,,,,,,36.565619,37.099704,36.565619,37.061554,122500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-23,29.780001,29.780001,28.940001,29.160999,3700.0,46.338919,46.558678,46.149129,46.558678,2600.0,...,35.174357,35.406675,35.006294,35.372074,2514800.0,92.638414,92.638414,91.179543,92.008904,44500
2024-12-24,29.252001,29.306999,29.170000,29.306999,2400.0,47.048134,47.487652,47.048134,47.487652,1200.0,...,35.441279,35.648888,35.362193,35.648888,1180700.0,92.258715,92.518509,91.399379,92.518509,22800
2024-12-26,29.410000,29.610001,29.320000,29.450001,12300.0,47.237927,47.377772,47.237927,47.317837,2500.0,...,35.534608,35.763162,35.504798,35.673729,1726500.0,92.568470,93.787529,92.028886,93.787529,39800
2024-12-27,29.045000,29.238001,29.045000,29.238001,2100.0,46.732484,46.734480,46.408839,46.698521,3100.0,...,35.534608,35.584296,35.251405,35.445175,2453900.0,93.287921,93.287921,90.939737,91.978935,32900


In [None]:
# Checking the amount of tickers that made it through to the dataset
ticker_list = etf_downloaded.columns.get_level_values(0).unique().tolist()
print(len(ticker_list)) # 916 columns

# ad_close stores close prices
adj_close = etf_downloaded.xs('Close', level=1, axis=1)
print(len(adj_close.columns)) # still 916 columns

# dropping all tickers that have a null first value
# this means that all tickers that don't exist at the start date will be dropped
# and all tickers that failed to download will be dropped
adj_close = adj_close.dropna(axis=1, subset=[adj_close.index[0]])

print(len(adj_close.columns)) # 209 columns remain 


916
916
209


In [None]:
# Double-checking all tickers are recognized by yfinance

tickers = adj_close.columns

def split_supported(tickers):
    ok, bad = [], []
    for t in tickers:
        try:
            # cheap existence check: last day of data if Yahoo knows the symbol
            if not yf.Ticker(t).history(period="1d", auto_adjust=False).empty:
                ok.append(t)
            else:
                bad.append(t)
        except Exception:
            bad.append(t)
    return ok, bad

ok_tickers, bad_tickers = split_supported(tickers)
print(f"Supported on Yahoo: {len(ok_tickers)} | Not found: {len(bad_tickers)}")

# Results:
# 1 ticker not found - CSD, Rest of the 208 are yfinance supported
# Upon program re-run, csd has been found

# Previous results with 87 failed to download: 10 tickers were not recgonized by Yahoo Finance (ABLS,ALRG,CSTK,EGLE,EPMB,EPMV,EPSB,EPSV,LQPE,PEVC), 77 tickers were recognized by Yahoo Finance


Supported on Yahoo: 209 | Not found: 0


In [59]:
# More cleaning and data inspection 

# adj_close = adj_close.drop(columns="CSD")
# tickers = adj_close.columns
# print(len(tickers))

new_etf = etf_downloaded[tickers]

print(adj_close.isna().sum().sum())

for field in ['Open','Close','High','Low','Volume']:
    sub = new_etf.xs(field, level=1, axis=1)
    print(f"\n--- {field} ---")
    print("Shape:", sub.shape)
    print("Total NaNs:", sub.isna().sum().sum())
    print("Tickers with NaNs:", (sub.isna().sum() > 0).sum())

# There are no null values anywhere


0

--- Open ---
Shape: (2515, 209)
Total NaNs: 0
Tickers with NaNs: 0

--- Close ---
Shape: (2515, 209)
Total NaNs: 0
Tickers with NaNs: 0

--- High ---
Shape: (2515, 209)
Total NaNs: 0
Tickers with NaNs: 0

--- Low ---
Shape: (2515, 209)
Total NaNs: 0
Tickers with NaNs: 0

--- Volume ---
Shape: (2515, 209)
Total NaNs: 0
Tickers with NaNs: 0


In [None]:
# Get all Equity ETFs downloaded from yfinance



# What cap sizes do I want to be looking at? All of them? 
# All caps: Large, Mega, Mid, Small, Micro, Multi
# Using: Large, Mega, Mid, Small, Multi

# Isolate liquidity as an aspect in all ETFs
# Filter to only allow liquidity of $X+ per day 
