In [1]:
####################################################################
### Determining Granger Causal Flow for S&P 500 - Andras Horvath ###
###                 Data collection 2019-2021                    ###
####################################################################

# Here, I collect all stock ticker data for S&P 500 individual stocks. 
# This will then be saved as a csv file and opened in R for analysis.
# The following link was instrumental in helping me understand how to
# do this in Python:
# https://www.youtube.com/watch?v=Zt2PuC7iXEU
# pip install yfinance

In [1]:
import pandas as pd
import yfinance as yf

In [2]:
# Wikipedia page that lists the current S&P 500 constituents.
sp500url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# read_html returns a list holding one DataFrame per HTML table found on the page.
data_table = pd.read_html(sp500url)
# This gets us the stocks which make up the S&P 500 as of the retrieval date (3/31/2024),
# NOT the constituents during the 2019-2021 price window downloaded further down.
# Tickers without price data for the full window therefore have to be removed later on.

In [3]:
data_table

[    Symbol            Security             GICS Sector  \
 0      MMM                  3M             Industrials   
 1      AOS         A. O. Smith             Industrials   
 2      ABT              Abbott             Health Care   
 3     ABBV              AbbVie             Health Care   
 4      ACN           Accenture  Information Technology   
 ..     ...                 ...                     ...   
 498    XYL          Xylem Inc.             Industrials   
 499    YUM         Yum! Brands  Consumer Discretionary   
 500   ZBRA  Zebra Technologies  Information Technology   
 501    ZBH       Zimmer Biomet             Health Care   
 502    ZTS              Zoetis             Health Care   
 
                                 GICS Sub-Industry    Headquarters Location  \
 0                        Industrial Conglomerates    Saint Paul, Minnesota   
 1                               Building Products     Milwaukee, Wisconsin   
 2                           Health Care Equipment  N

In [4]:
len(data_table) # There are two tables. We want the first one.

2

In [5]:
data_table[0] # We just need the Symbol to extract Tickers.

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components,"White Plains, New York",2011-11-01,1524472,2011
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
500,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927


In [6]:
# Keep only the ticker symbols from the constituents table, as a plain Python list.
# NOTE: the table writes share classes with a dot ('BRK.B', 'BF.B'); the Yahoo downloads
# further down fail for those spellings and only succeed with a dash ('BRK-B', 'BF-B').
tickers = data_table[0]['Symbol'].tolist()

In [7]:
tickers

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BK',
 'BBWI',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'BIO',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF.B',
 'BLDR',
 'BG',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CTLT',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CHRW',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA'

In [8]:
len(tickers) #503 stocks

503

In [9]:
# This is just a test: download one ticker before requesting the whole index.
# auto_adjust=False is passed explicitly because newer yfinance releases default to
# auto_adjust=True, which removes the 'Adj Close' column and makes the lookup fail.
yf.download('MMM', start="2022-01-19", end="2024-03-28", auto_adjust=False)['Adj Close']

[*********************100%%**********************]  1 of 1 completed


Date
2022-01-19    158.821091
2022-01-20    154.451920
2022-01-21    153.633286
2022-01-24    153.766754
2022-01-25    154.612091
                 ...    
2024-03-21    107.870003
2024-03-22    106.779999
2024-03-25    104.839996
2024-03-26    102.629997
2024-03-27    104.589996
Name: Adj Close, Length: 550, dtype: float64

In [11]:
# There is no single right choice for the cutoffs, but the data has to start sometime in summer 2019.
# The earliest possible date of the summer solstice is June 20, so we start on 2019-06-20 and cover two years.
# `end` is exclusive, so the last trading day returned is 2021-06-18.
# auto_adjust=False is passed explicitly because newer yfinance releases default to
# auto_adjust=True, which removes the 'Adj Close' column and makes the lookup fail.
snp_prices = yf.download(tickers, start="2019-06-20", end="2021-06-19", auto_adjust=False)['Adj Close']

[*********************100%%**********************]  503 of 503 completed

6 Failed downloads:
['VLTO', 'GEHC', 'CEG', 'KVUE']: Exception("%ticker%: Data doesn't exist for startDate = 1561003200, endDate = 1624075200")
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2019-06-20 -> 2021-06-19)')
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')


In [12]:
snp_prices

Ticker,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WY,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-20,70.888229,31.816221,48.189846,63.015671,,78.410408,36.419998,174.702164,302.109985,102.974174,...,21.926628,118.062202,53.386971,60.108280,54.283623,78.647858,101.477798,111.916092,201.119995,109.347824
2019-06-21,70.907562,31.242243,48.025558,63.369598,,78.465820,36.439999,173.802994,299.329987,103.989677,...,21.588152,117.146248,53.587254,60.955654,55.654186,77.193863,101.156731,110.588547,200.110001,109.260712
2019-06-24,70.714195,31.489647,47.977242,63.104156,,78.198097,36.180000,174.318192,299.869995,103.285225,...,21.323977,117.647606,53.352131,60.375050,54.510468,77.514862,101.514503,109.775162,200.039993,109.831810
2019-06-25,70.105103,30.935461,47.250031,52.848217,,77.764221,35.910000,173.184830,287.970001,100.888359,...,21.101080,114.697258,53.108322,59.841515,54.718422,77.005035,101.193428,109.102058,197.529999,110.209305
2019-06-26,69.747360,31.133385,48.271992,54.698318,,76.582626,35.270000,171.695557,288.720001,101.428116,...,21.010271,116.914848,51.767338,60.100441,54.453758,76.778427,101.019142,107.110718,203.399994,109.425270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-14,141.860779,22.990000,128.461395,103.330650,149.210007,105.205254,39.139999,274.852844,556.950012,160.167252,...,30.026915,123.683609,63.417900,55.598953,62.130859,113.460617,112.262230,149.504395,509.920013,180.723587
2021-06-15,142.086761,22.790001,127.634369,103.715675,151.779999,105.138588,39.529999,275.314667,548.460022,159.387512,...,30.148661,124.129059,63.518879,57.623341,63.302952,114.253990,112.747177,150.254028,506.100006,181.790817
2021-06-16,141.074783,22.830000,128.136444,103.447052,149.149994,104.805290,39.389999,273.150177,543.330017,157.866135,...,29.896481,124.178535,62.206100,57.417316,62.947495,111.738449,112.338303,149.959854,506.700012,180.263412
2021-06-17,144.346451,22.230000,129.751099,102.882942,150.699997,106.300331,38.509998,273.948578,551.359985,158.056290,...,29.600813,124.030060,62.747742,55.527287,62.553596,110.238792,111.102142,152.199356,502.089996,182.544739


In [13]:
# The download reported 6 failed tickers (three different error messages). Let's take care of them.
# First step: flag, per ticker column, whether it contains any missing value.
snp_prices.isna().any()

Ticker
A       False
AAL     False
AAPL    False
ABBV    False
ABNB     True
        ...  
XYL     False
YUM     False
ZBH     False
ZBRA    False
ZTS     False
Length: 503, dtype: bool

In [15]:
# Show only the columns that contain at least one null value.
snp_prices.loc[:,snp_prices.isna().any()]
# Look back at the website where the tickers came from ("Date added" column):
# ABNB added on 2023-09-18
# BF.B added on 1982-10-31
# BRK.B added on 2010-02-16
# CARR added on 2020-04-03
# CEG added on 2022-02-02
# GEHC added on 2023-01-04
# KVUE added on 2023-08-25
# OTIS added on 2020-04-03
# VLTO added on 2023-10-02

# All of these were added to the S&P 500 after our start date except for BF.B and BRK.B. We should be able to get data for those two.
# NOTE(review): the index-addition date by itself does not decide whether prices exist (ZBRA was added on
# 2019-12-23 yet has prices from 2019-06-20); presumably the other seven only began trading after 2019-06-20 - confirm.

Ticker,ABNB,BF.B,BRK.B,CARR,CEG,GEHC,KVUE,OTIS,VLTO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-06-20,,,,,,,,,
2019-06-21,,,,,,,,,
2019-06-24,,,,,,,,,
2019-06-25,,,,,,,,,
2019-06-26,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
2021-06-14,149.210007,,,44.400784,,,,77.696190,
2021-06-15,151.779999,,,44.708256,,,,77.878876,
2021-06-16,149.149994,,,45.073380,,,,77.532707,
2021-06-17,150.699997,,,44.323914,,,,77.378853,


In [16]:
# Remove the aforementioned stocks: the seven tickers whose prices do not cover the whole window.
# One drop call with errors='ignore' keeps this cell safe to re-run; dropping a column that is
# already gone would otherwise raise a KeyError.
incomplete_history = ['ABNB', 'CARR', 'CEG', 'GEHC', 'KVUE', 'OTIS', 'VLTO']
snp_prices = snp_prices.drop(columns=incomplete_history, errors='ignore')
print(snp_prices)

Ticker               A        AAL        AAPL        ABBV         ABT  \
Date                                                                    
2019-06-20   70.888229  31.816221   48.189846   63.015671   78.410408   
2019-06-21   70.907562  31.242243   48.025558   63.369598   78.465820   
2019-06-24   70.714195  31.489647   47.977242   63.104156   78.198097   
2019-06-25   70.105103  30.935461   47.250031   52.848217   77.764221   
2019-06-26   69.747360  31.133385   48.271992   54.698318   76.582626   
...                ...        ...         ...         ...         ...   
2021-06-14  141.860779  22.990000  128.461395  103.330650  105.205254   
2021-06-15  142.086761  22.790001  127.634369  103.715675  105.138588   
2021-06-16  141.074783  22.830000  128.136444  103.447052  104.805290   
2021-06-17  144.346451  22.230000  129.751099  102.882942  106.300331   
2021-06-18  142.469955  22.290001  128.441681  101.289108  105.062424   

Ticker           ACGL         ACN        ADBE     

In [17]:
snp_prices.loc[:,snp_prices.isna().any()]

Ticker,BF.B,BRK.B
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-06-20,,
2019-06-21,,
2019-06-24,,
2019-06-25,,
2019-06-26,,
...,...,...
2021-06-14,,
2021-06-15,,
2021-06-16,,
2021-06-17,,


In [18]:
# auto_adjust=False is explicit: newer yfinance releases default to auto_adjust=True and drop 'Adj Close'.
yf.download('BF-B', auto_adjust=False)["Adj Close"] # There is clearly data for this stock.

[*********************100%%**********************]  1 of 1 completed


Date
1980-03-17     0.195423
1980-03-18     0.195423
1980-03-19     0.198038
1980-03-20     0.198691
1980-03-21     0.196077
                ...    
2024-03-22    50.900002
2024-03-25    50.860001
2024-03-26    51.000000
2024-03-27    52.040001
2024-03-28    51.619999
Name: Adj Close, Length: 11102, dtype: float64

In [19]:
# auto_adjust=False is explicit: newer yfinance releases default to auto_adjust=True and drop 'Adj Close'.
yf.download('BRK-B', auto_adjust=False)["Adj Close"] # There is clearly data for this stock.

[*********************100%%**********************]  1 of 1 completed


Date
1996-05-09     23.200001
1996-05-10     24.000000
1996-05-13     23.900000
1996-05-14     23.600000
1996-05-15     23.200001
                 ...    
2024-03-22    411.600006
2024-03-25    409.920013
2024-03-26    411.570007
2024-03-27    416.929993
2024-03-28    420.519989
Name: Adj Close, Length: 7019, dtype: float64

In [20]:
# The ticker list still has 503 entries; without the seven incomplete tickers it should have 496.
# Filtering instead of calling list.remove() seven times keeps this cell safe to re-run:
# remove() raises a ValueError once a ticker is already gone.
tickers_to_exclude = {'ABNB', 'CARR', 'CEG', 'GEHC', 'KVUE', 'OTIS', 'VLTO'}
tickers = [ticker for ticker in tickers if ticker not in tickers_to_exclude]
len(tickers) # 496

496

In [21]:
# Swap the two dotted share-class symbols for the dashed spelling that the
# single-ticker downloads above succeeded with. The list is updated in place.
yahoo_symbols = {'BRK.B': 'BRK-B', 'BF.B': 'BF-B'}
tickers[:] = [yahoo_symbols.get(symbol, symbol) for symbol in tickers]

In [22]:
'BF-B' in tickers

True

In [23]:
'BRK-B' in tickers # Yep! Both are in the list!

True

In [24]:
# Download again with the cleaned ticker list (same window; `end` is exclusive).
# auto_adjust=False is passed explicitly because newer yfinance releases default to
# auto_adjust=True, which removes the 'Adj Close' column and makes the lookup fail.
snp_prices = yf.download(tickers, start="2019-06-20", end="2021-06-19", auto_adjust=False)['Adj Close']

[*********************100%%**********************]  496 of 496 completed


In [25]:
snp_prices

Ticker,A,AAL,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WY,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-20,70.888237,31.816221,48.189850,63.015667,78.410423,36.419998,174.702179,302.109985,102.974197,36.416424,...,21.926632,118.062202,53.386971,60.108288,54.283619,78.647858,101.477814,111.916077,201.119995,109.347832
2019-06-21,70.907555,31.242243,48.025558,63.369602,78.465790,36.439999,173.803024,299.329987,103.989662,36.222450,...,21.588156,117.146240,53.587246,60.955650,55.654179,77.193855,101.156723,110.588539,200.110001,109.260704
2019-06-24,70.714195,31.489649,47.977234,63.104141,78.198097,36.180000,174.318161,299.869995,103.285225,35.913830,...,21.323977,117.647606,53.352135,60.375053,54.510471,77.514862,101.514496,109.775169,200.039993,109.831795
2019-06-25,70.105087,30.935461,47.250023,52.848217,77.764221,35.910000,173.184799,287.970001,100.888336,35.640480,...,21.101080,114.697258,53.108334,59.841530,54.718426,77.005020,101.193413,109.102066,197.529999,110.209297
2019-06-26,69.747368,31.133385,48.271984,54.698303,76.582626,35.270000,171.695541,288.720001,101.428116,35.711018,...,21.010267,116.914856,51.767338,60.100433,54.453758,76.778427,101.019142,107.110733,203.399994,109.425262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-14,141.860794,22.990000,128.461380,103.330643,105.205246,39.139999,274.852875,556.950012,160.167252,61.264683,...,30.026915,123.683601,63.417896,55.598942,62.130867,113.460617,112.262238,149.504379,509.920013,180.723572
2021-06-15,142.086761,22.790001,127.634369,103.715675,105.138588,39.529999,275.314667,548.460022,159.387543,60.719860,...,30.148653,124.129044,63.518887,57.623341,63.302956,114.253983,112.747192,150.254028,506.100006,181.790802
2021-06-16,141.074814,22.830000,128.136459,103.447037,104.805283,39.389999,273.150177,543.330017,157.866119,60.109276,...,29.896481,124.178543,62.206093,57.417316,62.947491,111.738449,112.338295,149.959854,506.700012,180.263382
2021-06-17,144.346512,22.230000,129.751099,102.882919,106.300339,38.509998,273.948578,551.359985,158.056305,58.174210,...,29.600822,124.030067,62.747734,55.527294,62.553581,110.238800,111.102142,152.199341,502.089996,182.544754


In [26]:
snp_prices[['BRK-B', 'BF-B']]

Ticker,BRK-B,BF-B
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-06-20,208.039993,51.822208
2019-06-21,206.210007,51.532333
2019-06-24,206.850006,51.672592
2019-06-25,208.070007,52.093395
2019-06-26,208.509995,51.345322
...,...,...
2021-06-14,283.529999,72.235756
2021-06-15,283.940002,72.359795
2021-06-16,283.109985,71.749168
2021-06-17,278.690002,71.348450


In [27]:
len(snp_prices.columns) # 496

496

In [29]:
# Select the columns that still contain missing values; an empty selection means the data is complete.
snp_prices.loc[:,snp_prices.isna().any()] # All is well! No columns come back, only the date index.

Ticker
Date
2019-06-20
2019-06-21
2019-06-24
2019-06-25
2019-06-26
...
2021-06-14
2021-06-15
2021-06-16
2021-06-17


In [30]:
# Save the cleaned price table for the analysis in R; the column header and the
# date index are written by default.
snp_prices.to_csv('All S&P 500 Stocks 2019-2021.csv')