In [1]:
####################################################################
### Determining Granger Causal Flow for S&P 500 - Andras Horvath ###
###             Data collection (Sectors) 2019-2021              ###
####################################################################

# This notebook collects the GICS sector of every individual S&P 500 stock.
# The resulting ticker/sector table is saved to a CSV file for later use.
import pandas as pd
import yfinance as yf

# Wikipedia page "List of S&P 500 companies"; reading it requires network access.
sp500url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# pd.read_html parses the HTML tables on the page and returns them as a list of DataFrames.
data_table = pd.read_html(sp500url)

# Show everything that was parsed (a list of DataFrames).
data_table

[    Symbol            Security             GICS Sector  \
 0      MMM                  3M             Industrials   
 1      AOS         A. O. Smith             Industrials   
 2      ABT              Abbott             Health Care   
 3     ABBV              AbbVie             Health Care   
 4      ACN           Accenture  Information Technology   
 ..     ...                 ...                     ...   
 498    XYL          Xylem Inc.             Industrials   
 499    YUM         Yum! Brands  Consumer Discretionary   
 500   ZBRA  Zebra Technologies  Information Technology   
 501    ZBH       Zimmer Biomet             Health Care   
 502    ZTS              Zoetis             Health Care   
 
                                 GICS Sub-Industry    Headquarters Location  \
 0                        Industrial Conglomerates    Saint Paul, Minnesota   
 1                               Building Products     Milwaukee, Wisconsin   
 2                           Health Care Equipment  N

In [2]:
# pd.read_html returned two tables (see output); only the first one, data_table[0], is used below.
len(data_table)

2

In [3]:
# Inspect the first table. Only its 'Symbol' and 'GICS Sector' columns are used in later cells.
data_table[0]

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components,"White Plains, New York",2011-11-01,1524472,2011
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
500,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927


In [4]:
# Ticker symbols from the 'Symbol' column of the first table, as a plain Python list.
tickers = data_table[0].loc[:, 'Symbol'].to_list()

In [5]:
# Display the extracted ticker symbols.
tickers

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BK',
 'BBWI',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'BIO',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF.B',
 'BLDR',
 'BG',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CTLT',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CHRW',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA'

In [6]:
# Sanity check: `.tolist()` produced a built-in Python list (see output).
type(tickers)

list

In [7]:
# GICS sector of each stock, in the same row order as the ticker symbols.
sectors = data_table[0].loc[:, "GICS Sector"].to_list()

In [8]:
# Display the extracted sector names (one entry per ticker).
sectors

['Industrials',
 'Industrials',
 'Health Care',
 'Health Care',
 'Information Technology',
 'Information Technology',
 'Information Technology',
 'Utilities',
 'Financials',
 'Health Care',
 'Materials',
 'Consumer Discretionary',
 'Information Technology',
 'Materials',
 'Real Estate',
 'Health Care',
 'Industrials',
 'Utilities',
 'Financials',
 'Communication Services',
 'Communication Services',
 'Consumer Staples',
 'Consumer Discretionary',
 'Materials',
 'Utilities',
 'Industrials',
 'Utilities',
 'Financials',
 'Financials',
 'Real Estate',
 'Utilities',
 'Financials',
 'Industrials',
 'Health Care',
 'Information Technology',
 'Information Technology',
 'Information Technology',
 'Financials',
 'Energy',
 'Information Technology',
 'Information Technology',
 'Consumer Discretionary',
 'Financials',
 'Consumer Staples',
 'Information Technology',
 'Financials',
 'Financials',
 'Communication Services',
 'Utilities',
 'Information Technology',
 'Industrials',
 'Consumer Discreti

In [9]:
import numpy as np

# Wrap the ticker list in a one-column DataFrame; the column is auto-labelled 0.
Tickers = pd.DataFrame(data=tickers)
print(Tickers)

        0
0     MMM
1     AOS
2     ABT
3    ABBV
4     ACN
..    ...
498   XYL
499   YUM
500  ZBRA
501   ZBH
502   ZTS

[503 rows x 1 columns]


In [10]:
#np.sort(Tickers)

In [10]:
# Wrap the sector list in a one-column DataFrame; the column is auto-labelled 0.
Sectors = pd.DataFrame(data=sectors)
print(Sectors)

                          0
0               Industrials
1               Industrials
2               Health Care
3               Health Care
4    Information Technology
..                      ...
498             Industrials
499  Consumer Discretionary
500  Information Technology
501             Health Care
502             Health Care

[503 rows x 1 columns]


In [11]:
# Place tickers and sectors side by side. Both inputs have a single column labelled 0,
# so the combined frame ends up with two columns that share that label.
Cat = pd.concat([Tickers, Sectors], axis="columns")
display(Cat)

Unnamed: 0,0,0.1
0,MMM,Industrials
1,AOS,Industrials
2,ABT,Health Care
3,ABBV,Health Care
4,ACN,Information Technology
...,...,...
498,XYL,Industrials
499,YUM,Consumer Discretionary
500,ZBRA,Information Technology
501,ZBH,Health Care


In [12]:
# Total number of cells in Cat (rows x columns).
Cat.size

1006

In [13]:
# Convert `Cat` into a 2-D NumPy array with one [ticker, sector] row per stock.
# np.asarray accepts both the DataFrame built in the previous cell and the ndarray
# this cell produces, so the cell can be re-run safely. The earlier `Cat.values`
# form raised AttributeError on a second run because `Cat` was already an ndarray.
cat_flat = np.asarray(Cat).reshape(-1)  # flatten row by row: ticker, sector, ticker, ...
print(cat_flat.shape)
Cat = cat_flat.reshape(-1, 2)  # regroup into (n_stocks, 2)
print(Cat.shape)
print(Cat)

(1006,)
(503, 2)
[['MMM' 'Industrials']
 ['AOS' 'Industrials']
 ['ABT' 'Health Care']
 ...
 ['ZBRA' 'Information Technology']
 ['ZBH' 'Health Care']
 ['ZTS' 'Health Care']]


In [14]:
# Rebuild a DataFrame from the (n_stocks, 2) array so that the two columns get the
# distinct labels 0 (ticker) and 1 (sector). The labels are left numeric because
# later cells sort on column 0.
Categories = pd.DataFrame(Cat)
display(Categories)

Unnamed: 0,0,1
0,MMM,Industrials
1,AOS,Industrials
2,ABT,Health Care
3,ABBV,Health Care
4,ACN,Information Technology
...,...,...
498,XYL,Industrials
499,YUM,Consumer Discretionary
500,ZBRA,Information Technology
501,ZBH,Health Care


In [15]:
# Preview the rows ordered by ticker (column 0); the sorted result is not stored in this cell.
Categories.sort_values(0)

Unnamed: 0,0,1
9,A,Health Care
25,AAL,Industrials
39,AAPL,Information Technology
3,ABBV,Health Care
11,ABNB,Consumer Discretionary
...,...,...
498,XYL,Industrials
499,YUM,Consumer Discretionary
501,ZBH,Health Care
500,ZBRA,Information Technology


In [16]:
# Order the stocks by ticker (column 0), then flip the table so that each stock becomes
# a column: row 0 holds the tickers and row 1 the sectors.
CATEGORIES = Categories.sort_values(by=0).T

In [17]:
# Positions in `tickers` of three of the symbols that are removed by hand from the CSV
# later on (left commented out; the trailing numbers are presumably the recorded results).
#tickers.index("GEHC") # 212
#tickers.index("KVUE") # 273
#tickers.index("VLTO") # 468
# Display the transposed table: row 0 holds the tickers, row 1 the sectors.
CATEGORIES

Unnamed: 0,9,25,39,3,11,2,42,4,5,35,...,492,496,497,190,146,498,499,501,500,502
0,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WY,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZTS
1,Health Care,Industrials,Information Technology,Health Care,Consumer Discretionary,Health Care,Financials,Information Technology,Information Technology,Information Technology,...,Real Estate,Consumer Discretionary,Utilities,Energy,Health Care,Industrials,Consumer Discretionary,Health Care,Information Technology,Health Care


In [18]:
# Write the transposed ticker/sector table to disk. The header row (original row labels)
# and the index column (0 and 1) are both written to the file.
CATEGORIES.to_csv(
    'All S&P 500 Stocks Categories 2019-2021.csv',
    index=True,
    header=True,
)

In [None]:
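# Manual clean-up of the CSV file written above:
# 1. Remove the following tickers from the CSV file:
#    ABNB
#    CARR
#    CEG
#    GEHC
#    KVUE
#    OTIS
#    VLTO
# 2. Remove the first column and the top row (the index and the header written by `to_csv`).
# 3. Change the "." to "-" in the tickers BF.B and BRK.B (i.e. BF-B and BRK-B).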