# < Creating S&P500 Components List >

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import copy

In [2]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

### df를 정의할때에는 copy()를 사용한다.

In [3]:
spDf = data[0].copy()

### head(), tail(), sample()을 사용해서 데이터 내용을 확인한다.

In [4]:
spDf.head()
spDf.tail()
spDf.sample(int(len(spDf)*.05))
spDf.sample(frac=.005)

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
136,DVA,DaVita,reports,Health Care,Health Care Facilities,"Denver, Colorado",2008-07-31,927066,1979
67,BIIB,Biogen,reports,Health Care,Biotechnology,"Cambridge, Massachusetts",2003-11-13,875045,1978
360,PH,Parker-Hannifin,reports,Industrials,Industrial Machinery,"Cleveland, Ohio",1985-11-30,76334,1917


### 데이터 구조를 확인한다.

In [5]:
spDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Symbol                 505 non-null    object
 1   Security               505 non-null    object
 2   SEC filings            505 non-null    object
 3   GICS Sector            505 non-null    object
 4   GICS Sub-Industry      505 non-null    object
 5   Headquarters Location  505 non-null    object
 6   Date first added       457 non-null    object
 7   CIK                    505 non-null    int64 
 8   Founded                505 non-null    object
dtypes: int64(1), object(8)
memory usage: 35.6+ KB


### 결측치를 확인한다.

In [6]:
spDf.isna().any()

Symbol                   False
Security                 False
SEC filings              False
GICS Sector              False
GICS Sub-Industry        False
Headquarters Location    False
Date first added          True
CIK                      False
Founded                  False
dtype: bool

In [7]:
spDf.isna().all()

Symbol                   False
Security                 False
SEC filings              False
GICS Sector              False
GICS Sub-Industry        False
Headquarters Location    False
Date first added         False
CIK                      False
Founded                  False
dtype: bool

In [8]:
print(f'NA length : {len(spDf.loc[spDf.isnull().any(axis=1)])}')
spDf.loc[spDf.isnull().any(axis=1)].sample(5)

NA length : 48


Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
324,MS,Morgan Stanley,reports,Financials,Investment Banking & Brokerage,"New York City, New York",,895421,1935
391,QCOM,Qualcomm,reports,Information Technology,Semiconductors,"San Diego, California",,804328,1985
307,MCK,McKesson Corporation,reports,Health Care,Health Care Distributors,"Irving, Texas",,927653,1833
361,PAYX,Paychex,reports,Information Technology,Data Processing & Outsourced Services,"Penfield, New York",,723531,1971
441,TXN,Texas Instruments,reports,Information Technology,Semiconductors,"Dallas, Texas",,97476,1930


### 결측치를 대체값으로 수정한다.

In [9]:
spDf.loc[:, 'Date first added'].fillna(datetime(1900,1,1).strftime('%Y-%m-%d'), inplace=True)
spDf.loc[spDf['Date first added']==datetime(1900,1,1).strftime('%Y-%m-%d')]

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
7,AMD,Advanced Micro Devices,reports,Information Technology,Semiconductors,"Santa Clara, California",1900-01-01,2488,1969
122,ED,Consolidated Edison,reports,Utilities,Electric Utilities,"New York City, New York",1900-01-01,1047862,1823
126,GLW,Corning,reports,Information Technology,Electronic Components,"Corning, New York",1900-01-01,24741,1851
134,DHR,Danaher Corporation,reports,Health Care,Health Care Equipment,"Washington, D.C.",1900-01-01,313616,1969
135,DRI,Darden Restaurants,reports,Consumer Discretionary,Restaurants,"Orlando, Florida",1900-01-01,940944,1938
150,D,Dominion Energy,reports,Utilities,Electric Utilities,"Richmond, Virginia",1900-01-01,715957,1983
160,ETN,Eaton Corporation,reports,Industrials,Electrical Components & Equipment,"Dublin, Ireland",1900-01-01,1551182,1911
177,ES,Eversource Energy,reports,Utilities,Multi-Utilities,"Hartford, Connecticut",1900-01-01,72741,1966
190,FITB,Fifth Third Bancorp,reports,Financials,Regional Banks,"Cincinnati, Ohio",1900-01-01,35527,1858
191,FE,FirstEnergy,reports,Utilities,Electric Utilities,"Akron, Ohio",1900-01-01,1031296,1997


In [10]:
spDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Symbol                 505 non-null    object
 1   Security               505 non-null    object
 2   SEC filings            505 non-null    object
 3   GICS Sector            505 non-null    object
 4   GICS Sub-Industry      505 non-null    object
 5   Headquarters Location  505 non-null    object
 6   Date first added       505 non-null    object
 7   CIK                    505 non-null    int64 
 8   Founded                505 non-null    object
dtypes: int64(1), object(8)
memory usage: 35.6+ KB


### 유일한 값을 확인한다.

In [11]:
sf = spDf.loc[:,'SEC filings'].unique().tolist()
gs = spDf.loc[:,'GICS Sector'].unique().tolist()
print(f"SEC filing : ({len(sf)}) {sf}")
print(f"GICS Sector : ({len(gs)}) {gs}")

SEC filing : (1) ['reports']
GICS Sector : (11) ['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 'Consumer Staples', 'Energy']


### 데이터 양식이 통일되어 있는지 확인한다.

In [12]:
spDf[~spDf['Date first added'].str.match('\d{4}-\d{2}-\d{2}')]

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded


In [13]:
spDf[~spDf['Headquarters Location'].str.contains(', ')]

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded


### 데이터 컬럼을 정리하고 결측치를 확인한다.

In [14]:
spAjDf=data[1].copy()
spAjDf.columns = ['date', 'added ticker', 'added security', 'removed ticker', 'removed security', 'reason']
spAjDf.loc[spAjDf.isnull().any(axis=1)]
# spAjDf.info()

Unnamed: 0,date,added ticker,added security,removed ticker,removed security,reason
1,"June 4, 2021",,,HFC,HollyFrontier,Market capitalization change.[7]
2,"June 3, 2021",OGN,Organon & Co.,,,S&P 500/100 constituent Merck & Co. spun off O...
13,"October 12, 2020",,,NBL,Noble Energy,Chevron acquired Noble Energy.[15]
14,"October 9, 2020",VNT,Vontier,,,S&P 500 constituent Fortive spun off Vontier.[15]
25,"April 6, 2020",,,M,Macy's,Market capitalization change.[21]
26,"April 6, 2020",,,RTN,Raytheon Company,United Technologies spun off Otis and Carrier ...
27,"April 3, 2020",OTIS,Otis Worldwide,,,United Technologies spun off Otis and Carrier ...
28,"April 3, 2020",CARR,Carrier Global,,,United Technologies spun off Otis and Carrier ...
121,"April 8, 2016",UA,Under Armour (Class C),,,Under Armour distribution of second class of s...
133,"December 15, 2015",,,CMCSK,Comcast K Corp,CMCSK shares no longer listed[102]


### pd.melt()를 사용해서 특정 열을 stack한다. 

In [15]:
adDf = spAjDf[['date', 'added ticker', 'added security']]
adDf = pd.melt(adDf, id_vars=['date', 'added security'], var_name='attribute', value_name='ticker')
adDf.columns = ['date', 'security', 'attribute', 'ticker']
reDf = spAjDf[['date', 'removed ticker', 'removed security']]
reDf = pd.melt(reDf, id_vars=['date', 'removed security'], var_name='attribute', value_name='ticker')
reDf.columns = ['date', 'security', 'attribute', 'ticker']
spAjDf = pd.concat([adDf, reDf], axis=0)

### 결측치가 있는 행을 제거한다.

In [16]:
spAjDf.dropna(inplace=True)
spAjDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533 entries, 0 to 273
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       533 non-null    object
 1   security   533 non-null    object
 2   attribute  533 non-null    object
 3   ticker     533 non-null    object
dtypes: object(4)
memory usage: 20.8+ KB


### 날짜 양식을 수정한다.

In [17]:
spAjDf['date'] = spAjDf['date'].apply(lambda x : pd.to_datetime(x, format='%B %d, %Y').strftime('%Y-%m-%d'))

### 데이터 순서를 변경한다.

In [18]:
spAjDf = spAjDf[['date', 'security', 'ticker', 'attribute']]

In [19]:
spDf = spDf[['Date first added', 'Security', 'Symbol']]
spDf = spDf.assign(attribute = 'added ticker')
spDf.columns = ['date', 'security', 'ticker', 'attribute']

### pd.concat()을 사용하여 데이터를 병합한다.

In [22]:
finalDf = pd.concat([spDf, spAjDf])
finalDf = finalDf.reset_index(drop=True)
finalDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       1038 non-null   object
 1   security   1038 non-null   object
 2   ticker     1038 non-null   object
 3   attribute  1038 non-null   object
dtypes: object(4)
memory usage: 32.6+ KB


### 특이한 날짜 양식을 수정한다.

In [23]:
print(finalDf.loc[finalDf['date'].str.contains("[(]"), 'date'])
finalDf.loc[finalDf['date'].str.contains(" "), 'date'] = '1983-11-30'
finalDf[finalDf['date'].str.contains(" ")]['date']

51    1983-11-30 (1957-03-04)
Name: date, dtype: object


Series([], Name: date, dtype: object)

### 날짜 양식을 datetime으로 변경한다.

In [24]:
finalDf['date'] = pd.to_datetime(finalDf['date'])
finalDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       1038 non-null   datetime64[ns]
 1   security   1038 non-null   object        
 2   ticker     1038 non-null   object        
 3   attribute  1038 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 32.6+ KB


### duplicated()를 사용하여 중복된 데이터를 제거한다.

In [25]:
finalDf.drop_duplicates(['date', 'ticker', 'attribute'], inplace=True)
finalDf.reset_index(drop=True).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       890 non-null    datetime64[ns]
 1   security   890 non-null    object        
 2   ticker     890 non-null    object        
 3   attribute  890 non-null    object        
dtypes: datetime64[ns](1), object(3)
memory usage: 27.9+ KB


### iter()를 사용하여 수상한 데이터를 확인한다.

In [26]:
dupDf=finalDf[finalDf.duplicated(['ticker'], keep=False)].copy().sort_values('date').reset_index(drop=True)
dupDf=dupDf.assign(previous=dupDf.apply(lambda x: dupDf.loc[dupDf['ticker']==x['ticker']]['attribute'].shift(1).loc[x.name], axis=1))
dupDf=dupDf.assign(dup=dupDf.apply(lambda x: 1 if dupDf.loc[dupDf['ticker']==x['ticker']]['attribute'].shift(1).loc[x.name]==x['attribute'] else 0, axis=1))
sw=dupDf[dupDf['dup']==1]['ticker'].values.tolist()
it = iter(sw)

In [27]:
dupDf[dupDf['ticker']==next(it)]

Unnamed: 0,date,security,ticker,attribute,previous,dup
16,2008-09-12,Fastenal,FAST,added ticker,,0
17,2008-09-15,Fastenal,FAST,added ticker,added ticker,1


In [64]:
tg_date = datetime(2010,5,30)
result = finalDf.loc[(finalDf['date']<tg_date)].sort_values(by='date')
finalState = result.drop_duplicates(['ticker'], keep='last')
finalResult = finalState[finalState['attribute']=='added ticker'].reset_index(drop=True)
finalResult

Unnamed: 0,date,security,ticker,attribute
0,1900-01-01,Freeport-McMoRan,FCX,added ticker
1,1900-01-01,Huntington Bancshares,HBAN,added ticker
2,1900-01-01,Eaton Corporation,ETN,added ticker
3,1900-01-01,Moody's Corporation,MCO,added ticker
4,1900-01-01,Dominion Energy,D,added ticker
...,...,...,...,...
329,2010-02-16,Berkshire Hathaway,BRK-B,added ticker
330,2010-02-26,Helmerich & Payne,HP,added ticker
331,2010-03-01,Discovery (Series A),DISCA,added ticker
332,2010-03-15,Oneok,OKE,added ticker


In [29]:
# pd.concat([finalResult, spDf], join='outer', keys='ticker' , axis=1)
checkDf = pd.merge(finalResult, spDf, on='ticker', how='outer')
checkDf[checkDf.isnull().any(axis=1)]

Unnamed: 0,date_x,security_x,ticker,attribute_x,date_y,security_y,attribute_y
267,2007-03-30,Kraft Foods,KFT,added ticker,,,
271,2007-08-24,Leucadia National,LUK,added ticker,,,
274,2007-09-27,Tesoro Corporation,TSO,added ticker,,,
277,2007-10-26,Jacobs Engineering Group,JEC,added ticker,,,
290,2008-09-16,Harris Corporation,HRS,added ticker,,,
309,2009-11-03,Priceline.com,PCLN,added ticker,,,
311,2009-12-18,SAIC,SAIC,added ticker,,,
321,2010-08-26,Tyco International,TYC,added ticker,,,
347,2012-12-21,Delphi Automotive,DLPH,added ticker,,,
361,2013-11-13,Michael Kors,KORS,added ticker,,,


In [30]:
finalDf.loc[finalDf['ticker']=='IQV']

Unnamed: 0,date,security,ticker,attribute
256,2017-08-29,IQVIA,IQV,added ticker


In [34]:
import pandas_datareader.data as pdr
import yfinance as yf

In [60]:
yf.pdr_override()

start_date = '01-06-2010'
end_date = '10-06-2010'

start = datetime.strptime(start_date, '%d-%m-%Y')
end = datetime.strptime(end_date, '%d-%m-%Y')

tickers = finalResult.ticker.values.tolist()

print(tickers[0:10])

tempDf = []
successTickers = []
failTickers = []
for ticker in tickers :
    
    try : 

        temp_data = pdr.get_data_yahoo(ticker, data_source='yahoo', start=start, end=end)

        temp_data['date'] = temp_data.index
        temp_data = temp_data.reset_index(drop=True)
        temp_data = temp_data.rename(columns={'Adj Close': 'price'})
        temp_data['ticker'] = ticker
        tempDf.append(temp_data)
        successTickers.append(ticker)
    except :
        failTickers.append(ticker)

['D', 'STT', 'SBUX', 'GE', 'ED', 'MSI', 'SRE', 'SPGI', 'ROK', 'RTX']
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- BF.B: No dat

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- DLPH: No data found, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [61]:
quaryTicker = pd.concat(tempDf).reset_index(drop=True).ticker.unique().tolist()

In [62]:
quaryDf = pd.DataFrame(quaryTicker, columns=['quary'])
quaryDf
# finalDfTickers = finalDf.ticker
# pd.merge(finalDfTickers, quarySeries, how='outer')

Unnamed: 0,quary
0,D
1,STT
2,SBUX
3,GE
4,ED
...,...
499,NXPI
500,PTC
501,CRL
502,OGN


In [63]:
len(tickers)

517