# Index Data

Notebook that gathers all the useful data and stores it in a SQLite database (`SP500.db`).

In [5]:
import pandas as pd
import hvplot.pandas
import sqlalchemy
from dotenv import load_dotenv
from pathlib import Path
import requests
import json
import os
import time

# Read the Alpha Vantage API key from a local .env file so it never appears in the notebook.
# NOTE(review): this path is spelled 'data/' while the CSV cells read from 'Data/'; on a
# case-sensitive filesystem only one spelling can exist -- confirm which is intended.
load_dotenv('data/.env')
AVAPI = os.getenv('Alpha_Vantage_API')
if not AVAPI:
    # Without a key the request URLs built below would silently carry "apikey=None".
    import warnings
    warnings.warn("Alpha_Vantage_API is not set; Alpha Vantage requests in this notebook will fail.")

# SQLite file, resolved relative to the current working directory.
database_connection_string = 'sqlite:///SP500.db'
engine = sqlalchemy.create_engine(database_connection_string)

Load DataFrames from the database tables that are built further down in this notebook.

In [6]:
# Read back the table stored under the name 'SP500 CSSO' (written by the commented-out
# `Merged_CSSO.to_sql` cell near the end of the notebook).
# The saved index comes back as an ordinary 'index' column, as the output below shows.
SP500_CSSO = pd.read_sql_table('SP500 CSSO', engine)
SP500_CSSO.head()

Unnamed: 0,index,AAPL,GOOGL,GOOG,MSFT,AMZN,FB,JPM,JNJ,XOM,...,FLS,HRB,UA,PWR,AIZ,NAVI,RRC,PDCO,SIG,CHK
0,2013-06-30,,,,,,,,,,...,,,,,,,,,,
1,2013-09-30,,,,,,,,,,...,,,,,,,,,,
2,2013-12-31,,,,,,,,,,...,,,,,,,,,,
3,2014-03-31,,,,,,,,,,...,,,,,,,,,,
4,2014-06-30,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Read back the per-ticker table stored as 'SP500 Cons Adjusted' (written by the
# commented-out "PULLING ADJUSTED" `merged_df.to_sql` cell further down).
# The saved index comes back as an ordinary 'index' column, as the output below shows.
SP500_Cons_Adj = pd.read_sql_table('SP500 Cons Adjusted', engine)
SP500_Cons_Adj.head()

Unnamed: 0,index,AAPL,GOOGL,GOOG,MSFT,AMZN,FB,JPM,JNJ,XOM,...,UA,PWR,CSRA,AIZ,EVHC,NAVI,RRC,PDCO,SIG,CHK
0,1999-11-01,0.593355,,,29.221068,69.13,,28.189088,30.008308,18.84203,...,,18.060058,,,,,2.425223,7.964985,10.762833,
1,1999-11-02,0.61346,,,29.281175,66.44,,28.232943,29.265952,18.683228,...,,18.020568,,,,,2.581689,7.975846,10.762833,
2,1999-11-03,0.623015,,,29.10402,65.81,,27.811254,29.779891,18.400912,...,,18.514192,,,,,2.425223,7.988518,10.762833,
3,1999-11-04,0.639221,,,29.024933,63.06,,28.378004,30.065412,18.400912,...,,20.982313,,,,,2.38642,7.445451,10.973869,
4,1999-11-05,0.675073,,,28.964827,64.94,,29.096563,30.122517,17.800992,...,,20.857261,,,,,2.308187,7.331407,10.813481,


In [8]:
# Read back the per-ticker table stored as 'SP500 Cons Closing' (written by the
# commented-out "PULLING CLOSE" `merged_df.to_sql` cell further down).
# The saved index comes back as an ordinary 'index' column, as the output below shows.
SP500_Cons_Close = pd.read_sql_table('SP500 Cons Closing', engine)
SP500_Cons_Close.head()

Unnamed: 0,index,AAPL,GOOGL,GOOG,MSFT,AMZN,FB,JPM,JNJ,XOM,...,UA,PWR,CSRA,AIZ,EVHC,NAVI,RRC,PDCO,SIG,CHK
0,1999-11-01,77.62,,,92.37,69.13,,83.56,105.1,74.75,...,,27.44,,,,,3.875,44.0,25.5,
1,1999-11-02,80.25,,,92.56,66.44,,83.69,102.5,74.12,...,,27.38,,,,,4.125,44.06,25.5,
2,1999-11-03,81.5,,,92.0,65.81,,82.44,104.3,73.0,...,,28.13,,,,,3.875,44.13,25.5,
3,1999-11-04,83.62,,,91.75,63.06,,84.12,105.3,73.0,...,,31.88,,,,,3.813,41.13,26.0,
4,1999-11-05,88.31,,,91.56,64.94,,86.25,105.5,70.62,...,,31.69,,,,,3.688,40.5,25.62,


### Import SP500 Closing Data

from https://www.spglobal.com/spdji/en/indices/equity/sp-500/#overview

In [10]:
# Index closing values; the 'Date' column becomes the (parsed) index.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (it now only
# emits a warning) and date parsing infers a consistent format by default.
SP500_Data = pd.read_csv(Path("Data/SP500to2012.csv"), index_col='Date', parse_dates=True)
# Make sure 'Close' is numeric even if the CSV reader inferred another dtype.
SP500_Data = SP500_Data.astype({'Close':'float64'})
SP500_Data.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2012-01-31,1312.41
2012-02-01,1324.09
2012-02-02,1325.54
2012-02-03,1344.9
2012-02-06,1344.33


### Import SP500 Changes

from: https://analyzingalpha.com/sp500-historical-components-and-changes

In [11]:
# Index membership changes (ticker, name, action, cik), indexed by the parsed 'date' column.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (it now only
# emits a warning) and date parsing infers a consistent format by default.
SP500_Changes = pd.read_csv(Path("Data/SP500_changes.csv"), index_col='date', parse_dates=True)
SP500_Changes.head()

Unnamed: 0_level_0,Unnamed: 0,ticker,name,action,cik
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-09-08,112,CHTR,Charter Communications,added,
2016-09-08,112,EMC,EMC Corporation,removed,
2016-09-06,113,MTD,Mettler Toledo,added,
2016-09-06,113,TYC,Tyco International,removed,
2012-09-05,208,LYB,LyondellBasell,added,


### Import ETF AUM DATA

In [12]:
# One row per date with columns SPY, IVV, VOO and SP500; 'Date' becomes the (parsed) index.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (it now only
# emits a warning) and date parsing infers a consistent format by default.
ETF_Data = pd.read_csv(Path("Data/ETFAUM.csv"), index_col='Date', parse_dates=True)
# Force every column to float64 so later arithmetic does not depend on CSV dtype inference.
ETF_Data = ETF_Data.astype({'SPY': 'float64', 'IVV': 'float64', 'VOO': 'float64', 'SP500': 'float64'})
ETF_Data

Unnamed: 0_level_0,SPY,IVV,VOO,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-12-31,455.22,334.71,279.84,40360.0
2020-12-31,329.02,238.85,176.98,31660.0
2019-12-31,307.39,201.41,130.18,26760.0
2018-12-31,243.39,149.37,89.4,21030.0
2017-12-29,277.54,141.49,83.68,22820.0
2016-12-30,224.82,90.6,56.49,19270.0
2015-12-31,182.04,70.35,40.62,17960.0
2014-12-31,215.91,69.69,27.76,18250.0
2013-12-31,174.85,53.7,14.82,16490.0
2012-12-31,123.0,34.91,6.45,12740.0


### Import Constituent Info
Primarily used for the tickers; everything else is out of date.

From: https://datahub.io/core/s-and-p-500-companies-financials#resource-constituents-financials

In [13]:
# Load the constituent snapshot, keyed by ticker symbol.
SP500_Constituents_details = pd.read_csv(Path("Data/constituents_financials.csv"), index_col='Symbol')
# Largest market cap first. This ordering fixes each ticker's position in SP500_List,
# which the numeric start/target indexes of the download loops below refer to.
SP500_Constituents_details = SP500_Constituents_details.sort_values(by=['Market Cap'], ascending=False)
SP500_Constituents_details.head()

Unnamed: 0_level_0,Name,Sector,Price,Price/Earnings,Dividend Yield,Earnings/Share,52 Week Low,52 Week High,Market Cap,EBITDA,Price/Sales,Price/Book,SEC Filings
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAPL,Apple Inc.,Information Technology,155.15,16.86,1.579541,9.2,180.1,131.12,809508000000.0,79386000000.0,3.458609,5.66,http://www.sec.gov/cgi-bin/browse-edgar?action...
GOOGL,Alphabet Inc Class A,Information Technology,1007.71,31.48,0.0,22.27,1198.0,824.3,733824000000.0,34217000000.0,6.801692,4.7,http://www.sec.gov/cgi-bin/browse-edgar?action...
GOOG,Alphabet Inc Class C,Information Technology,1001.52,40.29,0.0,22.27,1186.89,803.1903,728536000000.0,32714000000.0,6.772653,4.67,http://www.sec.gov/cgi-bin/browse-edgar?action...
MSFT,Microsoft Corp.,Information Technology,85.01,25.76,1.874791,2.97,96.07,63.22,689978000000.0,41079000000.0,7.113097,9.49,http://www.sec.gov/cgi-bin/browse-edgar?action...
AMZN,Amazon.com Inc,Consumer Discretionary,1350.5,296.16,0.0,6.16,1498.0,812.5,685873000000.0,16132000000.0,3.927053,24.28,http://www.sec.gov/cgi-bin/browse-edgar?action...


In [16]:
# Ticker symbols, in the row order of SP500_Constituents_details.
SP500_List = SP500_Constituents_details.index.tolist()
# Display is commented out because the list is long
#SP500_List

In [17]:
# Persist the ticker list. `if_exists='replace'` keeps this cell re-runnable: with the
# default ('fail') to_sql raises ValueError as soon as the 'SP500_Cons' table already exists.
pd.DataFrame(SP500_List).to_sql('SP500_Cons', engine, if_exists='replace')

### Import an Individual Stock Ticker

from: Alpha Vantage API https://www.alphavantage.co/documentation/#dailyadj

In [49]:
# This is for Non-Premium
# url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=IBM&outputsize=full&apikey={AVAPI}'
# r = requests.get(url)
# data = r.json()
# COMMENTED OUT because it's really long
# print(json.dumps(data, indent=4, sort_keys=True))

# With Premium
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol=IBM&outputsize=full&apikey={AVAPI}'
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(url, timeout=30)
# Surface HTTP-level failures here instead of as a confusing JSON/KeyError later.
r.raise_for_status()
data = r.json()
# COMMENTED OUT because it's really long
# print(json.dumps(data, indent=4, sort_keys=True))

In [50]:
# orient="index" turns each key of the 'Time Series (Daily)' mapping (a date) into a row
# and each field of the nested mapping into a column.
df=pd.DataFrame.from_dict(data['Time Series (Daily)'], orient="index")
df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient
2022-02-09,137.84,138.35,136.83,137.79,137.79,5387672,0.0,1.0
2022-02-08,137.23,137.52,135.78,137.02,137.02,4181825,0.0,1.0
2022-02-07,137.45,137.8223,136.27,137.24,137.24,3759046,0.0,1.0
2022-02-04,137.86,138.82,136.215,137.15,137.15,4142045,0.0,1.0
2022-02-03,137.0,138.76,135.831,137.78,137.78,6100777,0.0,1.0


### Aggregate Constituent DF

Create the table with AAPL, then merge in the remaining tickers with a loop.

from: Alpha Vantage API https://www.alphavantage.co/documentation/#dailyadj

In [70]:
## PULLING ADJUSTED PRICE

def BuildStockColumns(stock):
    """Download the full daily adjusted price history of one ticker from Alpha Vantage.

    Parameters
    ----------
    stock : str
        Ticker symbol, sent as the ``symbol`` query parameter.

    Returns
    -------
    pandas.DataFrame
        Indexed by the date strings returned by the API (in the order returned), with
        float64 columns ``close``, ``adjusted``, ``dividend`` and ``split``. Dividends
        and splits are kept in case they are needed later.

    Raises
    ------
    requests.HTTPError
        If the HTTP response carries an error status.
    KeyError
        If the payload has no ``'Time Series (Daily)'`` entry (for example an API error
        or rate-limit message) or lacks one of the expected price fields.
    """
    # Let requests build the query string so the symbol is URL-encoded for us.
    params = {
        'function': 'TIME_SERIES_DAILY_ADJUSTED',
        'symbol': stock,
        'outputsize': 'full',
        'apikey': AVAPI,
    }
    # The timeout keeps one stalled request from blocking a long download loop.
    response = requests.get('https://www.alphavantage.co/query', params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Error payloads come back as JSON without the series key; report what the API said
    # instead of failing with a bare KeyError on the series name.
    if 'Time Series (Daily)' not in data:
        detail = data.get('Error Message') or data.get('Note') or data.get('Information') or list(data)
        raise KeyError(f'No daily time series returned for {stock}: {detail}')
    df = pd.DataFrame.from_dict(data['Time Series (Daily)'], orient="index")

    # Keep only the fields we use, under short names (open/high/low/volume are discarded).
    wanted = {
        '4. close': 'close',
        '5. adjusted close': 'adjusted',
        '7. dividend amount': 'dividend',
        '8. split coefficient': 'split',
    }
    df = df[list(wanted)].rename(columns=wanted)

    # The API delivers every value as a string; turn them into floats.
    return df.astype('float64')

def BuildDBColumnADJ(stock):
    """Return one ticker's adjusted prices as a single column named after the ticker.

    The frame comes from BuildStockColumns; its 'adjusted' column is renamed to
    `stock` and the 'close', 'dividend' and 'split' columns are removed.
    """
    unused = ['close', 'dividend', 'split']
    return (
        BuildStockColumns(stock)
        .rename(columns={'adjusted': stock})
        .drop(columns=unused)
    )

def BuildDBColumnCLOSE(stock):
    """Return one ticker's closing prices as a single column named after the ticker.

    The frame comes from BuildStockColumns; its 'close' column is renamed to
    `stock` and the 'adjusted', 'dividend' and 'split' columns are removed.
    """
    unused = ['adjusted', 'dividend', 'split']
    return (
        BuildStockColumns(stock)
        .rename(columns={'close': stock})
        .drop(columns=unused)
    )
    

AAPL_df= BuildDBColumnCLOSE('AAPL')

In [71]:
AAPL_df

Unnamed: 0,AAPL
2022-02-09,176.28
2022-02-08,174.83
2022-02-07,171.66
2022-02-04,172.39
2022-02-03,172.90
...,...
1999-11-05,88.31
1999-11-04,83.62
1999-11-03,81.50
1999-11-02,80.25


This is all commented out because it only needs to be done to build the DB.

In [None]:
# merged_df = AAPL_df

In [79]:
# Resume position and exclusive stop position within SP500_List for the
# (commented-out) download loops below, which run `while SP500_index < target_index`.
SP500_index = 100
target_index = 505
# NOTE(review): in the notebook as shown, merged_df is only assigned in commented-out
# code, so on a fresh kernel this line fails unless the seeding cell above is re-enabled.
merged_df.head()

Unnamed: 0,AAPL,GOOGL,GOOG,MSFT,AMZN,FB,JPM,JNJ,XOM,BAC,...,GD,CL,GM,ANTM,EOG,AMT,AET,RTN,NOC,SYK
1999-11-01,77.62,,,92.37,69.13,,83.56,105.1,74.75,64.87,...,55.0,61.81,,,21.0,18.94,51.94,29.06,54.81,62.44
1999-11-02,80.25,,,92.56,66.44,,83.69,102.5,74.12,64.25,...,54.25,59.94,,,19.25,19.38,55.0,28.94,54.75,64.0
1999-11-03,81.5,,,92.0,65.81,,82.44,104.3,73.0,63.0,...,55.56,57.75,,,19.69,19.75,54.0,28.0,55.0,65.56
1999-11-04,83.62,,,91.75,63.06,,84.12,105.3,73.0,63.5,...,54.0,57.06,,,18.94,19.94,52.25,27.5,55.0,67.13
1999-11-05,88.31,,,91.56,64.94,,86.25,105.5,70.62,65.06,...,54.13,57.63,,,18.31,20.75,53.44,27.81,54.69,66.13


### Pull all SP500 stocks
This iterates through all the stocks in the list of SP500 stocks and calls the API function.

It prints which stock it is trying to pull and the time, so progress can be monitored. When an error occurs,
the index above can be adjusted and the process restarted, skipping the erroneous stock.

from: Alpha Vantage API https://www.alphavantage.co/documentation/#dailyadj

In [81]:
# # PULLING ADJUSTED
# # Tickers whose request or parsing fails are reported and skipped

# NOTE: `except Exception` (rather than a bare `except:`) lets a kernel interrupt stop this long loop.
# while SP500_index < target_index:
#     index = SP500_index
#     SP500_index+=1
#     print(time.ctime())
#     print(f'Index: {index}, stock: {SP500_List[index]}')
#     try:
#         index_df = BuildDBColumnADJ(SP500_List[index])
#     except Exception:
#         print(f'Skipped: {SP500_List[index]}')
#         continue
#     # Outer merge on the date index keeps every date seen for any ticker
#     merged_df = pd.merge(merged_df, index_df, how='outer', left_index=True, right_index=True)
#     # Not Needed with Premium
#     # time.sleep(20)

In [87]:
# # PULLING CLOSING

# NOTE: `except Exception` (rather than a bare `except:`) lets a kernel interrupt stop this long loop.
# while SP500_index < target_index:
#     index = SP500_index
#     SP500_index+=1
#     print(time.ctime())
#     print(f'Index: {index}, stock: {SP500_List[index]}')
#     try:
#         index_df = BuildDBColumnCLOSE(SP500_List[index])
#     except Exception:
#         print(f'Skipped: {SP500_List[index]}')
#         continue
#     # Outer merge on the date index keeps every date seen for any ticker
#     merged_df = pd.merge(merged_df, index_df, how='outer', left_index=True, right_index=True)
#     # Not Needed with Premium
#     # time.sleep(20)

This is a record of all the stocks that did not return properly.

Missed Stocks: AGN(99), BBT(138), APC(179), RHT(235), COL(239), CXO(241), MYL(247), WLTW(262), CBS(265), LB(343), COG(406), COG(416), VAR(420), TMK(436), BF.B(491), SCG(496)

In [82]:
# missing_stocks = [99,138,179,235,239,241,247,262,343,406,416,420,436,491,496]
merged_df

Unnamed: 0,AAPL,GOOGL,GOOG,MSFT,AMZN,FB,JPM,JNJ,XOM,BAC,...,UA,PWR,CSRA,AIZ,EVHC,NAVI,RRC,PDCO,SIG,CHK
1999-11-01,77.62,,,92.37,69.13,,83.56,105.10,74.75,64.87,...,,27.44,,,,,3.875,44.00,25.50,
1999-11-02,80.25,,,92.56,66.44,,83.69,102.50,74.12,64.25,...,,27.38,,,,,4.125,44.06,25.50,
1999-11-03,81.50,,,92.00,65.81,,82.44,104.30,73.00,63.00,...,,28.13,,,,,3.875,44.13,25.50,
1999-11-04,83.62,,,91.75,63.06,,84.12,105.30,73.00,63.50,...,,31.88,,,,,3.813,41.13,26.00,
1999-11-05,88.31,,,91.56,64.94,,86.25,105.50,70.62,65.06,...,,31.69,,,,,3.688,40.50,25.62,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-02-03,172.90,2861.80,2853.01,301.25,2776.91,237.76,148.70,172.76,79.68,46.43,...,16.16,101.47,,151.80,,17.57,20.450,27.93,81.80,67.69
2022-02-04,172.39,2865.86,2860.32,305.94,3152.79,237.09,152.56,171.63,81.41,48.28,...,16.43,99.95,,153.07,,17.82,20.380,27.97,84.19,66.98
2022-02-07,171.66,2784.02,2778.76,300.95,3158.71,224.91,153.07,171.06,82.39,48.51,...,16.58,98.87,,152.74,,18.06,19.960,27.50,82.32,66.79
2022-02-08,174.83,2787.98,2784.26,304.56,3228.27,220.18,155.95,171.51,80.26,49.38,...,16.94,102.25,,156.04,,18.51,19.060,28.23,84.20,65.72


In [12]:
# Used to initialize the table  DO NOT OVERWRITE IT!!!!
# PULLING ADJUSTED
# merged_df.to_sql('SP500 Cons Adjusted', engine)

In [83]:
# Used to initialize the table  DO NOT OVERWRITE IT!!!!
# PULLING CLOSE
# merged_df.to_sql('SP500 Cons Closing', engine)

In [18]:
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list the tables in the database.
sqlalchemy.inspect(engine).get_table_names()

  """Entry point for launching an IPython kernel.


['SP500 CSSO', 'SP500 Cons Adjusted', 'SP500 Cons Closing', 'SP500_Cons']

### Pull Market Capitalization Data

from: Alpha Vantage API https://www.alphavantage.co/documentation/#dailyadj

In [14]:
def BuildCSSOShares(stock):
    """Fetch the quarterly common-shares-outstanding figures of one ticker from Alpha Vantage.

    Parameters
    ----------
    stock : str
        Ticker symbol, sent as the ``symbol`` query parameter of the BALANCE_SHEET call.

    Returns
    -------
    pandas.DataFrame
        One column named after `stock`, indexed by each quarterly report's
        ``fiscalDateEnding`` string. Values are stored exactly as the API returned
        them (no numeric conversion is done here). If the API returns no quarterly
        reports the frame is empty but still has the `stock` column.

    Raises
    ------
    requests.HTTPError
        If the HTTP response carries an error status.
    KeyError
        If the payload has no ``'quarterlyReports'`` entry (for example an API error
        or rate-limit message) or a report lacks one of the two fields read here.
    """
    # Let requests build the query string so the symbol is URL-encoded for us.
    params = {'function': 'BALANCE_SHEET', 'symbol': stock, 'apikey': AVAPI}
    # The timeout keeps one stalled request from blocking a long download loop.
    response = requests.get('https://www.alphavantage.co/query', params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    # print(json.dumps(data, indent=4, sort_keys=True))

    # Error payloads come back as JSON without the reports key; report what the API said.
    if 'quarterlyReports' not in data:
        detail = data.get('Error Message') or data.get('Note') or data.get('Information') or list(data)
        raise KeyError(f'No quarterly reports returned for {stock}: {detail}')

    # fiscal period end date -> shares outstanding (a later duplicate date overwrites an earlier one)
    shares_by_date = {
        qrpt['fiscalDateEnding']: qrpt['commonStockSharesOutstanding']
        for qrpt in data['quarterlyReports']
    }
    # Naming the column up front guarantees it exists even when there are no reports.
    return pd.DataFrame.from_dict(shares_by_date, orient='index', columns=[stock])

Create the initial Dataframe to merge the rest of the stocks.

In [15]:
# Dangerous to activate, resets merged CSSO
# Merged_CSSO = BuildCSSOShares('AAPL')

In [11]:
# Resume position and exclusive stop position within SP500_List for the
# (commented-out) shares-outstanding loop below, which runs `while CSSO_index < CSSO_target_index`.
CSSO_index = 100
CSSO_target_index = 505
# NOTE(review): Merged_CSSO is only assigned in the commented-out seeding cell above, so on a
# fresh kernel this line raises NameError (as in the recorded output) unless that cell is re-enabled.
Merged_CSSO.head()

NameError: name 'Merged_CSSO' is not defined

In [33]:
# Commented out, only needs to be done once

# NOTE: `except Exception` (rather than a bare `except:`) lets a kernel interrupt stop this long loop.
# while CSSO_index < CSSO_target_index:
#     index = CSSO_index
#     CSSO_index+=1
#     print(time.ctime())
#     print(f'Index: {index}, stock: {SP500_List[index]}')
#     try:
#         index_df = BuildCSSOShares(SP500_List[index])
#     except Exception:
#         print(f'Skipped: {SP500_List[index]}')
#         continue
#     # Outer merge on the date index keeps every fiscal date seen for any ticker
#     Merged_CSSO = pd.merge(Merged_CSSO, index_df, how='outer', left_index=True, right_index=True)
#     # Not Needed with Premium
#     # time.sleep(20)

Tue Feb  8 15:58:58 2022
Index: 100, stock: BK
Tue Feb  8 15:58:58 2022
Index: 101, stock: ITW
Tue Feb  8 15:58:59 2022
Index: 102, stock: CME
Tue Feb  8 15:58:59 2022
Index: 103, stock: AIG
Tue Feb  8 15:59:00 2022
Index: 104, stock: OXY
Tue Feb  8 15:59:00 2022
Index: 105, stock: MON
Tue Feb  8 15:59:01 2022
Index: 106, stock: ATVI
Tue Feb  8 15:59:03 2022
Index: 107, stock: DE
Tue Feb  8 15:59:06 2022
Index: 108, stock: DUK
Tue Feb  8 15:59:09 2022
Index: 109, stock: AMAT
Tue Feb  8 15:59:12 2022
Index: 110, stock: BDX
Tue Feb  8 15:59:14 2022
Index: 111, stock: MAR
Tue Feb  8 15:59:16 2022
Index: 112, stock: ADP
Tue Feb  8 15:59:17 2022
Index: 113, stock: EL
Tue Feb  8 15:59:19 2022
Index: 114, stock: CCL
Tue Feb  8 15:59:23 2022
Index: 115, stock: MET
Tue Feb  8 15:59:24 2022
Index: 116, stock: MU
Tue Feb  8 15:59:25 2022
Index: 117, stock: TJX
Tue Feb  8 15:59:25 2022
Index: 118, stock: SPG
Tue Feb  8 15:59:26 2022
Index: 119, stock: PSX
Tue Feb  8 15:59:27 2022
Index: 120, stock

In [37]:
# Add to DB, CAREFUL
# Merged_CSSO.to_sql('SP500 CSSO', engine)

In [20]:
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list the tables in the database.
sqlalchemy.inspect(engine).get_table_names()

  """Entry point for launching an IPython kernel.


['SP500 CSSO',
 'SP500 Cons Adjusted',
 'SP500 Cons Closing',
 'SP500_Cons',
 'SP500_MC']