# Obtaining a list of equities tickers and the corresponding timeseries

In [1]:
! pip install --quiet --upgrade yfinance
# yfinance is already installed in our env virtual environment

In [21]:
import os
import numpy as np
import pandas as pd
import yfinance as yf
import time as time 

In [3]:
start_date = '2023-09-01'
end_date  = '2024-09-01'
dates = '{}_{}'.format(start_date, end_date)

data_dir = 'data/'
dates_dir = data_dir + '/' + dates
stock_dir = dates_dir + '/stocks'
aggregated_dir = dates_dir + '/aggregated'

# order matters: outer directories must be created before nested directories
for dir in [data_dir, dates_dir, stock_dir, aggregated_dir]:
  if not os.path.exists(dir):
    os.mkdir(dir)

## Get list of companies in S&P500

We will obtain and store the list of the constituents from Wikipedia.

## Download price timeseries

Now that we have a list of ticker symbols, we can download the corresponding timeseries from Yahoo Finance.

In [4]:
data    = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#S%26P_500_component_stocks')
table   = data[0]
symbols = list(table.Symbol.values)
symbols.append('SPY')
symbols.sort()

print("{} symbols in total".format(len(symbols)))

504 symbols in total


In [5]:
pd.options.mode.chained_assignment = None  # default='warn'

success_downloads = 0
failed_downloads = []

for symbol in symbols:
  try:
    df = yf.download(symbol, start=start_date, end=end_date)

    if df.empty:
      failed_downloads.append(symbol)
      print('Failed to download {} data'.format(symbol))
      continue

    # df = df[['Open', 'Adj Close', 'Volume']]
    
    # for multiindex columns
    # Extract specific columns for Ticker "A"
    filtered_df = df.loc[:, [("Open", symbol), ("Adj Close", symbol), ("Volume", symbol)]]
    filtered_df.columns = ["Open", "Adj Close", "Volume"]  # Simplify column names

    filtered_df.to_csv(os.path.join(stock_dir, "{}.csv".format(symbol)))
    success_downloads += 1
  except KeyError:
    print('Error for symbol {}'.format(symbol))
    pass
  time.sleep(0.5) # avoid rate limiting by Yahoo Finance API, can likely decrease this waiting period

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Failed to download AMTM data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Failed to download BF.B data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['BRK.B']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


Failed to download BRK.B data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Error getting sector for symbol RSG


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['RTX']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download RTX data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['RVTY']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download RVTY data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SBAC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SBAC data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SBUX']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SBUX data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SCHW']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SCHW data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SHW']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SHW data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SJM']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SJM data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SLB']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SLB data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SMCI']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SMCI data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SNA']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SNA data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SNPS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SNPS data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SO']: J%ticker%NDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SO data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SOLV']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SOLV data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SPG']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SPG data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SPGI']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SPGI data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SPY']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SPY data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SRE']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SRE data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['STE']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download STE data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['STLD']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download STLD data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['STT']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download STT data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['STX']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download STX data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['STZ']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download STZ data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SW']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SW data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SWK']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SWK data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SWKS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SWKS data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SYF']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SYF data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SYK']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SYK data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SYY']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download SYY data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['T']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download T data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TAP']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TAP data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TDG']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TDG data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TDY']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TDY data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TECH']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TECH data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TEL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TEL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TER']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TER data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TFC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TFC data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TFX']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TFX data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TGT']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TGT data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TJX']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TJX data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TMO']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TMO data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TMUS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TMUS data


Failed to get ticker 'TPL' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TPL']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


Failed to download TPL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TPR']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TPR data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TRGP']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TRGP data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TRMB']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TRMB data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TROW']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TROW data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TRV']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TRV data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TSCO']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TSCO data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TSLA']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TSLA data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TSN']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TSN data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TT']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TT data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TTWO']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TTWO data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TXN']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TXN data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TXT']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TXT data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TYL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download TYL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UAL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UAL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UBER']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UBER data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UDR']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UDR data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UHS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UHS data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ULTA']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download ULTA data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UNH']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UNH data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UNP']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UNP data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UPS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download UPS data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['URI']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download URI data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['USB']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download USB data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['V']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download V data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VICI']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VICI data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VLO']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VLO data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VLTO']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VLTO data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VMC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VMC data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VRSK']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VRSK data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VRSN']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VRSN data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VRTX']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VRTX data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VST']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VST data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VTR']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VTR data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VTRS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VTRS data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VZ']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download VZ data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WAB']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WAB data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WAT']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WAT data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WBA']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WBA data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WBD']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WBD data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WDC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WDC data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WEC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WEC data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WELL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WELL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WFC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WFC data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WM']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WM data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WMB']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WMB data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WMT']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WMT data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WRB']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WRB data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WST']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WST data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WTW']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WTW data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WY']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WY data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['WYNN']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download WYNN data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['XEL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download XEL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['XOM']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download XOM data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['XYL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download XYL data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['YUM']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download YUM data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZBH']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download ZBH data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZBRA']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download ZBRA data


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZTS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


Failed to download ZTS data


In [23]:
# get the sector of each stock symbol
sectors = pd.DataFrame(columns=['Symbol', 'Sector'])
for symbol in symbols:
  try: 
    stock = yf.Ticker(symbol)
    sector = stock.info.get('sector')
    new_row = pd.DataFrame([{"Symbol": symbol, "Sector": sector}])
    sectors = pd.concat([sectors, new_row], ignore_index=True)
  except Exception as e:
    print('Error getting sector for symbol {}'.format(symbol))
    pass
  time.sleep(0.5) # avoid rate limiting by Yahoo Finance API, can likely decrease this waiting period
  
sector_dir = 'data_sectors/'
if not os.path.exists(sector_dir):
  os.mkdir(sector_dir)
sectors.to_csv(os.path.join(sector_dir, 'sectors.csv'))

In [6]:
print('\nSuccessfully stored {}/{} files'.format(success_downloads, len(symbols)))


Successfully stored 401/504 files


In [7]:
# Printing a sample dataframe
idx = np.random.randint(len(symbols))
print("History for {}".format(symbols[idx]))
df = pd.read_csv(os.path.join(stock_dir, symbols[idx]+".csv")).set_index('Date')
df.head()

History for EXC


Unnamed: 0_level_0,Open,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-09-01,40.349998,38.033531,3931300
2023-09-05,39.990002,37.577011,3584100
2023-09-06,39.560001,37.909889,5607400
2023-09-07,40.23,38.585152,4917900
2023-09-08,40.66,38.594666,4545800


## Pre-process Financial Timeseries

In [8]:
# initialize an empty DateTime Index
index = pd.date_range(start=start_date, end=end_date, freq='D')

# initialize empty dataframes
df_price = pd.DataFrame(index=index, columns=symbols)         # adjusted closing prices
df_volume = pd.DataFrame(index=index, columns=symbols)          # stock volumes
df_returns = pd.DataFrame(index=index, columns=symbols)         # daily (percent) returns

In [9]:
# Aggregate all symbols into a price, volume, daily returns dataframes
for symbol in symbols:
    if symbol in failed_downloads:
      continue

    symbol_df = pd.read_csv(os.path.join(stock_dir, symbol+".csv")).set_index('Date')
    symbol_df.index = pd.to_datetime(symbol_df.index)

    adj_close_p = symbol_df['Adj Close']
    open_p = symbol_df['Open']

    df_price[symbol] = adj_close_p
    df_volume[symbol] = symbol_df['Volume']
    df_returns[symbol] = ((adj_close_p - open_p) / open_p) * 100

# calculate percent change
df_price_pct = df_price.pct_change()[1:]

  df_price_pct = df_price.pct_change()[1:]
  df_price_pct = df_price.pct_change()[1:]


In [10]:
df_price.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2023-09-01,120.845276,188.2883,141.556671,132.690002,100.282532,73.228699,321.222534,563.210022,179.301651,76.536621,...,,,,,,,,,,
2023-09-02,,,,,,,,,,,...,,,,,,,,,,
2023-09-03,,,,,,,,,,,...,,,,,,,,,,
2023-09-04,,,,,,,,,,,...,,,,,,,,,,
2023-09-05,117.693047,188.52681,139.550812,142.289993,98.36171,72.135162,319.67395,564.880005,178.42691,75.663719,...,,,,,,,,,,


In [11]:
df_volume.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2023-09-01,840700.0,45732600.0,3357800.0,4418300.0,2641100.0,880200.0,1732700.0,2232300.0,1879000.0,3572100.0,...,,,,,,,,,,
2023-09-02,,,,,,,,,,,...,,,,,,,,,,
2023-09-03,,,,,,,,,,,...,,,,,,,,,,
2023-09-04,,,,,,,,,,,...,,,,,,,,,,
2023-09-05,1270600.0,45280000.0,3911100.0,21052800.0,4284000.0,1285200.0,1736600.0,2349700.0,2488000.0,3243800.0,...,,,,,,,,,,


In [12]:
df_returns.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2023-09-01,-1.503567,-0.634179,-3.938199,-0.755418,-3.014961,-5.413717,-1.480587,-0.28151,-1.806328,-4.137494,...,,,,,,,,,,
2023-09-02,,,,,,,,,,,...,,,,,,,,,,
2023-09-03,,,,,,,,,,,...,,,,,,,,,,
2023-09-04,,,,,,,,,,,...,,,,,,,,,,
2023-09-05,-3.244782,0.131087,-6.026389,0.807651,-4.298783,-6.135115,-2.46111,1.25475,-1.649811,-4.873374,...,,,,,,,,,,


## Obtain Percentage Change

We need to convert prices to percent change in price as opposed to the actual \$ price. This is because stocks with very similar prices can behave very differently and vice-versa.
For e.g., if a stock moves from \$100 to \$110, we want the price column to say 10% (indicating the change).

However, for volume, we will retain magnitude.

In [13]:
df_price_pct.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2023-09-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2023-09-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2023-09-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2023-09-05,-0.026085,0.001267,-0.01417,0.072349,-0.019154,-0.014933,-0.004821,0.002965,-0.004879,-0.011405,...,,,,,,,,,,
2023-09-06,-0.006654,-0.035793,-0.003354,-0.003654,0.006741,0.007646,0.000429,-0.005205,0.003525,-0.018002,...,,,,,,,,,,


### Removing NaNs

In [14]:
# Let's drop the dates where all the stocks are NaNs, ie., weekends/holidays where no trading occured
for df in [df_price, df_volume, df_returns, df_price_pct]:
  df.dropna(how='all', inplace=True)
  df.dropna(inplace=True, axis=1)
  print(True in pd.isna(df))

assert((df_price.index == df_volume.index).all())
assert((df_volume.index == df_returns.index).all())
assert((df_returns.index == df_price.index).all())

False
False
False
False


### Storing the cleaned dataframes

In [15]:
df_price.to_csv(os.path.join(aggregated_dir, "prices.csv"), index_label='date')
df_volume.to_csv(os.path.join(aggregated_dir, "volume.csv"), index_label='date')
df_returns.to_csv(os.path.join(aggregated_dir, "percent_return.csv"), index_label='date')
df_price_pct.to_csv(os.path.join(aggregated_dir, "prices_pct.csv"), index_label='date')