# Obtaining a list of equities tickers and the corresponding timeseries

In [23]:
! pip install --quiet yfinance

In [77]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
sns.set(rc={'figure.figsize':(10,8)})
import matplotlib.pyplot as plt

from datetime import datetime
import yfinance as yf

In [86]:
start_date = '2024-08-20'
end_date  = '2024-08-28'
dates = '{}_{}'.format(start_date, end_date)

data_dir = 'data/'
dates_dir = data_dir + '/' + dates
stock_dir = dates_dir + '/stocks'
aggregated_dir = dates_dir + '/aggregated'

# order matters: outer directories must be created before nested directories
for dir in [data_dir, dates_dir, stock_dir, aggregated_dir]:
  if not os.path.exists(dir):
    os.mkdir(dir)

## Get list of companies in S&P500

We will obtain and store the list of the constituents from Wikipedia.

## Download price timeseries

Now that we have a list of ticker symbols, we can download the corresponding timeseries from Yahoo Finance.

In [82]:
data    = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#S%26P_500_component_stocks')
table   = data[0]
symbols = list(table.Symbol.values)
symbols.append('SPY')
symbols.sort()

print("{} symbols in total".format(len(symbols)))

504 symbols in total


In [88]:
pd.options.mode.chained_assignment = None  # default='warn'

success_downloads = 0
failed_downloads = []

for symbol in symbols:
  try:
    df = yf.download(symbol, start=start_date, end=end_date)

    if df.empty:
      failed_downloads.append(symbol)
      print('Failed to download {} data'.format(symbol))
      continue

    df = df[['Open', 'Adj Close', 'Volume']]

    df.to_csv(os.path.join(stock_dir, "{}.csv".format(symbol)))
    success_downloads += 1
  except KeyError:
    print('Error for symbol {}'.format(symbol))
    pass

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Failed to download BF.B data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['BRK.B']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


Failed to download BRK.B data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********


Successfully stored 502/504 files





In [104]:
print('\nSuccessfully stored {}/{} files'.format(success_downloads, len(symbols)))


Successfully stored 502/504 files


In [89]:
# Printing a sample dataframe
idx = np.random.randint(len(symbols))
print("History for {}".format(symbols[idx]))
df = pd.read_csv(os.path.join(stock_dir, symbols[idx]+".csv")).set_index('Date')
df.head()

History for ELV


Unnamed: 0_level_0,Open,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-08-20,544.299988,541.411865,510400
2024-08-21,543.450012,540.713989,558700
2024-08-22,543.0,541.69104,549400
2024-08-23,545.0,544.32312,429100
2024-08-26,546.940002,542.997131,358400


## Pre-process Financial Timeseries

In [90]:
# initialize an empty DateTime Index
index = pd.date_range(start=start_date, end=end_date, freq='D')

# initialize empty dataframes
df_price = pd.DataFrame(index=index, columns=symbols)         # adjusted closing prices
df_volume = pd.DataFrame(index=index, columns=symbols)          # stock volumes
df_returns = pd.DataFrame(index=index, columns=symbols)         # daily (percent) returns

In [92]:
# Aggregate all symbols into a price, volume, daily returns dataframes
for symbol in symbols:
    if symbol in failed_downloads:
      continue

    symbol_df = pd.read_csv(os.path.join(stock_dir, symbol+".csv")).set_index('Date')
    symbol_df.index = pd.to_datetime(symbol_df.index)

    adj_close_p = symbol_df['Adj Close']
    open_p = symbol_df['Open']

    df_price[symbol] = adj_close_p
    df_volume[symbol] = symbol_df['Volume']
    df_returns[symbol] = ((adj_close_p - open_p) / open_p) * 100

# calculate percent change
df_price_pct = df_price.pct_change()[1:]

In [93]:
df_price.head()

Unnamed: 0,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2024-08-20,139.75,10.29,226.509995,196.149994,117.379997,110.769997,102.800003,330.369995,562.25,222.614471,...,281.470001,29.912493,76.019997,59.990002,114.580002,133.794418,136.270065,111.639999,344.850006,183.600006
2024-08-21,139.990005,10.4,226.399994,196.529999,117.68,111.389999,104.599998,333.600006,565.789978,226.608749,...,283.679993,30.210524,77.360001,59.959999,113.849998,135.440063,137.066101,112.129997,345.0,182.899994
2024-08-22,140.220001,10.14,224.529999,196.369995,115.449997,112.099998,106.510002,330.570007,557.440002,221.040649,...,285.140015,30.061508,77.089996,59.919998,114.730003,135.061066,136.081009,113.419998,342.160004,182.169998
2024-08-23,140.869995,10.39,226.839996,197.550003,116.849998,112.690002,109.019997,333.269989,558.299988,227.49527,...,281.459991,31.154293,77.370003,60.119999,116.32,136.557083,134.847168,115.050003,351.619995,180.899994
2024-08-24,,,,,,,,,,,...,,,,,,,,,,


In [94]:
df_volume.head()

Unnamed: 0,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2024-08-20,1227700.0,23537400.0,30299000.0,3631800.0,3940000.0,5822400.0,700600.0,1790100.0,1219600.0,3994200.0,...,307900.0,3717400.0,1839300.0,2669500.0,15632000.0,772300.0,1713300.0,958000.0,431300.0,1186400.0
2024-08-21,2263000.0,20233700.0,34765500.0,4250400.0,5028500.0,4081400.0,1013200.0,1233900.0,1401700.0,5595100.0,...,285800.0,3161200.0,2302300.0,1574000.0,11752400.0,1932300.0,1583200.0,826900.0,310700.0,1246800.0
2024-08-22,2634500.0,25624700.0,43695300.0,3676700.0,4875500.0,3690400.0,1144900.0,1679200.0,1627700.0,3708100.0,...,418300.0,3398300.0,1133700.0,1581100.0,10609500.0,898400.0,1393600.0,1164500.0,224900.0,1135800.0
2024-08-23,1735700.0,24640000.0,38677300.0,4720300.0,4998400.0,4511600.0,1514500.0,1518700.0,2023500.0,3208000.0,...,392100.0,3294900.0,1479900.0,2069200.0,10381400.0,693100.0,1657800.0,788400.0,196300.0,1262400.0
2024-08-24,,,,,,,,,,,...,,,,,,,,,,


In [95]:
df_returns.head()

Unnamed: 0,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2024-08-20,-0.604556,-0.193991,0.327763,-0.492087,-1.31159,-0.939012,-0.165095,0.133362,-0.39505,-0.600793,...,0.0,-0.886374,-0.210034,-0.016664,-3.430257,-0.450577,-0.460138,-0.143116,-0.349648,-0.477011
2024-08-21,0.007151,0.289293,-0.05298,0.045814,-0.304982,0.378479,1.356591,0.834243,0.65109,-1.26841,...,0.162415,0.06798,1.058129,0.016678,-1.008611,0.624114,-0.177625,0.116069,-1.058247,-0.74348
2024-08-22,-3.549321,-2.593655,-1.43114,-0.324856,-1.894972,0.277303,1.680193,-0.994342,-1.698204,-2.055718,...,0.670818,-1.113459,-0.349023,-0.133336,0.834949,-0.690393,-1.340523,0.567476,-1.326568,-0.676084
2024-08-23,-0.240784,1.564035,0.522907,0.152093,0.18004,0.258008,1.925951,0.485432,-1.264484,0.728476,...,-1.587416,2.481228,0.781563,-0.083097,1.121447,0.115168,-1.865101,0.921055,2.164623,-1.071862
2024-08-24,,,,,,,,,,,...,,,,,,,,,,


## Obtain Percentage Change

We need to convert prices to percent change in price as opposed to the actual \$ price. This is because stocks with very similar prices can behave very differently and vice-versa.
For e.g., if a stock moves from \$100 to \$110, we want the price column to say 10% (indicating the change).

However, for volume, we will retain magnitude.

In [None]:
df_price_pct.head()

### Removing NaNs

In [98]:
# Let's drop the dates where all the stocks are NaNs, ie., weekends/holidays where no trading occured
for df in [df_price, df_volume, df_returns, df_price_pct]:
  df.dropna(how='all', inplace=True)
  df.dropna(inplace=True, axis=1)
  print(True in pd.isna(df))

assert((df_price.index == df_volume.index).all())
assert((df_volume.index == df_returns.index).all())
assert((df_returns.index == df_price.index).all())

### Storing the cleaned dataframes

In [103]:
df_price.to_csv(os.path.join(aggregated_dir, "prices.csv"), index_label='date')
df_volume.to_csv(os.path.join(aggregated_dir, "volume.csv"), index_label='date')
df_returns.to_csv(os.path.join(aggregated_dir, "percent_return.csv"), index_label='date')
df_price_pct.to_csv(os.path.join(aggregated_dir, "prices_pct.csv"), index_label='date')