# Obtaining a list of equities tickers and the corresponding timeseries

In [39]:
! pip install --quiet --upgrade yfinance
# yfinance is already installed in our env virtual environment


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\rajra\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [40]:
import os
import numpy as np
import pandas as pd
import yfinance as yf
import time as time 

In [41]:
start_date = '2020-09-01'
end_date  = '2024-09-01'
dates = '{}_{}'.format(start_date, end_date)

data_dir = 'data/'
dates_dir = data_dir + '/' + dates
stock_dir = dates_dir + '/stocks'
aggregated_dir = dates_dir + '/aggregated'

# order matters: outer directories must be created before nested directories
for dir in [data_dir, dates_dir, stock_dir, aggregated_dir]:
  if not os.path.exists(dir):
    os.mkdir(dir)

## Get list of companies in S&P500

We will obtain and store the list of the constituents from Wikipedia.

## Download price timeseries

Now that we have a list of ticker symbols, we can download the corresponding timeseries from Yahoo Finance.

In [42]:
data    = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#S%26P_500_component_stocks')
table   = data[0]
symbols = list(table.Symbol.values)
symbols.append('SPY')
symbols.sort()

print("{} symbols in total".format(len(symbols)))

504 symbols in total


In [43]:
pd.options.mode.chained_assignment = None  # default='warn'

success_downloads = 0
failed_downloads = []

for symbol in symbols:
  try:
    df = yf.download(symbol, start=start_date, end=end_date)

    if df.empty:
      failed_downloads.append(symbol)
      print('Failed to download {} data'.format(symbol))
      continue

    # df = df[['Open', 'Adj Close', 'Volume']]
    
    # for multiindex columns
    # Extract specific columns for Ticker "A"
    filtered_df = df.loc[:, [("Open", symbol), ("Adj Close", symbol), ("Volume", symbol)]]
    filtered_df.columns = ["Open", "Adj Close", "Volume"]  # Simplify column names

    filtered_df.to_csv(os.path.join(stock_dir, "{}.csv".format(symbol)))
    success_downloads += 1
  except KeyError:
    print('Error for symbol {}'.format(symbol))
    pass
  time.sleep(0.5) # avoid rate limiting by Yahoo Finance API, can likely decrease this waiting period

[*********************100%***********************]  1 of 1 completed

1 Failed download:
['A']: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)"))


Failed to download A data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Failed to download AMTM data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Failed to download BF.B data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['BRK.B']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


Failed to download BRK.B data


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [44]:
# get the sector of each stock symbol
sectors = pd.DataFrame(columns=['Symbol', 'Sector'])
for symbol in symbols:
  try: 
    stock = yf.Ticker(symbol)
    sector = stock.info.get('sector')
    new_row = pd.DataFrame([{"Symbol": symbol, "Sector": sector}])
    sectors = pd.concat([sectors, new_row], ignore_index=True)
  except Exception as e:
    print('Error getting sector for symbol {}'.format(symbol))
    pass
  time.sleep(0.5) # avoid rate limiting by Yahoo Finance API, can likely decrease this waiting period
  
sector_dir = 'data_sectors/'
if not os.path.exists(sector_dir):
  os.mkdir(sector_dir)
sectors.to_csv(os.path.join(sector_dir, 'sectors.csv'))

In [45]:
print('\nSuccessfully stored {}/{} files'.format(success_downloads, len(symbols)))


Successfully stored 500/504 files


In [46]:
# Printing a sample dataframe
idx = np.random.randint(len(symbols))
print("History for {}".format(symbols[idx]))
df = pd.read_csv(os.path.join(stock_dir, symbols[idx]+".csv")).set_index('Date')
df.head()

History for CTSH


Unnamed: 0_level_0,Open,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-09-01,66.260002,62.805145,1704600
2020-09-02,67.449997,64.540085,2710900
2020-09-03,68.669998,62.945816,3366700
2020-09-04,67.400002,62.655087,2544900
2020-09-08,65.449997,61.520325,3102600


## Pre-process Financial Timeseries

In [47]:
# initialize an empty DateTime Index
index = pd.date_range(start=start_date, end=end_date, freq='D')

# initialize empty dataframes
df_price = pd.DataFrame(index=index, columns=symbols)         # adjusted closing prices
df_volume = pd.DataFrame(index=index, columns=symbols)          # stock volumes
df_returns = pd.DataFrame(index=index, columns=symbols)         # daily (percent) returns

In [48]:
# Aggregate all symbols into a price, volume, daily returns dataframes
for symbol in symbols:
    if symbol in failed_downloads:
      continue

    symbol_df = pd.read_csv(os.path.join(stock_dir, symbol+".csv")).set_index('Date')
    symbol_df.index = pd.to_datetime(symbol_df.index)

    adj_close_p = symbol_df['Adj Close']
    open_p = symbol_df['Open']

    df_price[symbol] = adj_close_p
    df_volume[symbol] = symbol_df['Volume']
    df_returns[symbol] = ((adj_close_p - open_p) / open_p) * 100

# calculate percent change
df_price_pct = df_price.pct_change()[1:]

  df_price_pct = df_price.pct_change()[1:]
  df_price_pct = df_price.pct_change()[1:]


In [49]:
df_price.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2020-09-01,,130.98259,77.500954,,98.863472,30.048393,225.714005,527.950012,110.601883,40.508659,...,196.519562,25.844528,84.67881,60.123051,32.622234,77.809166,88.891327,132.067307,287.160004,156.790894
2020-09-02,,128.268829,78.937714,,101.544983,30.55237,232.44722,533.799988,113.460991,41.0471,...,203.875931,25.844528,87.672226,62.754257,32.423676,79.189598,90.583176,134.408188,294.359985,159.964478
2020-09-03,,117.999535,77.190086,,97.917053,30.54286,224.735992,507.799988,109.074547,41.109917,...,197.528687,25.004868,85.73877,61.964901,32.357491,77.742516,89.057739,132.104935,269.25,153.646378
2020-09-04,,118.077621,77.190086,,96.645889,30.305138,221.999451,491.940002,109.372543,41.72015,...,196.57608,24.383532,86.092094,61.41235,32.332668,77.942451,88.03154,131.42807,260.549988,152.137009
2020-09-05,,,,,,,,,,,...,,,,,,,,,,


In [50]:
df_volume.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2020-09-01,,151948100.0,15783600.0,,7963100.0,930100.0,1914500.0,2404000.0,2787000.0,2805500.0,...,787900.0,3424000.0,2674300.0,2124400.0,22464100.0,633500.0,1618200.0,678564.0,254900.0,1273100.0
2020-09-02,,200119000.0,9139200.0,,5843700.0,1233000.0,1702800.0,2783400.0,5158000.0,3050500.0,...,752700.0,7513200.0,2435100.0,2908300.0,26413400.0,912900.0,1873300.0,804533.0,360500.0,1790200.0
2020-09-03,,257599600.0,8921900.0,,6096900.0,1480200.0,2350500.0,5837600.0,5880300.0,3940400.0,...,1006700.0,7682700.0,3209000.0,2834000.0,28817200.0,954000.0,1526300.0,1387719.0,416800.0,1794100.0
2020-09-04,,332607200.0,9444800.0,,5027700.0,1141300.0,1946400.0,3900300.0,3766400.0,4156800.0,...,645300.0,5325600.0,3249900.0,2518800.0,24632500.0,891700.0,2025000.0,574534.0,354700.0,1552500.0
2020-09-05,,,,,,,,,,,...,,,,,,,,,,


In [51]:
df_returns.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2020-09-01,,-1.338811,-18.864158,,-9.241282,-3.906643,-5.357038,2.514565,-6.102486,-9.0102,...,-4.010372,-14.308593,-2.757455,-12.953452,-17.931486,-3.198347,-7.00771,-2.960963,-0.048724,-2.438624
2020-09-02,,-6.774596,-14.263371,,-4.60781,-3.741745,-3.90772,-0.456879,-4.758672,-9.207917,...,-1.916708,-15.952754,1.625396,-8.294232,-17.349794,-3.356608,-5.75052,-1.535963,1.798306,-1.317414
2020-09-03,,-7.021093,-17.77792,,-9.919912,-5.527812,-8.308451,-3.467415,-10.115741,-10.299116,...,-8.922592,-19.313108,-3.858747,-13.745958,-17.455383,-6.98431,-9.512557,-4.847491,-7.525073,-7.038735
2020-09-04,,-1.659347,-16.170629,,-8.349083,-7.352068,-7.565702,-2.518576,-6.607,-9.422169,...,-6.840394,-18.6402,-2.511497,-13.454974,-18.062167,-5.79834,-9.152179,-4.311226,-3.08723,-4.412538
2020-09-05,,,,,,,,,,,...,,,,,,,,,,


## Obtain Percentage Change

We need to convert prices to percent change in price as opposed to the actual \$ price. This is because stocks with very similar prices can behave very differently and vice-versa.
For e.g., if a stock moves from \$100 to \$110, we want the price column to say 10% (indicating the change).

However, for volume, we will retain magnitude.

In [52]:
df_price_pct.head()

Unnamed: 0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
2020-09-02,,-0.020718,0.018539,,0.027123,0.016772,0.029831,0.011081,0.02585,0.013292,...,0.037433,0.0,0.03535,0.043764,-0.006087,0.017741,0.019033,0.017725,0.025073,0.020241
2020-09-03,,-0.080061,-0.022139,,-0.035727,-0.000311,-0.033174,-0.048707,-0.03866,0.00153,...,-0.031133,-0.032489,-0.022053,-0.012579,-0.002041,-0.018274,-0.01684,-0.017136,-0.085304,-0.039497
2020-09-04,,0.000662,0.0,,-0.012982,-0.007783,-0.012177,-0.031233,0.002732,0.014844,...,-0.004823,-0.024849,0.004121,-0.008917,-0.000767,0.002572,-0.011523,-0.005124,-0.032312,-0.009824
2020-09-05,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-09-06,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Removing NaNs

In [53]:
# Let's drop the dates where all the stocks are NaNs, ie., weekends/holidays where no trading occured
for df in [df_price, df_volume, df_returns, df_price_pct]:
  df.dropna(how='all', inplace=True)
  df.dropna(inplace=True, axis=1)
  print(True in pd.isna(df))

assert((df_price.index == df_volume.index).all())
assert((df_volume.index == df_returns.index).all())
assert((df_returns.index == df_price.index).all())

False
False
False
False


### Storing the cleaned dataframes

In [54]:
df_price.to_csv(os.path.join(aggregated_dir, "prices.csv"), index_label='date')
df_volume.to_csv(os.path.join(aggregated_dir, "volume.csv"), index_label='date')
df_returns.to_csv(os.path.join(aggregated_dir, "percent_return.csv"), index_label='date')
df_price_pct.to_csv(os.path.join(aggregated_dir, "prices_pct.csv"), index_label='date')