In [32]:
import pandas as pd
from pathlib import Path
import requests
import os
import yfinance as yf
from config import config

## List NASDAQ-100 symbols
To list the NASDAQ-100 index symbols, an API endpoint used by the official NASDAQ website (https://www.nasdaq.com) is queried.
For the request to be successful, a 'user-agent' header must be included; otherwise the request is rejected.
The symbols are stored in a separate file in CSV format.
It is important to note that NASDAQ-100 index list changes over time.
The NASDAQ-100 is composed of the 100 largest non-financial companies listed on the NASDAQ stock exchange, based on market capitalization.
The NASDAQ-100 undergoes an annual rebalancing, typically in December.

In [33]:
# Build the list of NASDAQ-100 ticker symbols.
# The list is fetched from the NASDAQ API once and cached as a CSV file;
# later runs read the cached file instead of hitting the network.
tic_filename = Path('data/nasdaq_index_list.csv')
Path('data').mkdir(exist_ok=True)
if not tic_filename.exists():
    resp = requests.get(
        'https://api.nasdaq.com/api/quote/list-type/nasdaq100',
        headers={
            'accept': 'application/json',
            # Pretend to be a browser
            'user-agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X'
                ' 10.15; rv:123.0) Gecko/20100101 Firefox/123.0'
            ),
        },
        # Without a timeout requests waits indefinitely on a stalled connection.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    resp.raise_for_status()
    data = resp.json()['data']['data']
    df = pd.DataFrame(data['rows'], columns=data['headers'])
    # Only the symbol and the company name are persisted in the cache file.
    df.to_csv(tic_filename, columns=['symbol', 'companyName'], index=False)
    tickers = df['symbol'].tolist()
else:
    tickers = pd.read_csv(tic_filename)['symbol'].tolist()

pd.Series(tickers).head()

0     CPRT
1     COST
2     AAPL
3     AMGN
4    CMCSA
dtype: object

## Download data via yfinance API
Data is downloaded using the yfinance Python SDK.
The downloaded data must be filtered.
The data cannot contain zero values (except for Volume), otherwise a zero division error might be thrown during indicator calculations.
Low cannot be equal to High, because the calculation of the Williams Oscillator would fail due to a zero division error.
Data is downloaded for the period from 'd_start' to 'd_end' provided in the configuration dictionary.
The data for each ticker is stored in a file named after the ticker's symbol.
The data is stored in CSV format.

In [34]:
# Download daily OHLCV history for every ticker and cache it as one CSV per symbol.
# Tickers whose CSV already exists are skipped, so the cell can be re-run cheaply.
yf_filename = os.path.abspath('data/ohlcv/{}.csv') # Template for filename for a ticker symbol
Path('data/ohlcv').mkdir(parents=True, exist_ok=True)
tick_len = len(tickers)
for k, ticker in enumerate(tickers):
    ticker_path = yf_filename.format(ticker)
    if os.path.exists(ticker_path):
        continue  # Already downloaded on a previous run
    df = yf.Ticker(ticker).history(
        interval='1d',
        start=config['d_start'],
        end=config['d_end']
    )
    # Drop not needed columns. errors='ignore' keeps the loop alive if a
    # returned frame lacks them, instead of aborting with a KeyError.
    df = df.drop(columns=['Dividends', 'Stock Splits'], errors='ignore')
    # Filter out data that might trigger zero division error during indicators calculations
    df = df[
        (df['Close'] > 0) &
        (df['High'] > 0) &
        (df['Low'] > 0) &
        (df['Open'] > 0) &
        (df['Low'] != df['High'])
    ]
    df.to_csv(ticker_path)
    print('Downloaded data for %s ticker symbol. Progress: %.2f%%' % (ticker, (k+1)/tick_len*100))


## Filter tickers
The list of tickers must be filtered, because each ticker's data must be long enough to process one episode. An additional 32 days, required for the indicator calculations, are added to the minimum data length.

In [35]:
# Keep only the tickers whose cached price history is long enough, then
# persist the surviving symbols to a separate CSV file.
filtered_tic_filename = Path('data/nasdaq_index_list_filtered.csv')
# Required number of rows: evaluation span (eval_years * 252, presumably trading
# days per year - confirm) plus 'trading_days', plus 32 extra days needed by the
# indicator calculations.
min_dlen = config['eval_years'] * 252 + config['trading_days'] + 32
for ticker in list(tickers):  # Iterate over a snapshot; `tickers` is mutated below
    row_count = len(pd.read_csv(yf_filename.format(ticker)).index)
    if row_count >= min_dlen:
        continue
    print('Removed ' + ticker)
    tickers.remove(ticker)
pd.Series(tickers).to_csv(filtered_tic_filename, index=False)
len(tickers)

Removed GFS
Removed DASH
Removed CEG
Removed GEHC
Removed ABNB


96

## Tickers

In [44]:
# Build a compact "first rows ... last rows" preview of the ticker list and
# export it as a LaTeX table.
tickers_table = pd.read_csv(tic_filename)
tickers_table = pd.concat([tickers_table.head(), tickers_table.tail()])
tickers_table = tickers_table.rename(columns={'companyName': 'Company name', 'symbol': 'Symbol'})
tickers_head = tickers_table.head().copy()
# Placeholder row marking the omitted middle part of the list.
tickers_head.loc[5] = ['...']*len(tickers_head.columns)
head_tail_tickers = pd.concat([tickers_head, tickers_table.tail()])
# The output directory is not created anywhere else; without this the open()
# below fails with FileNotFoundError on a fresh checkout.
Path('tables').mkdir(exist_ok=True)
with open("tables/tickers.tex", "w") as file:
    file.write(head_tail_tickers.to_latex(index=False))
head_tail_tickers

Unnamed: 0,Symbol,Company name
0,CPRT,"Copart, Inc. (DE) Common Stock"
1,COST,Costco Wholesale Corporation Common Stock
2,AAPL,Apple Inc. Common Stock
3,AMGN,Amgen Inc. Common Stock
4,CMCSA,Comcast Corporation Class A Common Stock
5,...,...
96,AZN,AstraZeneca PLC American Depositary Shares
97,LULU,lululemon athletica inc. Common Stock
98,WBD,"Warner Bros. Discovery, Inc. Series A Common S..."
99,ADP,"Automatic Data Processing, Inc. Common Stock"


## Data shape  - Apple stock

In [42]:
# Show the shape of the cached OHLCV data using the first rows of the AAPL file
# and export that preview as a LaTeX table.
# NOTE(review): parse_dates=True only targets the index; with no index_col given
# the 'Date' column appears to stay as plain text - confirm this is intended.
appl_head = pd.read_csv(yf_filename.format('AAPL'), parse_dates=True).head()
# Placeholder row indicating that more rows follow.
appl_head.loc[5] = ['...']*len(appl_head.columns)
# The output directory is not created anywhere else; without this the open()
# below fails with FileNotFoundError on a fresh checkout.
Path('tables').mkdir(exist_ok=True)
with open('tables/data_shape.tex', 'w') as file:
    file.write(appl_head.to_latex(index=False))
appl_head

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2014-01-02 00:00:00-05:00,17.352864,17.395022,17.238569,17.273232,234684800
1,2014-01-03 00:00:00-05:00,17.264797,17.291028,16.87663,16.893806,392467600
2,2014-01-06 00:00:00-05:00,16.783571,17.075553,16.663342,16.985929,412610800
3,2014-01-07 00:00:00-05:00,16.998114,17.049328,16.798255,16.864458,317209200
4,2014-01-08 00:00:00-05:00,16.826042,17.036832,16.822296,16.971254,258529600
5,...,...,...,...,...,...
