# Download stock list

In [34]:
pip install datapackage yfinance lxml pandas_datareader html5lib ta-lib

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datapackage import Package
# Descriptor of the datahub.io package that holds the NYSE / other listings
DATAPACKAGE_URL = 'https://datahub.io/core/nyse-other-listings/datapackage.json'

# Load the package and show which resources it offers
package = Package(DATAPACKAGE_URL)
package.resource_names

['validation_report',
 'nyse-listed_csv',
 'other-listed_csv',
 'nyse-listed_json',
 'other-listed_json',
 'nyse-other-listings_zip',
 'nyse-listed_csv_preview',
 'other-listed_csv_preview',
 'nyse-listed',
 'other-listed']

In [6]:
import pandas as pd
from os import path
import yfinance as yf

# Local import: only needed to create the cache directory below
from os import makedirs

# Local cache of the NYSE symbol list
file_name = 'data/nyse.csv'

# If we don't have data locally, download from remote source
if not path.exists(file_name):
    # Pick the NYSE listing out of the resources of the data package
    filter_fn = lambda resource: resource.name == 'nyse-listed_csv'
    resource = list(filter(filter_fn, package.resources))[0]
    df = pd.DataFrame.from_dict(resource.read())
    df.columns = ["Symbol", "Description"]

    # DataFrame.to_csv does not create missing directories, so make sure the
    # cache folder exists before writing (fresh checkouts have no data/ folder)
    makedirs(path.dirname(file_name), exist_ok=True)
    df.to_csv(file_name, index=False)

# Always read back from the local cache so both code paths yield the same frame
stocks_df = pd.read_csv(file_name)
# Index rows by ticker symbol so later cells can address them with .loc[symbol]
stocks_df.index = stocks_df['Symbol'].values
print(stocks_df.head())
print(stocks_df.tail())
print(stocks_df.shape)


     Symbol                                        Description
A         A            Agilent Technologies, Inc. Common Stock
AA       AA                            Alcoa Inc. Common Stock
AA$B   AA$B  Alcoa Inc. Depository Shares Representing 1/10...
AAC     AAC                    AAC Holdings, Inc. Common Stock
AAN     AAN                         Aaron's, Inc. Common Stock
     Symbol                                        Description
ZPIN   ZPIN  Zhaopin Limited American Depositary Shares, ea...
ZQK     ZQK                      Quiksilver, Inc. Common Stock
ZTR     ZTR   Zweig Total Return Fund, Inc. (The) Common Stock
ZTS     ZTS                   Zoetis Inc. Class A Common Stock
ZX       ZX  China Zenix Auto International Limited America...
(3298, 2)


In [14]:
import yfinance as yf
import numpy as np
import sys

# Local cache of the symbol list enriched with a volume column
file_name_with_vol = file_name.replace(".csv", "-volume.csv")

# Number of symbols requested from Yahoo Finance per yf.Tickers call
batch_size = 100


def _lookup_ticker(tickers_data, symbol):
    """Return the yfinance Ticker object for `symbol` from `yf.Tickers(...).tickers`.

    Mapping containers are indexed by key; anything else is accessed by
    attribute, which is what the original code did.
    NOTE(review): newer yfinance releases appear to expose `.tickers` as a
    dict while older ones used a namedtuple - confirm against the installed
    version.
    """
    if isinstance(tickers_data, dict):
        return tickers_data[symbol]
    return getattr(tickers_data, symbol)


# If we don't have data locally, download from remote source
if not path.exists(file_name_with_vol):

    # Work on an explicit copy so that adding the volume column never alters stocks_df
    stocks_vol_df = stocks_df.copy()
    # Create the column up front so it exists even if every lookup fails
    stocks_vol_df['Last Volume'] = float('nan')

    symbols = list(stocks_df['Symbol'].values)
    total_symbols = len(symbols)

    # Walk the symbol list in slices of batch_size. Slicing (instead of
    # flushing a growing list inside the loop) guarantees that the final,
    # possibly shorter, batch is downloaded as well.
    for batch_counter, batch_start in enumerate(range(0, total_symbols, batch_size), start=1):
        tickers = symbols[batch_start:batch_start + batch_size]

        # Download information
        tickers_data = yf.Tickers(' '.join(tickers)).tickers
        print('Batch ', batch_counter, 'with', len(tickers_data), ' tickers')

        # Store the volume figure Yahoo reports as 'averageVolume'
        for ticker in tickers:
            try:
                ticker_data = _lookup_ticker(tickers_data, ticker)
                print('Processing yF:', ticker_data, ', Ticker name:', ticker)
                stocks_vol_df.loc[ticker, 'Last Volume'] = ticker_data.info['averageVolume']
            except Exception:
                # Some tickers do not have the information we are looking for and
                # may raise while processing; keep going (best effort), but do not
                # swallow KeyboardInterrupt/SystemExit the way a bare except would.
                print('Failed to process', ticker, sys.exc_info())

        print('Downloaded volume info for', tickers)

    stocks_vol_df.to_csv(file_name_with_vol, index=False)

# Always read back from the cache; rows without a volume figure are dropped here
stocks_vol_df = pd.read_csv(file_name_with_vol).dropna()
stocks_vol_df.index = stocks_vol_df['Symbol'].values
print(stocks_vol_df.head())
print(stocks_vol_df.shape)


    Symbol                                        Description  Last Volume
A        A            Agilent Technologies, Inc. Common Stock    1712146.0
AA      AA                            Alcoa Inc. Common Stock    4695503.0
AAP    AAP  Advance Auto Parts Inc Advance Auto Parts Inc W/I    1065904.0
AAT    AAT           American Assets Trust, Inc. Common Stock     275667.0
ABB    ABB                               ABB Ltd Common Stock    1777283.0
(1511, 3)


In [16]:
# Restrict the universe to liquid stocks: keep only rows whose 'Last Volume'
# is at least this threshold.
MIN_LAST_VOLUME = 1000000

is_liquid = stocks_vol_df['Last Volume'] >= MIN_LAST_VOLUME
stocks_vol_df = stocks_vol_df[is_liquid]
print(stocks_vol_df.head())
print(stocks_vol_df.shape)

     Symbol                                        Description  Last Volume
A         A            Agilent Technologies, Inc. Common Stock    1712146.0
AA       AA                            Alcoa Inc. Common Stock    4695503.0
AAP     AAP  Advance Auto Parts Inc Advance Auto Parts Inc W/I    1065904.0
ABB     ABB                               ABB Ltd Common Stock    1777283.0
ABBV   ABBV                           AbbVie Inc. Common Stock    8004485.0
(455, 3)


In [35]:
# Date range of the daily price history to download.
# NOTE(review): yfinance documents `end` as exclusive, so 2019-12-31 itself is
# presumably not part of the download - confirm if that day is needed.
start_date = '2010-01-01'
end_date = '2019-12-31'
data_folder = 'data/'

# Tickers for which Yahoo returned no rows during this run
missing_tickers = []

for ticker in stocks_vol_df.index.values:
    f_name = data_folder + ticker + '.csv'

    # Per-ticker CSV acts as a cache: never download the same symbol twice
    if path.exists(f_name):
        print('Data exists for:', ticker)
        continue

    data = yf.download(ticker, start=start_date, end=end_date)

    # Do not cache an empty result: writing it would make every later run
    # report "Data exists" for a file that holds no prices at all.
    if data.empty:
        print('No data downloaded for:', ticker)
        missing_tickers.append(ticker)
        continue

    data.to_csv(f_name)

# Keep the working universe consistent with what is on disk, so cells that
# read data_folder + ticker + '.csv' do not hit files that were never written.
if missing_tickers:
    print('Dropping tickers without price data:', missing_tickers)
    stocks_vol_df = stocks_vol_df.drop(index=missing_tickers)

Data exists for: A
Data exists for: AA
Data exists for: AAP
Data exists for: ABB
Data exists for: ABBV
Data exists for: ABEV
Data exists for: ABR
Data exists for: ABT
Data exists for: ACM
Data exists for: ADM
Data exists for: ADPT
Data exists for: ADT
Data exists for: AEE
Data exists for: AEG
Data exists for: AEM
Data exists for: AEP
Data exists for: AES
Data exists for: AFL
Data exists for: AGI
Data exists for: AKS
Data exists for: ALB
Data exists for: ALL
Data exists for: ALLY
Data exists for: ALSN
Data exists for: AM
Data exists for: AMC
Data exists for: AME
Data exists for: AMH
Data exists for: AMT
Data exists for: AMX
Data exists for: ANF
Data exists for: AOS
Data exists for: APA
Data exists for: APD
Data exists for: AR
Data exists for: ARI
Data exists for: ARMK
Data exists for: ASB
Data exists for: AUY
Data exists for: AVP
Data exists for: AXL
Data exists for: AXP
Data exists for: AXTA
Data exists for: AZN
Data exists for: BA
Data exists for: BABA
Data exists for: BAC
Data exists

In [31]:
# Check amount of data downloaded: shell out to `du` to report the disk usage
# of ./data (and its sub-directories) in human-readable units
!du -h ./data

0	./data/.ipynb_checkpoints
111M	./data


In [38]:
# Summary statistics of the volume column for the currently selected stocks
last_volume = stocks_vol_df['Last Volume']
last_volume.describe()

count    4.550000e+02
mean     4.059946e+06
std      6.971949e+06
min      1.004216e+06
25%      1.422709e+06
50%      2.142141e+06
75%      4.032960e+06
max      8.536380e+07
Name: Last Volume, dtype: float64

In [68]:
import talib

# Look-back window: number of lagged price-change columns to create
N = 10

for ticker in stocks_vol_df.index.values:
    csv_path = data_folder + ticker + '.csv'
    pkl_path = data_folder + ticker + '.pkl'

    stock_price_df = pd.read_csv(csv_path)

    # Aliases for the raw price/volume columns used below
    adj_close = stock_price_df['Adj Close']
    high = stock_price_df['High']
    low = stock_price_df['Low']
    close = stock_price_df['Close']
    volume = stock_price_df['Volume']

    # 'N - k' = adjusted close minus the adjusted close k rows earlier
    for lag in range(1, N + 1):
        stock_price_df['N - ' + str(lag)] = adj_close - adj_close.shift(lag)

    # Label = the following row's one-step change (the 'N - 1' column shifted up by one)
    stock_price_df['Label'] = stock_price_df['N - 1'].shift(-1)

    # Technical indicators, all with TA-Lib's default parameters.
    # NOTE(review): the indicators use 'Close' while the change columns use
    # 'Adj Close' - confirm that mixing the two is intended.
    stock_price_df['TRIMA'] = talib.TRIMA(close)
    stock_price_df['SAR'] = talib.SAR(high, low)

    # MACD returns (macd, signal, histogram) in that order
    (stock_price_df['MACD-main'],
     stock_price_df['MACD-signal'],
     stock_price_df['MACD-hist']) = talib.MACD(close)

    stock_price_df['RSI'] = talib.RSI(close)

    # STOCH returns (slowk, slowd) in that order
    stock_price_df['STOCH-k'], stock_price_df['STOCH-d'] = talib.STOCH(high, low, close)

    stock_price_df['AD'] = talib.AD(high, low, close, volume)
    stock_price_df['ATR'] = talib.ATR(high, low, close)

    # Persist the enriched frame next to the source CSV
    stock_price_df.to_pickle(pkl_path)
    print('Saved', pkl_path)


Saved data/A.pkl
Saved data/AA.pkl
Saved data/AAP.pkl
Saved data/ABB.pkl
Saved data/ABBV.pkl
Saved data/ABEV.pkl
Saved data/ABR.pkl
Saved data/ABT.pkl
Saved data/ACM.pkl
Saved data/ADM.pkl
Saved data/ADPT.pkl
Saved data/ADT.pkl
Saved data/AEE.pkl
Saved data/AEG.pkl
Saved data/AEM.pkl
Saved data/AEP.pkl
Saved data/AES.pkl
Saved data/AFL.pkl
Saved data/AGI.pkl
Saved data/AKS.pkl
Saved data/ALB.pkl
Saved data/ALL.pkl
Saved data/ALLY.pkl
Saved data/ALSN.pkl
Saved data/AM.pkl
Saved data/AMC.pkl
Saved data/AME.pkl
Saved data/AMH.pkl
Saved data/AMT.pkl
Saved data/AMX.pkl
Saved data/ANF.pkl
Saved data/AOS.pkl
Saved data/APA.pkl
Saved data/APD.pkl
Saved data/AR.pkl
Saved data/ARI.pkl
Saved data/ARMK.pkl
Saved data/ASB.pkl
Saved data/AUY.pkl
Saved data/AVP.pkl
Saved data/AXL.pkl
Saved data/AXP.pkl
Saved data/AXTA.pkl
Saved data/AZN.pkl
Saved data/BA.pkl
Saved data/BABA.pkl
Saved data/BAC.pkl
Saved data/BAH.pkl
Saved data/BAS.pkl
Saved data/BAX.pkl
Saved data/BBT.pkl
Saved data/BBY.pkl
Saved data

In [None]:
# Find missing data
