In [50]:
import numpy as np
import pandas as pd
from pyquery import PyQuery as pq
import time
import datetime
import requests
import urllib.request
import os
import calendar
import pickle
from selenium import webdriver

In [2]:
# Read the raw list of Emerging Asia Pacific ETFs.
ETF_data = pd.read_csv('Emerging Asia Pacific ETF List (114).csv')

# Inception dates are stored day-first; parse them, then keep only the funds
# launched before the cutoff. reset_index() keeps the old row labels in an
# extra 'index' column.
date = datetime.datetime(2016, 1, 1)
ETF_data['Inception'] = pd.to_datetime(ETF_data['Inception'], format="%d/%m/%Y")
ETF_data = ETF_data[ETF_data['Inception'] < date].reset_index()

# Ticker and fund-name columns of the surviving rows.
symbol_list, name_list = ETF_data['Symbol'], ETF_data['ETF Name']

In [3]:
# Load the cached ETF dictionary; later cells read its 'symbol' and 'name' entries.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load ETF_dic.pkl if it comes from a trusted source.
with open('ETF_dic.pkl','rb') as f:
    dic = pickle.load(f)

In [4]:
def get_the_last_day_of_month(start_date,end_date):
    """Return one month-end date per month, starting with start_date's month.

    Month-ends that fall on a Saturday or Sunday are moved back to the
    preceding Friday. Only weekends are handled; public holidays are not.

    Parameters
    ----------
    start_date : datetime.datetime
        Any moment inside the first month to report. It no longer has to be
        the 1st of the month: the walk is anchored on the 1st internally.
    end_date : datetime.datetime
        Months keep being added while the previous month-end is strictly
        before this value, so the month that contains end_date is included
        even when its own month-end lies after end_date.

    Returns
    -------
    list of datetime.datetime
        Month-end dates in chronological order, carrying start_date's
        time-of-day.
    """
    # Anchor on the day before the 1st of the starting month so that adding a
    # month's length always lands exactly on that month's last day.
    cursor = start_date.replace(day=1) - datetime.timedelta(days=1)
    year, month = start_date.year, start_date.month

    month_ends = []
    while cursor < end_date:
        days_in_month = calendar.monthrange(year, month)[1]
        cursor = cursor + datetime.timedelta(days=days_in_month)
        month_ends.append(cursor)
        month += 1
        if month > 12:
            month = 1
            year += 1

    # isoweekday(): Monday=1 ... Sunday=7, so anything above 5 is a weekend day.
    adjusted = []
    for day in month_ends:
        days_past_friday = day.isoweekday() - 5
        if days_past_friday > 0:
            day = day - datetime.timedelta(days=days_past_friday)
        adjusted.append(day)
    return adjusted

In [6]:
# Month-end dates from January 2016 through December 2018; the helper moves any
# month-end that falls on a Saturday or Sunday back to the preceding Friday.
last_day_of_month = get_the_last_day_of_month(datetime.datetime(2016,1,1,0,0),datetime.datetime(2018,12,31,0,0))

In [28]:
def get_single_symbol(symbol,chrome_filepath,browser,wait = 3):
    """Download the Yahoo Finance price history of ``symbol`` to ./data/{symbol}.csv.

    Drives an already-open Selenium ``browser``: opens the symbol's history
    page, types 1/1/2016 as the start date and today's date as the end date,
    submits the form, then clicks the download link until the CSV exists.

    Parameters
    ----------
    symbol : str
        Ticker used in the Yahoo Finance URL and as the CSV file name.
    chrome_filepath : object
        Unused; kept so existing callers keep working.
    browser : selenium WebDriver
        Session used for all page interaction.
        NOTE(review): the file is only detected under ./data, so the session's
        download directory presumably has to point there -- confirm in the caller.
    wait : int or float, optional
        Seconds to sleep after each click on the download link.

    Returns
    -------
    None
        Prints 'File exist' and returns at once when the CSV is already
        present; otherwise prints 'Download' and the symbol once it appears.

    Notes
    -----
    The final polling loop has no upper bound: it keeps clicking the download
    link until ./data/{symbol}.csv exists.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of isdir()+mkdir().
    os.makedirs('./data', exist_ok=True)

    filepath = f'./data/{symbol}.csv'
    if os.path.isfile(filepath):
        print('File exist')
        return

    # Exact-match class selectors of the Yahoo Finance history page.
    # NOTE(review): the role names are inferred from how each selector is used
    # below (and from the literal "cancel" class); confirm against the live page.
    date_range_toggle = "[class='C(t) O(n):f Tsh($actionBlueTextShadow) Bd(n) Bgc(t) Fz(14px) Pos(r) T(-1px) Bd(n):f Bxsh(n):f Cur(p) W(190px)']"
    start_input = "[class='Bdrs(0) Bxsh(n)! Fz(s) Bxz(bb) D(ib) Bg(n) Pend(5px) Px(8px) Py(0) H(34px) Lh(34px) Bd O(n):f O(n):h Bdc($c-fuji-grey-c) Bdc($c-fuji-blue-1-b):f M(0) Pstart(10px) Bgc(white) W(90px) Mt(5px)']"
    end_input = "[class='Bdrs(0) Bxsh(n)! Fz(s) Bxz(bb) D(ib) Bg(n) Pend(5px) Px(8px) Py(0) H(34px) Lh(34px) Bd O(n):f O(n):h Bdc($c-fuji-grey-c) Bdc($c-fuji-blue-1-b):f M(0) Pstart(10px) Bgc(white) W(90px) Mt(5px) Mstart(15px)']"
    confirm_button = "[class=' Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n)  Py(9px) Miw(80px)! Fl(start)']"
    cancel_button = "[class=' Bd Bdc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) D(ib) C($c-fuji-blue-1-b) Bdc($actionBlueHover):h C($actionBlueHover):h Cur(p) Td(n)  Py(8px) cancel Miw(80px)! Fl(end)']"
    apply_button = "[class=' Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n)  Py(9px) Fl(end)']"
    download_link = "[class='Fl(end) Mt(3px) Cur(p)']"

    def click(selector):
        browser.find_element_by_css_selector(selector).click()

    url = f'https://finance.yahoo.com/quote/{symbol}/history?p={symbol}'
    browser.get(url)
    click(date_range_toggle)

    start_date = browser.find_element_by_css_selector(start_input)
    start_date.clear()
    start_date.send_keys('1/1/2016')

    end_date = browser.find_element_by_css_selector(end_input)
    end_date.clear()
    # Zero-padded month/day/year of today, the format typed into the form.
    end_date.send_keys(datetime.datetime.now().strftime('%m/%d/%Y'))

    try:
        click(confirm_button)
    except Exception:
        # First attempt failed: cancel, reopen the date picker and confirm again.
        # Catching Exception (not a bare except) lets Ctrl-C / SystemExit through.
        click(cancel_button)
        click(date_range_toggle)
        click(confirm_button)

    click(apply_button)
    time.sleep(5)
    click(download_link)
    time.sleep(wait)

    # Keep requesting the download until the file is visible on disk.
    while not os.path.isfile(filepath):
        click(download_link)
        time.sleep(wait)

    print('Download',symbol)

In [8]:
def get_all_symbol_csv(dic,chrome_filepath):
    """Download the price-history CSV of every symbol in ``dic['symbol']``.

    Starts one Chrome session whose download directory is ./data, calls
    get_single_symbol for each symbol, and always closes the session.

    Parameters
    ----------
    dic : mapping
        Must provide an iterable of tickers under the key 'symbol'.
    chrome_filepath : str
        Path to the chromedriver executable, passed to webdriver.Chrome.
    """
    # Hand Chrome an absolute path: a relative download directory would be
    # resolved by the browser rather than by this process.
    download_dir = os.path.abspath('./data')
    os.makedirs(download_dir, exist_ok=True)

    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', {'download.default_directory': download_dir})
    browser = webdriver.Chrome(chrome_filepath,options=options)

    try:
        for s in dic['symbol']:
            # Pass the driver path itself; the original passed the unrelated
            # check_download_file function, which is undefined on a fresh run.
            get_single_symbol(s,chrome_filepath,browser)
    finally:
        # Close Chrome even when a download raises, so no browser process is leaked.
        browser.quit()

In [222]:
# The chromedriver location is machine-specific: set the CHROMEDRIVER_PATH
# environment variable to override the fallback path below.
get_all_symbol_csv(dic, os.environ.get('CHROMEDRIVER_PATH', '/Users/lou/Downloads/chromedriver'))

File exist
Download INCO
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist
File exist


In [75]:
def combine_data(dic):
    """Merge the 'Adj Close' series of every symbol into one wide table.

    For each ticker in ``dic['symbol']`` the file ./data/{ticker}.csv is read,
    its 'Date' column is parsed as %Y-%m-%d, and its 'Adj Close' column is
    renamed to the ticker. The frames are left-merged on 'Date' onto the first
    symbol's frame, so the result has exactly the first symbol's dates; dates
    that another symbol lacks become NaN in that symbol's column.

    The result is also written to ./Combine_data/com_data.csv (row index
    included).

    Parameters
    ----------
    dic : mapping
        Must provide an iterable of tickers under the key 'symbol'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' followed by one column per ticker.

    Raises
    ------
    ValueError
        If ``dic['symbol']`` is empty (previously an UnboundLocalError).
    """
    symbols = list(dic['symbol'])
    if not symbols:
        raise ValueError("dic['symbol'] is empty; there is nothing to combine")

    com_data = None
    for s in symbols:
        prices = pd.read_csv(f'./data/{s}.csv')
        prices['Date'] = pd.to_datetime(prices['Date'], format="%Y-%m-%d")

        data_close = prices[['Date','Adj Close']].reset_index(drop=True)
        data_close = data_close.rename(columns={'Adj Close':s})

        if com_data is None:
            # The first symbol defines the date axis of the combined table.
            com_data = data_close.copy()
        else:
            com_data = com_data.merge(data_close,on='Date',how='left')

    # makedirs(exist_ok=True) avoids the check-then-create race of isdir()+mkdir().
    os.makedirs('./Combine_data', exist_ok=True)
    com_data.to_csv('./Combine_data/com_data.csv')

    return com_data

In [52]:
def check_download_file(com_data,chrome_filepath,max_missing = 10):
    """Print the position and ticker of every symbol with too many missing values.

    Iterates over the module-level ``dic['symbol']`` and, for each ticker,
    counts the NaN entries of ``com_data[ticker]``. Tickers with more than
    ``max_missing`` NaNs are printed as "<position> <ticker>".

    Parameters
    ----------
    com_data : pandas.DataFrame
        Combined table with one column per ticker.
    chrome_filepath : object
        Unused; kept so existing callers keep working. The function used to
        open a Chrome session here and close it again without using it, so
        that launch was removed.
    max_missing : int, optional
        Largest NaN count that is still accepted (default 10, the value that
        was previously hard-coded).

    Notes
    -----
    Re-downloading a flagged symbol (deleting its CSV and calling
    get_single_symbol) was disabled in the original and is left to the caller.
    """
    for i,s in enumerate(dic['symbol']):
        missing = int(com_data[s].isna().sum())
        if missing > max_missing:
            print(i,s)

In [45]:
# Build the wide adjusted-close table from the CSVs under ./data; combine_data
# also writes it to ./Combine_data/com_data.csv.
com_data = combine_data(dic)

In [53]:
# The chromedriver location is machine-specific: set the CHROMEDRIVER_PATH
# environment variable to override the fallback path below.
check_download_file(com_data, os.environ.get('CHROMEDRIVER_PATH', '/Users/lou/Downloads/chromedriver'))

17 NFTY
20 CHIC
53 KALL


In [46]:
# Display the combined table: a 'Date' column plus one adjusted-close column per symbol.
com_data

Unnamed: 0,Date,XCEM,INCO,INXX,SCIN,AFTY,CNHX,YANG,YINN,CHAD,...,CXSE,DGRE,DEM,DGS,EPI,ASHX,ASHR,ASHS,CN,DBEM
0,2015-12-31,17.546646,32.024254,10.041132,15.324635,13.175530,29.900549,2462.235840,17.392210,41.808060,...,47.408096,17.937878,28.133930,31.635876,19.244785,20.627932,27.201328,41.459999,27.023926,17.220577
1,2016-01-04,16.232405,31.565479,9.879485,15.109759,12.157043,29.900549,2704.211182,15.696127,45.468994,...,47.408096,17.658089,27.306986,30.886930,18.895937,18.760775,24.887564,37.349998,25.167719,16.834337
2,2016-01-05,16.232405,31.904572,10.088675,15.578582,12.444071,27.508121,2742.470215,15.451028,44.496712,...,47.408096,17.598799,27.413689,30.904980,19.128500,19.255548,25.237545,37.500000,25.465759,16.834337
3,2016-01-06,16.232405,31.336094,9.888993,15.324635,12.351482,27.914816,2920.349365,14.451023,45.766632,...,47.408096,17.213394,26.880177,30.471861,18.866867,19.466434,25.033388,37.570000,25.211292,16.608248
4,2016-01-07,16.232405,30.308844,9.546683,14.592100,11.684834,27.914816,3258.965820,12.774549,47.135761,...,44.092236,16.815020,25.982094,29.623669,18.207932,18.120007,23.458471,34.820000,23.877960,16.118385
5,2016-01-08,16.232405,30.169218,9.546683,14.748375,11.805202,26.444611,3354.115967,12.401999,47.016708,...,43.115849,16.694584,25.742016,29.389059,18.256386,18.168673,23.584856,34.450001,23.712383,16.024181
6,2016-01-11,16.232405,30.229057,9.651278,14.943719,11.638539,25.270355,3420.696777,12.166703,48.108040,...,42.361801,16.481501,25.706446,29.280775,18.479254,17.706345,23.011274,32.779999,23.250511,16.024181
7,2016-01-12,15.414730,30.258978,9.584717,14.943719,11.721871,25.270355,3432.870117,12.088271,47.135761,...,42.361801,16.916927,25.813148,29.316875,18.343594,17.955353,23.516804,33.259998,23.455303,16.071281
8,2016-01-13,15.414730,29.929859,9.356509,14.142812,11.397807,25.270355,3619.196045,11.441212,49.437485,...,41.201736,16.596380,25.590853,29.010080,18.140099,17.010420,22.311312,30.959999,23.455303,15.958238
9,2016-01-14,15.441051,30.089434,9.280440,14.103743,11.786683,25.270355,3466.656982,11.921604,47.403629,...,41.868774,16.638994,26.044338,29.361990,18.227310,17.438681,23.283484,32.720001,23.442232,16.099545


In [54]:
# Fund name stored at position 17, the position reported next to NFTY by the check above.
dic['name'][17]

'First Trust India NIFTY 50 Equal Weight ETF'

In [55]:
# Fund name stored at position 20, the position reported next to CHIC by the check above.
dic['name'][20]

'Global X MSCI China Communication Services ETF'

In [56]:
# Fund name stored at position 53, the position reported next to KALL by the check above.
dic['name'][53]

'KraneShares MSCI China All Shares Index ETF'

In [101]:
def download_globalxfunds(symbol,max_rows = 806):
    """Replace ./data/{symbol}.csv with price history from globalxfunds.com.

    Downloads the fund's chart-data export, keeps its 'Date' and
    'Closing Price' columns, renames 'Closing Price' to 'Adj Close', and
    overwrites the downloaded file with that two-column CSV (no index column).

    Parameters
    ----------
    symbol : str
        Fund ticker; used in the download URL and in the output file name.
        The downloaded file is now read back via that same name, whereas the
        original always read './data/CHIC.csv' whatever symbol was given.
    max_rows : int or None, optional
        Keep only the first ``max_rows`` data rows of the export. Defaults to
        806, the value that was previously hard-coded; pass None to keep all.
        NOTE(review): 806 presumably trimmed trailing rows of the CHIC export
        at the time of writing -- confirm before relying on it for other funds.
    """
    download_link = "https://www.globalxfunds.com/funds/"+symbol+"/?download_chart_data=true"
    file_name = "./data/"+symbol+".csv"

    os.makedirs('./data', exist_ok=True)
    # urlretrieve overwrites an existing file, so no explicit delete is needed;
    # a failed download therefore no longer destroys the previous CSV.
    urllib.request.urlretrieve(download_link, file_name)

    # header=1: the column names are on the second line of the export.
    raw = pd.read_csv(file_name,header=1)
    if max_rows is not None:
        # copy() so the 'Date' assignment below does not write into a slice view.
        raw = raw.iloc[:max_rows].copy()
    raw['Date'] = pd.to_datetime(raw['Date'], format="%m/%d/%Y")

    nav = raw[['Date','Closing Price']].rename(columns={'Closing Price':'Adj Close'})
    nav.to_csv(file_name,index=False)

In [102]:
# Replace ./data/CHIC.csv with the price history downloaded from globalxfunds.com.
download_globalxfunds('CHIC')

In [103]:
# Rebuild the combined table now that ./data/CHIC.csv has been replaced.
com_data = combine_data(dic)

In [104]:
# The chromedriver location is machine-specific: set the CHROMEDRIVER_PATH
# environment variable to override the fallback path below.
check_download_file(com_data, os.environ.get('CHROMEDRIVER_PATH', '/Users/lou/Downloads/chromedriver'))

17 NFTY
53 KALL
